Skip to content

Commit

Permalink
Stop special casing InlineString1
Browse files Browse the repository at this point in the history
I was looking into fixing JuliaStrings#15 but realized that the special casing of `InlineString1` to only have one byte makes that not work. I would say that the current special casing of `InlineString1` creates quite a bit of confusing behavior:

```
julia> InlineString("") |> typeof
String3

julia> InlineString("a") |> typeof
String1
```

Why would an empty string take more place than a one letter string?

```

julia> String1("")
ERROR: ArgumentError: string too large (0) to convert to String1
Stacktrace:
 [1] stringtoolong(T::Type, n::Int64)
   @ InlineStrings ~/.julia/packages/InlineStrings/xUsry/src/InlineStrings.jl:321
 [2] String1(x::String)
   @ InlineStrings ~/.julia/packages/InlineStrings/xUsry/src/InlineStrings.jl:208
 [3] top-level scope
   @ REPL[4]:1

julia> String3("")
""
```

Wut?

There is nothing in the documentation that indicates this type of special behavior.

I'm sure there is some reason for doing this since so much pain seems to have been gone through to do it but I thought I would put up this PR nonetheless.

Fixes JuliaStrings#73
Fixes JuliaStrings#72
  • Loading branch information
KristofferC committed Jul 12, 2024
1 parent 3458995 commit 2a1164d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 106 deletions.
13 changes: 2 additions & 11 deletions ext/ParsersExt.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module ParsersExt
using Parsers
using InlineStrings: InlineString, InlineString1, addcodeunit
using InlineStrings: InlineString, addcodeunit

Parsers.xparse(::Type{T}, buf::AbstractString, pos, len, options, ::Type{S}=T) where {T <: InlineString, S} =
Parsers.xparse(T, codeunits(buf), pos, len, options, S)
Expand All @@ -14,16 +14,7 @@ function Parsers.xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos
x = T()
else
poslen = res.val
if T === InlineString1
if poslen.len != 1
overflowed = true
x = T()
else
Parsers.fastseek!(source, poslen.pos)
x = InlineString1(Parsers.peekbyte(source, poslen.pos))
Parsers.fastseek!(source, pos + res.tlen - 1)
end
elseif Parsers.escapedstring(code) || !(source isa AbstractVector{UInt8})
if Parsers.escapedstring(code) || !(source isa AbstractVector{UInt8})
if poslen.len > (sizeof(T) - 1)
overflowed = true
x = T()
Expand Down
101 changes: 18 additions & 83 deletions src/InlineStrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@ See more details by looking at individual docs for [`String1`](@ref),
"""
abstract type InlineString <: AbstractString end

for sz in (1, 4, 8, 16, 32, 64, 128, 256)
nm = Symbol(:String, max(1, sz - 1))
nma = Symbol(:InlineString, max(1, sz - 1))
macro_nm = Symbol(:inline, max(1, sz - 1), :_str)
for sz in (2, 4, 8, 16, 32, 64, 128, 256)
nm = Symbol(:String, sz-1)
nma = Symbol(:InlineString, sz-1)
macro_nm = Symbol(:inline, sz-1, :_str)
at_macro_nm = Symbol("@", macro_nm)
@eval begin
"""
$($nm)(str::AbstractString)
$($nm)(bytes::AbstractVector{UInt8}, pos, len)
$($nm)(ptr::Ptr{UInt8}, [len])
Custom fixed-size string with a fixed size of $($sz) bytes.
1 byte is used to store the length of the string. If an
inline string is shorter than $($(max(1, sz - 1))) bytes, the entire
Custom C compatible fixed-size string with a fixed size of $($sz) bytes.
1 byte is used to store the capacity minus the length of the string. If an
inline string is shorter than $($(sz-1)) bytes, the entire
string still occupies the full $($sz) bytes since they are,
by definition, fixed size. Otherwise, they can be treated
just like normal `String` values. Note that `sizeof(x)` will
Expand All @@ -58,7 +58,7 @@ for sz in (1, 4, 8, 16, 32, 64, 128, 256)
export $nma

"""
inline$($(max(1, sz - 1)))"string"
inline$($(sz-1))"string"
Macro to create a [`$($nm)`](@ref) with a fixed size of $($sz) bytes.
"""
Expand All @@ -75,7 +75,7 @@ const SmallInlineStrings = Union{String1, String3, String7, String15}

# used to zero out n lower bytes of an inline string
clear_n_bytes(s, n) = Base.shl_int(Base.lshr_int(s, 8 * n), 8 * n)
_bswap(x::T) where {T <: InlineString} = T === InlineString1 ? x : Base.bswap_int(x)
_bswap(x::T) where {T <: InlineString} = Base.bswap_int(x)

const InlineStringTypes = Union{InlineString1,
InlineString3,
Expand Down Expand Up @@ -115,26 +115,17 @@ Base.widen(::Type{InlineString63}) = InlineString127
Base.widen(::Type{InlineString127}) = InlineString255
Base.widen(::Type{InlineString255}) = String

Base.ncodeunits(::InlineString1) = 1
Base.ncodeunits(x::InlineString) = Int(Base.trunc_int(UInt8, x))
Base.codeunit(::InlineString) = UInt8

Base.@propagate_inbounds function Base.codeunit(x::T, i::Int) where {T <: InlineString}
@boundscheck checkbounds(Bool, x, i) || throw(BoundsError(x, i))
if T === InlineString1
return Base.bitcast(UInt8, x)
else
return Base.trunc_int(UInt8, Base.lshr_int(x, 8 * (sizeof(T) - i)))
end
return Base.trunc_int(UInt8, Base.lshr_int(x, 8 * (sizeof(T) - i)))
end

function Base.String(x::T) where {T <: InlineString}
len = ncodeunits(x)
out = Base._string_n(len)
if T === InlineString1
GC.@preserve out unsafe_store!(pointer(out), codeunit(x, 1))
return out
end
ref = Ref{T}(_bswap(x))
GC.@preserve ref out begin
ptr = convert(Ptr{UInt8}, Base.unsafe_convert(Ptr{T}, ref))
Expand All @@ -149,9 +140,6 @@ function Base.Symbol(x::T) where {T <: InlineString}
(Ref{T}, Int), ref, sizeof(x))
end

Base.cconvert(::Type{Ptr{UInt8}}, x::InlineString1) = Base.cconvert(Ptr{UInt8}, InlineString3(x))
Base.cconvert(::Type{Ptr{Int8}}, x::InlineString1) = Base.cconvert(Ptr{Int8}, InlineString3(x))
Base.cconvert(::Type{Cstring}, x::InlineString1) = Base.cconvert(Cstring, InlineString3(x))
Base.cconvert(::Type{Ptr{UInt8}}, x::T) where {T <: InlineString} =
Ref{T}(_bswap(clear_n_bytes(x, 1)))
Base.cconvert(::Type{Ptr{Int8}}, x::T) where {T <: InlineString} =
Expand Down Expand Up @@ -189,47 +177,17 @@ end

# add a codeunit to end of string method
function addcodeunit(x::T, b::UInt8) where {T <: InlineString}
if T === InlineString1
return Base.bitcast(InlineString1, b), false
end
len = Base.trunc_int(UInt8, x)
sz = Base.trunc_int(UInt8, sizeof(T))
shf = Base.zext_int(Int16, max(0x01, sz - len - 0x01)) << 3
x = Base.or_int(x, Base.shl_int(Base.zext_int(T, b), shf))
return Base.add_int(x, Base.zext_int(T, 0x01)), (len + 0x01) >= sz
end

# from String
InlineString1(byte::UInt8=0x00) = Base.bitcast(InlineString1, byte)

function InlineString1(x::AbstractString)
sizeof(x) == 1 || stringtoolong(InlineString1, sizeof(x))
return Base.bitcast(InlineString1, codeunit(x, 1))
end

function InlineString1(buf::AbstractVector{UInt8}, pos=1, len=length(buf))
len == 1 || stringtoolong(InlineString1, len)
return Base.bitcast(InlineString1, buf[pos])
end

function InlineString1(ptr::Ptr{UInt8}, len=nothing)
ptr == Ptr{UInt8}(0) && nullptr(InlineString1)
if len === nothing
y, _ = addcodeunit(InlineString1(), unsafe_load(ptr))
unsafe_load(ptr, 2) === 0x00 || stringtoolong(InlineString1, 2)
return y
else
len == 1 || stringtoolong(InlineString1, len)
return Base.bitcast(InlineString1, unsafe_load(ptr))
end
end
# InlineString1(byte::UInt8) = Base.bitcast(InlineString1, byte)

function InlineString1(x::S) where {S <: InlineString}
sizeof(x) == 1 || stringtoolong(InlineString1, sizeof(x))
return Base.bitcast(InlineString1, codeunit(x, 1))
end

for T in (:InlineString3, :InlineString7, :InlineString15, :InlineString31, :InlineString63, :InlineString127, :InlineString255)
for T in (:InlineString1, :InlineString3, :InlineString7, :InlineString15, :InlineString31, :InlineString63, :InlineString127, :InlineString255)
@eval $T() = Base.zext_int($T, 0x00)

@eval function $T(x::AbstractString)
Expand Down Expand Up @@ -304,11 +262,7 @@ for T in (:InlineString3, :InlineString7, :InlineString15, :InlineString31, :Inl
return Base.add_int(y, Base.zext_int($T, UInt8(len)))
else
# promoting smaller InlineString to larger
if S === InlineString1
y = Base.shl_int(Base.zext_int($T, x), 8 * (sizeof($T) - sizeof(S)))
else
y = Base.shl_int(Base.zext_int($T, Base.lshr_int(x, 8)), 8 * (sizeof($T) - sizeof(S) + 1))
end
y = Base.shl_int(Base.zext_int($T, Base.lshr_int(x, 8)), 8 * (sizeof($T) - sizeof(S) + 1))
return Base.add_int(y, Base.zext_int($T, UInt8(sizeof(x))))
end
end
Expand All @@ -320,7 +274,7 @@ end

function InlineStringType(n::Integer)
n > 255 && stringtoolong(InlineString, n)
return n == 1 ? InlineString1 : n < 4 ? InlineString3 :
return n < 2 ? InlineString1 : n < 4 ? InlineString3 :
n < 8 ? InlineString7 : n < 16 ? InlineString15 :
n < 32 ? InlineString31 : n < 64 ? InlineString63 :
n < 128 ? InlineString127 : InlineString255
Expand Down Expand Up @@ -381,7 +335,6 @@ function Base.read(s::IO, ::Type{T}) where {T <: InlineString}
end

function Base.print(io::IO, x::T) where {T <: InlineString}
x isa InlineString1 && return print(io, Char(Base.bitcast(UInt8, x)))
ref = Ref{T}(_bswap(x))
return GC.@preserve ref begin
ptr = convert(Ptr{UInt8}, Base.unsafe_convert(Ptr{T}, ref))
Expand All @@ -391,9 +344,6 @@ function Base.print(io::IO, x::T) where {T <: InlineString}
end

function Base.isascii(x::T) where {T <: InlineString}
if T === InlineString1
return codeunit(x, 1) < 0x80
end
len = ncodeunits(x)
x = Base.lshr_int(x, 8 * (sizeof(T) - len))
for _ = 1:(len >> 2)
Expand All @@ -410,7 +360,6 @@ end
# "mutating" operations; care must be taken here to "clear out"
# unused bits to ensure our == definition continues to work
# which compares the full bit contents of inline strings
Base.chop(s::InlineString1; kw...) = chop(String3(s); kw...)
function Base.chop(s::InlineString; head::Integer = 0, tail::Integer = 1)
if isempty(s)
return s
Expand All @@ -431,7 +380,6 @@ end

Base.getindex(s::InlineString, r::AbstractUnitRange{<:Integer}) = getindex(s, Int(first(r)):Int(last(r)))

Base.getindex(s::InlineString1, r::UnitRange{Int}) = getindex(InlineString3(s), r)
Base.@propagate_inbounds function Base.getindex(s::InlineString, r::UnitRange{Int})
isempty(r) && return typeof(s)("")
i = first(r)
Expand All @@ -448,15 +396,13 @@ Base.view(s::InlineString, r::AbstractUnitRange{<:Integer}) = getindex(s, r)

if isdefined(Base, :chopprefix)

Base.chopprefix(s::InlineString1, prefix::AbstractString) = chopprefix(String3(s), prefix)
function Base.chopprefix(s::InlineString, prefix::AbstractString)
if !isempty(prefix) && startswith(s, prefix)
return _chopprefix(s, prefix)
end
return s
end

Base.chopprefix(s::InlineString1, prefix::Regex) = chopprefix(String3(s), prefix)
function Base.chopprefix(s::InlineString, prefix::Regex)
m = match(prefix, String(s), firstindex(s), Base.PCRE.ANCHORED)
m === nothing && return s
Expand All @@ -483,7 +429,6 @@ end
throw_strip_argument_error() =
throw(ArgumentError("Both arguments are strings. The second argument should be a `Char` or collection of `Char`s"))

Base.lstrip(f, s::InlineString1) = lstrip(f, InlineString3(s))
function Base.lstrip(f, s::InlineString)
nc = 0
len = 0
Expand All @@ -499,19 +444,16 @@ function Base.lstrip(f, s::InlineString)
end

Base.lstrip(::AbstractString, ::InlineString) = throw_strip_argument_error()
Base.lstrip(::AbstractString, ::InlineString1) = throw_strip_argument_error()

if isdefined(Base, :chopsuffix)

Base.chopsuffix(s::InlineString1, suffix::AbstractString) = chopsuffix(String3(s), suffix)
function Base.chopsuffix(s::InlineString, suffix::AbstractString)
if !isempty(suffix) && endswith(s, suffix)
return _chopsuffix(s, suffix)
end
return s
end

Base.chopsuffix(s::InlineString1, suffix::Regex) = chopsuffix(String3(s), suffix)
function Base.chopsuffix(s::InlineString, suffix::Regex)
m = match(suffix, String(s), firstindex(s), Base.PCRE.ENDANCHORED)
m === nothing && return s
Expand All @@ -529,7 +471,6 @@ _chopsuffix(s::InlineString, suffix::AbstractString) = _chopsuffix(s, ncodeunits
return Base.or_int(s, _oftype(typeof(s), new_n))
end

Base.rstrip(f, s::InlineString1) = rstrip(f, InlineString3(s))
function Base.rstrip(f, s::InlineString)
nc = 0
for c in Iterators.reverse(s)
Expand All @@ -543,9 +484,7 @@ function Base.rstrip(f, s::InlineString)
end

Base.rstrip(::AbstractString, ::InlineString) = throw_strip_argument_error()
Base.rstrip(::AbstractString, ::InlineString1) = throw_strip_argument_error()

Base.chomp(s::InlineString1) = chomp(String3(s))
function Base.chomp(s::InlineString)
i = lastindex(s)
len = ncodeunits(s)
Expand All @@ -558,14 +497,12 @@ function Base.chomp(s::InlineString)
end
end

Base.first(s::InlineString1, n::Integer) = first(String3(s), n)
function Base.first(s::T, n::Integer) where {T <: InlineString}
newlen = nextind(s, min(lastindex(s), nextind(s, 0, n))) - 1
i = sizeof(T) - newlen
return Base.or_int(clear_n_bytes(s, i), _oftype(typeof(s), newlen))
end

Base.last(s::InlineString1, n::Integer) = last(String3(s), n)
function Base.last(s::T, n::Integer) where {T <: InlineString}
nc = ncodeunits(s) + 1
i = max(1, prevind(s, nc, n))
Expand Down Expand Up @@ -643,13 +580,11 @@ Base.string(a::_SmallestInlineStrings, b::_SmallestInlineStrings, c::_SmallestIn
# Only benefit from keeping the small-ish strings as InlineStrings
function _string(a::Ta, b::Tb) where {Ta <: SmallInlineStrings, Tb <: SmallInlineStrings}
T = summed_type(Ta, Tb)
lb_a = Int(!isa(a, InlineString1)) # no "length byte" to remove if InlineString1
lb_b = Int(!isa(b, InlineString1))
len_a = sizeof(a)
len_b = sizeof(b)
# Remove length byte (lshr), grow to new size (zext), move chars forward (shl).
a2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(a, 8*lb_a)), 8 * (sizeof(T) - sizeof(Ta) + lb_a))
b2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(b, 8*lb_b)), 8 * (sizeof(T) - sizeof(Tb) + lb_b - len_a))
a2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(a, 8)), 8 * (sizeof(T) - sizeof(Ta) + 1))
b2 = Base.shl_int(Base.zext_int(T, Base.lshr_int(b, 8)), 8 * (sizeof(T) - sizeof(Tb) + 1 - len_a))
lb = _oftype(T, len_a + len_b) # new length byte
return Base.or_int(Base.or_int(a2, b2), lb)
end
Expand Down Expand Up @@ -879,7 +814,7 @@ using Base.Sort, Base.Order

# And under certain thresholds, MergeSort is faster than RadixSort, even for small InlineStrings
const MergeSortThresholds = Dict(
1 => 2^5,
2 => 2^5,
4 => 2^7,
8 => 2^9,
16 => 2^23
Expand Down Expand Up @@ -925,7 +860,7 @@ function Base.sort!(vs::AbstractVector, lo::Int, hi::Int, ::InlineStringSortAlg,

# setup
ts = similar(vs)
rdx = Radix(sizeof(T) == 1 ? 8 : 11)
rdx = Radix(11)
radix_size = rdx.size
radix_mask = rdx.mask
radix_size_pow = rdx.pow
Expand Down
Loading

0 comments on commit 2a1164d

Please sign in to comment.