Skip to content

Commit

Permalink
Revert "Various fixes to byte / bytearray search (#54579)"
Browse files Browse the repository at this point in the history
This reverts commit 56451d8.
  • Loading branch information
giordano authored Sep 10, 2024
1 parent c6c449c commit 24dd706
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 170 deletions.
1 change: 0 additions & 1 deletion base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,6 @@ hash(x::Char, h::UInt) =
hash_uint64(((bitcast(UInt32, x) + UInt64(0xd4d64234)) << 32) UInt64(h))

first_utf8_byte(c::Char) = (bitcast(UInt32, c) >> 24) % UInt8
first_utf8_byte(c::AbstractChar) = first_utf8_byte(Char(c)::Char)

# fallbacks:
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
Expand Down
136 changes: 47 additions & 89 deletions base/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,7 @@ match strings with [`match`](@ref).
"""
abstract type AbstractPattern end

# TODO: These unions represent bytes in memory that can be accessed via a pointer.
# this property is used throughout Julia, e.g. also in IO code.
# This deserves a better solution - see #53178.
# If such a better solution comes in place, these unions should be replaced.
const DenseInt8 = Union{
DenseArray{Int8},
FastContiguousSubArray{Int8,N,<:DenseArray} where N
}

# Note: This union is different from that above in that it includes CodeUnits.
# Currently, this is redundant as CodeUnits <: DenseVector, but this subtyping
# is buggy and may be removed in the future, see #54002
const DenseUInt8 = Union{
DenseArray{UInt8},
FastContiguousSubArray{UInt8,N,<:DenseArray} where N,
CodeUnits{UInt8, <:Union{String, SubString{String}}},
FastContiguousSubArray{UInt8,N,<:CodeUnits{UInt8, <:Union{String, SubString{String}}}} where N,
}

const DenseUInt8OrInt8 = Union{DenseUInt8, DenseInt8}

last_byteindex(x::Union{String, SubString{String}}) = ncodeunits(x)
last_byteindex(x::DenseUInt8OrInt8) = lastindex(x)
nothing_sentinel(i) = i == 0 ? nothing : i

function last_utf8_byte(c::Char)
u = reinterpret(UInt32, c)
Expand All @@ -52,11 +30,11 @@ function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar}
end
@inbounds isvalid(s, i) || string_index_err(s, i)
c = pred.x
c '\x7f' && return _search(s, first_utf8_byte(c), i)
c '\x7f' && return nothing_sentinel(_search(s, c % UInt8, i))
while true
i = _search(s, first_utf8_byte(c), i)
i === nothing && return nothing
isvalid(s, i) && pred(s[i]) && return i
i == 0 && return nothing
pred(s[i]) && return i
i = nextind(s, i)
end
end
Expand All @@ -69,41 +47,31 @@ const DenseBytes = Union{
CodeUnits{UInt8, <:Union{String, SubString{String}}},
}

function findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{UInt8, Int8}}, a::Union{DenseInt8, DenseUInt8})
findnext(pred, a, firstindex(a))
end
const ByteArray = Union{DenseBytes, DenseArrayType{Int8}}

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
_search(a, pred.x, i)
end
findfirst(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
nothing_sentinel(_search(a, pred.x))

function findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
_search(a, pred.x, i)
end
findnext(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
nothing_sentinel(_search(a, pred.x, i))

# iszero is special, in that the bitpattern for zero for Int8 and UInt8 is the same,
# so we can use memchr even if we search for an Int8 in an UInt8 array or vice versa
findfirst(::typeof(iszero), a::DenseUInt8OrInt8) = _search(a, zero(UInt8))
findnext(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _search(a, zero(UInt8), i)
findfirst(::typeof(iszero), a::ByteArray) = nothing_sentinel(_search(a, zero(UInt8)))
findnext(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_search(a, zero(UInt8), i))

function _search(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = firstindex(a))
fst = firstindex(a)
lst = last_byteindex(a)
if i < fst
function _search(a::Union{String,SubString{String},<:ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
if i < 1
throw(BoundsError(a, i))
end
n_bytes = lst - i + 1
if i > lst
return i == lst+1 ? nothing : throw(BoundsError(a, i))
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
GC.@preserve a begin
p = pointer(a)
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-fst, b, n_bytes)
end
return q == C_NULL ? nothing : (q-p+fst) % Int
p = pointer(a)
q = GC.@preserve a ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
return q == C_NULL ? 0 : Int(q-p+1)
end

function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
function _search(a::ByteArray, b::AbstractChar, i::Integer = 1)
if isascii(b)
_search(a,UInt8(b),i)
else
Expand All @@ -112,51 +80,41 @@ function _search(a::DenseUInt8, b::AbstractChar, i::Integer = firstindex(a))
end

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:AbstractChar},
s::Union{String, SubString{String}}, i::Integer)
s::String, i::Integer)
c = pred.x
c '\x7f' && return _rsearch(s, first_utf8_byte(c), i)
c '\x7f' && return nothing_sentinel(_rsearch(s, c % UInt8, i))
b = first_utf8_byte(c)
while true
i = _rsearch(s, b, i)
i == nothing && return nothing
isvalid(s, i) && pred(s[i]) && return i
i == 0 && return nothing
pred(s[i]) && return i
i = prevind(s, i)
end
end

function findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::DenseUInt8OrInt8)
findprev(pred, a, lastindex(a))
end
findlast(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray) =
nothing_sentinel(_rsearch(a, pred.x))

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},Int8}, a::DenseInt8, i::Integer)
_rsearch(a, pred.x, i)
end
findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},<:Union{Int8,UInt8}}, a::ByteArray, i::Integer) =
nothing_sentinel(_rsearch(a, pred.x, i))

function findprev(pred::Fix2{<:Union{typeof(isequal),typeof(==)},UInt8}, a::DenseUInt8, i::Integer)
_rsearch(a, pred.x, i)
end
findlast(::typeof(iszero), a::ByteArray) = nothing_sentinel(_rsearch(a, zero(UInt8)))
findprev(::typeof(iszero), a::ByteArray, i::Integer) = nothing_sentinel(_rsearch(a, zero(UInt8), i))

# See comments above for findfirst(::typeof(iszero)) methods
findlast(::typeof(iszero), a::DenseUInt8OrInt8) = _rsearch(a, zero(UInt8))
findprev(::typeof(iszero), a::DenseUInt8OrInt8, i::Integer) = _rsearch(a, zero(UInt8), i)

function _rsearch(a::Union{String,SubString{String},DenseUInt8OrInt8}, b::Union{Int8,UInt8}, i::Integer = last_byteindex(a))
fst = firstindex(a)
lst = last_byteindex(a)
if i < fst
return i == fst - 1 ? nothing : throw(BoundsError(a, i))
end
if i > lst
return i == lst+1 ? nothing : throw(BoundsError(a, i))
function _rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
if i < 1
return i == 0 ? 0 : throw(BoundsError(a, i))
end
GC.@preserve a begin
p = pointer(a)
q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i-fst+1)
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
return q == C_NULL ? nothing : (q-p+fst) % Int
p = pointer(a)
q = GC.@preserve a ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
return q == C_NULL ? 0 : Int(q-p+1)
end

function _rsearch(a::DenseUInt8, b::AbstractChar, i::Integer = length(a))
function _rsearch(a::ByteArray, b::AbstractChar, i::Integer = length(a))
if isascii(b)
_rsearch(a,UInt8(b),i)
else
Expand Down Expand Up @@ -266,19 +224,18 @@ end

in(c::AbstractChar, s::AbstractString) = (findfirst(isequal(c),s)!==nothing)

function _searchindex(s::Union{AbstractString,DenseUInt8OrInt8},
function _searchindex(s::Union{AbstractString,ByteArray},
t::Union{AbstractString,AbstractChar,Int8,UInt8},
i::Integer)
sentinel = firstindex(s) - 1
x = Iterators.peel(t)
if isnothing(x)
return firstindex(s) <= i <= nextind(s,lastindex(s))::Int ? i :
return 1 <= i <= nextind(s,lastindex(s))::Int ? i :
throw(BoundsError(s, i))
end
t1, trest = x
while true
i = findnext(isequal(t1),s,i)
if i === nothing return sentinel end
if i === nothing return 0 end
ii = nextind(s, i)::Int
a = Iterators.Stateful(trest)
matched = all(splat(==), zip(SubString(s, ii), a))
Expand Down Expand Up @@ -552,8 +509,9 @@ julia> findall(UInt8[1,2], UInt8[1,2,3,1,2])
!!! compat "Julia 1.3"
This method requires at least Julia 1.3.
"""
function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}},
s::Union{AbstractString, AbstractPattern, AbstractVector{UInt8}},

function findall(t::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
s::Union{AbstractString, AbstractPattern, AbstractVector{<:Union{Int8,UInt8}}},
; overlap::Bool=false)
found = UnitRange{Int}[]
i, e = firstindex(s), lastindex(s)
Expand Down Expand Up @@ -606,7 +564,7 @@ function _rsearchindex(s::AbstractString,
end
end

function _rsearchindex(s::Union{String, SubString{String}}, t::Union{String, SubString{String}}, i::Integer)
function _rsearchindex(s::String, t::String, i::Integer)
# Check for fast case of a single byte
if lastindex(t) == 1
return something(findprev(isequal(t[1]), s, i), 0)
Expand Down
49 changes: 0 additions & 49 deletions test/strings/search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,16 +155,6 @@ for str in [u8str]
@test findprev(isequal('ε'), str, 4) === nothing
end

# See the comments in #54579
@testset "Search for invalid chars" begin
@test findfirst(==('\xff'), "abc\xffde") == 4
@test findprev(isequal('\xa6'), "abc\xa69", 5) == 4
@test isnothing(findfirst(==('\xff'), "abcdeæd"))

@test isnothing(findnext(==('\xa6'), "æ", 1))
@test isnothing(findprev(==('\xa6'), "æa", 2))
end

# string forward search with a single-char string
@test findfirst("x", astr) === nothing
@test findfirst("H", astr) == 1:1
Expand Down Expand Up @@ -455,45 +445,6 @@ end
@test_throws BoundsError findprev(pattern, A, -3)
end
end

@test findall([0x01, 0x02], [0x03, 0x01, 0x02, 0x01, 0x02, 0x06]) == [2:3, 4:5]
@test isempty(findall([0x04, 0x05], [0x03, 0x04, 0x06]))
end

# Issue 54578
@testset "No conflation of Int8 and UInt8" begin
# Work for mixed types if the values are the same
@test findfirst(==(Int8(1)), [0x01]) == 1
@test findnext(iszero, Int8[0, -2, 0, -3], 2) == 3
@test findfirst(Int8[1,4], UInt8[0, 2, 4, 1, 8, 1, 4, 2]) == 6:7
@test findprev(UInt8[5, 6], Int8[1, 9, 2, 5, 6, 3], 6) == 4:5

# Returns nothing for the same methods if the values are different,
# even if the bitpatterns are the same
@test isnothing(findfirst(==(Int8(-1)), [0xff]))
@test isnothing(findnext(isequal(0xff), Int8[-1, -2, -1], 2))
@test isnothing(findfirst(UInt8[0xff, 0xfe], Int8[0, -1, -2, 1, 8, 1, 4, 2]))
@test isnothing(findprev(UInt8[0xff, 0xfe], Int8[1, 9, 2, -1, -2, 3], 6))
end

@testset "DenseArray with offsets" begin
isdefined(Main, :OffsetDenseArrays) || @eval Main include("../testhelpers/OffsetDenseArrays.jl")
OffsetDenseArrays = Main.OffsetDenseArrays

A = OffsetDenseArrays.OffsetDenseArray(collect(0x61:0x69), 100)
@test findfirst(==(0x61), A) == 101
@test findlast(==(0x61), A) == 101
@test findfirst(==(0x00), A) === nothing

@test findfirst([0x62, 0x63, 0x64], A) == 102:104
@test findlast([0x63, 0x64], A) == 103:104
@test findall([0x62, 0x63], A) == [102:103]

@test findfirst(iszero, A) === nothing
A = OffsetDenseArrays.OffsetDenseArray([0x01, 0x02, 0x00, 0x03], -100)
@test findfirst(iszero, A) == -97
@test findnext(==(0x02), A, -99) == -98
@test findnext(==(0x02), A, -97) === nothing
end

# issue 32568
Expand Down
31 changes: 0 additions & 31 deletions test/testhelpers/OffsetDenseArrays.jl

This file was deleted.

0 comments on commit 24dd706

Please sign in to comment.