diff --git a/src/TextParse.jl b/src/TextParse.jl
index 8ecb5a0..865c4f6 100644
--- a/src/TextParse.jl
+++ b/src/TextParse.jl
@@ -2,6 +2,7 @@ module TextParse
 
 using CodecZlib, WeakRefStrings, Dates, Nullables, DoubleFloats
 
+include("VectorBackedStrings.jl")
 include("lib/compat.jl")
 include("util.jl")
 include("field.jl")
diff --git a/src/VectorBackedStrings.jl b/src/VectorBackedStrings.jl
new file mode 100644
index 0000000..9283d3d
--- /dev/null
+++ b/src/VectorBackedStrings.jl
@@ -0,0 +1,101 @@
+"""
+    VectorBackedUTF8String(buffer::Vector{UInt8})
+
+An `AbstractString` whose code units are the bytes of `buffer`, decoded as UTF-8 when
+iterated. The vector is wrapped, not copied, so later changes to `buffer` are visible
+through the string.
+"""
+struct VectorBackedUTF8String <: AbstractString
+    buffer::Vector{UInt8}
+end
+
+Base.:(==)(x::VectorBackedUTF8String, y::VectorBackedUTF8String) = x.buffer == y.buffer
+
+# Write the raw bytes between double quotes. This deliberately avoids `string(x)` and
+# `print(io, x)`: both are error-throwing stubs below, so using them here would make
+# every attempt to display a value of this type throw.
+function Base.show(io::IO, x::VectorBackedUTF8String)
+    print(io, '"')
+    write(io, x.buffer)
+    print(io, '"')
+    return
+end
+
+Base.pointer(s::VectorBackedUTF8String) = pointer(s.buffer)
+
+Base.pointer(s::VectorBackedUTF8String, i::Integer) = pointer(s.buffer) + i - 1
+
+@inline Base.ncodeunits(s::VectorBackedUTF8String) = length(s.buffer)
+
+Base.codeunit(s::VectorBackedUTF8String) = UInt8
+
+@inline function Base.codeunit(s::VectorBackedUTF8String, i::Integer)
+    @boundscheck checkbounds(s.buffer, i)
+    return @inbounds s.buffer[i]
+end
+
+Base.thisind(s::VectorBackedUTF8String, i::Int) = Base._thisind_str(s, i)
+
+Base.nextind(s::VectorBackedUTF8String, i::Int) = Base._nextind_str(s, i)
+
+Base.isvalid(s::VectorBackedUTF8String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i
+
+# Decode the character starting at code unit `i`. Returns `(char, next_index)`, or
+# `nothing` once `i` is past the last code unit.
+Base.@propagate_inbounds function Base.iterate(s::VectorBackedUTF8String, i::Int=firstindex(s))
+    i > ncodeunits(s) && return nothing
+    b = codeunit(s, i)
+    u = UInt32(b) << 24
+    Base.between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
+    return Base.next_continued(s, i, u)
+end
+
+# Slow path of `iterate` for first bytes in 0x80:0xf7: ORs up to three continuation
+# bytes (those matching 0b10xxxxxx) into `u` and returns `(char, next_index)`.
+# NOTE(review): this extends a non-public Base function; confirm it exists on every
+# Julia version the package supports.
+function Base.next_continued(s::VectorBackedUTF8String, i::Int, u::UInt32)
+    u < 0xc0000000 && (i += 1; @goto ret)  # first byte is itself a continuation byte
+    n = ncodeunits(s)
+    # first continuation byte
+    (i += 1) > n && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 16
+    # second continuation byte
+    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b) << 8
+    # third continuation byte
+    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
+    @inbounds b = codeunit(s, i)
+    b & 0xc0 == 0x80 || @goto ret
+    u |= UInt32(b); i += 1
+@label ret
+    return reinterpret(Char, u), i
+end
+
+# The following functions all had implementations in WeakRefStrings (which was taken
+# as a starting point for the code in this file), but aren't needed for the
+# TextParse.jl use. For now we leave stubs that throw errors around. If this type
+# turns out to be useful beyond TextParse.jl, these should be implemented properly.
+
+Base.:(==)(x::String, y::VectorBackedUTF8String) = error("Not yet implemented.")
+
+Base.:(==)(y::VectorBackedUTF8String, x::String) = x == y
+
+Base.hash(s::VectorBackedUTF8String, h::UInt) = error("Not yet implemented.")
+
+Base.print(io::IO, s::VectorBackedUTF8String) = error("Not yet implemented.")
+
+Base.textwidth(s::VectorBackedUTF8String) = error("Not yet implemented.")
+
+Base.string(x::VectorBackedUTF8String) = error("Not yet implemented.")
+
+Base.convert(::Type{VectorBackedUTF8String}, x::String) = error("Not yet implemented.")
+
+Base.convert(::Type{String}, x::VectorBackedUTF8String) = error("Not yet implemented.")
+
+Base.String(x::VectorBackedUTF8String) = error("Not yet implemented.")
+
+Base.Symbol(x::VectorBackedUTF8String) = error("Not yet implemented.")
diff --git a/src/csv.jl b/src/csv.jl
index b5b5bbd..bb9a4d9 100644
--- a/src/csv.jl
+++ b/src/csv.jl
@@ -1,4 +1,5 @@
 using DataStructures
+using Mmap
 
 ismissingtype(T) = Missing <: T
 ismissingeltype(T) = missingtype(eltype(T))
@@ -74,6 +75,11 @@ Read CSV from `file`. Returns a tuple of 2 elements:
 """
 csvread(file::String, delim=','; kwargs...) = _csvread_f(file, delim; kwargs...)[1:2]
 
+function csvread(file::IOStream, delim=','; kwargs...)
+    mmap_data = Mmap.mmap(file)  # memory-map the stream instead of reading it into a String
+    _csvread(VectorBackedUTF8String(mmap_data), delim; kwargs...)
+end
+
 function csvread(buffer::IO, delim=','; kwargs...)
     _csvread(String(read(buffer)), delim; kwargs...)
 end
@@ -93,8 +99,8 @@ function _csvread_f(file::AbstractString, delim=','; kwargs...)
         end
     else # Otherwise just try to read the file
         return open(file, "r") do io
-            data = read(io)
-            _csvread_internal(String(data), delim; filename=file, kwargs...)
+            data = Mmap.mmap(io)  # memory-map the file instead of reading it into a String
+            _csvread_internal(VectorBackedUTF8String(data), delim; filename=file, kwargs...)
         end
     end
 end
diff --git a/src/field.jl b/src/field.jl
index 3537f74..23d0ee9 100644
--- a/src/field.jl
+++ b/src/field.jl
@@ -360,7 +360,7 @@ function tryparsenext(s::StringToken{T}, str, i, len, opts) where {T}
 end
 
 @inline function _substring(::Type{String}, str, i, j)
-    str[i:j]
+    String(str[i:j])  # convert explicitly: `str` is no longer guaranteed to be a `String`
 end
 
 @inline function _substring(::Type{T}, str, i, j) where {T<:SubString}
diff --git a/src/utf8optimizations.jl b/src/utf8optimizations.jl
index 25bc48f..f8f6ed9 100644
--- a/src/utf8optimizations.jl
+++ b/src/utf8optimizations.jl
@@ -1,4 +1,4 @@
-@inline function eatwhitespaces(str::String, i=1, len=lastindex(str))
+@inline function eatwhitespaces(str::Union{VectorBackedUTF8String, String}, i=1, len=lastindex(str))
     while i<=len
         @inbounds b = codeunit(str, i)
 
@@ -11,7 +11,7 @@
     return i
 end
 
-@inline function eatnewlines(str::String, i=1, len=lastindex(str))
+@inline function eatnewlines(str::Union{VectorBackedUTF8String, String}, i=1, len=lastindex(str))
     count = 0
     while i<=len
         @inbounds b = codeunit(str, i)
@@ -41,7 +41,7 @@ end
     return i, count
 end
 
-@inline function tryparsenext_base10_digit(T,str::String,i, len)
+@inline function tryparsenext_base10_digit(T,str::Union{VectorBackedUTF8String, String},i, len)
     i > len && @goto error
     @inbounds b = codeunit(str,i)
     diff = b-0x30
@@ -54,7 +54,7 @@ end
 
 @inline _isdigit(b::UInt8) = ( (0x30 ≤ b) & (b ≤ 0x39) )
 
-@inline function parse_uint_and_stop(str::String, i, len, n::T) where {T <: Integer}
+@inline function parse_uint_and_stop(str::Union{VectorBackedUTF8String, String}, i, len, n::T) where {T <: Integer}
     ten = T(10)
     # specialize handling of the first digit so we can return an error
     max_without_overflow = div(typemax(T)-9,10) # the larg
@@ -83,7 +83,7 @@ end
     return n, true, i
 end
 
-@inline function read_digits(str::String, i, len)
+@inline function read_digits(str::Union{VectorBackedUTF8String, String}, i, len)
     # slurp up extra digits
     while i <= len
         @inbounds b = codeunit(str, i)
@@ -95,22 +95,22 @@ end
     return i
 end
 
-@inline function _is_e(str::String, i)
+@inline function _is_e(str::Union{VectorBackedUTF8String, String}, i)
     @inbounds b = codeunit(str, i)
     return (b==0x65) | (b==0x45)
 end
 
-@inline function _is_negative(str::String, i)
+@inline function _is_negative(str::Union{VectorBackedUTF8String, String}, i)
     @inbounds b = codeunit(str, i)
     return b==0x2d
 end
 
-@inline function _is_positive(str::String, i)
+@inline function _is_positive(str::Union{VectorBackedUTF8String, String}, i)
     @inbounds b = codeunit(str, i)
     return b==0x2b
 end
 
-@inline function tryparsenext(::Numeric{F}, str::String, i, len) where {F<:AbstractFloat}
+@inline function tryparsenext(::Numeric{F}, str::Union{VectorBackedUTF8String, String}, i, len) where {F<:AbstractFloat}
     R = Nullable{F}
 
     i>len && @goto error
diff --git a/test/runtests.jl b/test/runtests.jl
index 2572bf6..0f33826 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -550,3 +550,5 @@ end
         rm(fngz)
     end
 end
+
+include("test_vectorbackedstrings.jl")
diff --git a/test/test_vectorbackedstrings.jl b/test/test_vectorbackedstrings.jl
new file mode 100644
index 0000000..fd30ad9
--- /dev/null
+++ b/test/test_vectorbackedstrings.jl
@@ -0,0 +1,45 @@
+using Test
+using TextParse: VectorBackedUTF8String
+
+@testset "VectorBackedStrings" begin
+
+buffer = UInt8['T', 'e', 's', 't']
+
+s = VectorBackedUTF8String(buffer)
+
+@test s == VectorBackedUTF8String(copy(buffer))
+
+@test pointer(s) == pointer(buffer)
+
+@test pointer(s, 2) == pointer(buffer, 2)
+
+@test ncodeunits(s) == length(buffer)
+
+@test codeunit(s) <: UInt8
+
+@test codeunit(s, 2) == UInt8('e')
+
+@test thisind(s, 2) == 2
+
+@test isvalid(s, 2) == true
+
+@test iterate(s) == ('T', 2)
+
+@test iterate(s, 2) == ('e', 3)
+
+@test iterate(s, 5) === nothing
+
+@test sprint(show, s) == "\"Test\""
+
+@test_throws ErrorException s == "Test"
+@test_throws ErrorException "Test" == s
+@test_throws ErrorException hash(s, UInt(1))
+@test_throws ErrorException print(s)
+@test_throws ErrorException textwidth(s)
+@test_throws ErrorException string(s)
+@test_throws ErrorException convert(VectorBackedUTF8String, "foo")
+@test_throws ErrorException convert(String, s)
+@test_throws ErrorException String(s)
+@test_throws ErrorException Symbol(s)
+
+end