Skip to content

Commit

Permalink
Merge pull request #87 from davidanthoff/mmap2
Browse files Browse the repository at this point in the history
Memory mapping
  • Loading branch information
davidanthoff authored Dec 15, 2018
2 parents a46e4b3 + c5ad2e6 commit 74682a9
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 12 deletions.
1 change: 1 addition & 0 deletions src/TextParse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module TextParse

using CodecZlib, WeakRefStrings, Dates, Nullables, DoubleFloats

include("VectorBackedStrings.jl")
include("lib/compat.jl")
include("util.jl")
include("field.jl")
Expand Down
86 changes: 86 additions & 0 deletions src/VectorBackedStrings.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
    VectorBackedUTF8String(buffer::Vector{UInt8})

An `AbstractString` whose code units are the bytes of `buffer`. The vector is
wrapped, not copied, so the string reflects any later change made to `buffer`.
"""
struct VectorBackedUTF8String <: AbstractString
    buffer::Vector{UInt8}
end

# Two vector-backed strings are equal when their byte buffers are equal.
Base.:(==)(x::VectorBackedUTF8String, y::VectorBackedUTF8String) = x.buffer == y.buffer

# Display the string as its raw bytes surrounded by double quotes (no escaping).
function Base.show(io::IO, x::VectorBackedUTF8String)
    print(io, '"')
    # `string(x)` and `print(io, x)` are stubbed out for this type and throw, so
    # going through them would make every attempt to display a value fail.
    # Writing the underlying bytes directly avoids both stubs.
    write(io, x.buffer)
    print(io, '"')
    return
end

# Raw pointer to the first byte of the backing buffer.
function Base.pointer(s::VectorBackedUTF8String)
    return pointer(s.buffer)
end

# Raw pointer to the `i`-th byte (1-based) of the backing buffer.
# No bounds check is performed on `i`.
function Base.pointer(s::VectorBackedUTF8String, i::Integer)
    base = pointer(s.buffer)
    return base + i - 1
end

# The number of code units is the number of bytes in the backing buffer.
@inline function Base.ncodeunits(s::VectorBackedUTF8String)
    return length(s.buffer)
end

# Code units of this string type are single bytes.
function Base.codeunit(s::VectorBackedUTF8String)
    return UInt8
end

# Return the `i`-th byte of the string. The index is bounds-checked against the
# backing buffer unless the caller has elided the check with `@inbounds`.
@inline function Base.codeunit(s::VectorBackedUTF8String, i::Integer)
    bytes = s.buffer
    @boundscheck checkbounds(bytes, i)
    @inbounds byte = bytes[i]
    return byte
end

# Delegate to Base's internal `_thisind_str` helper.
function Base.thisind(s::VectorBackedUTF8String, i::Int)
    return Base._thisind_str(s, i)
end

# Delegate to Base's internal `_nextind_str` helper.
function Base.nextind(s::VectorBackedUTF8String, i::Int)
    return Base._nextind_str(s, i)
end

# An index is valid when it is in bounds and `thisind` maps it to itself.
function Base.isvalid(s::VectorBackedUTF8String, i::Int)
    checkbounds(Bool, s, i) || return false
    return thisind(s, i) == i
end

# Iterate characters: return `(char, next_index)` for the character starting at
# code unit `i`, or `nothing` once `i` is past the last code unit.
Base.@propagate_inbounds function Base.iterate(s::VectorBackedUTF8String, i::Int=firstindex(s))
    if i > ncodeunits(s)
        return nothing
    end
    lead = codeunit(s, i)
    # the lead byte occupies the top 8 bits of the value reinterpreted as a Char
    bits = UInt32(lead) << 24
    if Base.between(lead, 0x80, 0xf7)
        # bytes in 0x80:0xf7 are handed to the multi-byte decoder
        return Base.next_continued(s, i, bits)
    end
    # any other byte forms a one-byte character on its own
    return reinterpret(Char, bits), i + 1
end

# Decode the rest of a multi-byte sequence whose lead byte is code unit `i` of `s`.
# `u` must already carry that lead byte in its top 8 bits (the `iterate` method in
# this file passes `UInt32(b) << 24`). Returns `(char, next_i)`: `u`, with any
# accepted continuation bytes OR-ed in, reinterpreted as a `Char`, and the index of
# the first code unit that was not consumed. Truncated or malformed sequences do
# not throw; decoding stops early and whatever was gathered so far is returned.
function Base.next_continued(s::VectorBackedUTF8String, i::Int, u::UInt32)
# lead byte below 0xc0: consume only this byte and return it as-is
u < 0xc0000000 && (i += 1; @goto ret)
n = ncodeunits(s)
# first continuation byte
# step to the next code unit; stop if the input ends here
(i += 1) > n && @goto ret
@inbounds b = codeunit(s, i)
# a continuation byte has the bit pattern 10xxxxxx; otherwise leave `i` on it and stop
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 16
# second continuation byte
# advance past the byte just accepted, then stop if the input is exhausted or the
# lead byte is below 0xe0 (a two-byte sequence, which is now complete)
((i += 1) > n) | (u < 0xe0000000) && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
u |= UInt32(b) << 8
# third continuation byte
# same again: stop at end of input or when the lead byte is below 0xf0
# (a three-byte sequence, which is now complete)
((i += 1) > n) | (u < 0xf0000000) && @goto ret
@inbounds b = codeunit(s, i)
b & 0xc0 == 0x80 || @goto ret
# the fourth byte fills the low 8 bits; advance past it
u |= UInt32(b); i += 1
@label ret
return reinterpret(Char, u), i
end

# The following functions all had implementations in WeakRefStrings (which was taken
# as a starting point for the code in this file), but aren't needed for the
# TextParse.jl use case. For now we leave stubs around that throw errors. If this type
# turns out to be useful beyond TextParse.jl, these should be implemented properly.

# Comparison against a `String` is not supported yet; both argument orders throw.
function Base.:(==)(x::String, y::VectorBackedUTF8String)
    error("Not yet implemented.")
end

function Base.:(==)(y::VectorBackedUTF8String, x::String)
    # forward to the (String, VectorBackedUTF8String) method above
    return x == y
end

# Hashing is not supported yet.
function Base.hash(s::VectorBackedUTF8String, h::UInt)
    error("Not yet implemented.")
end

# Printing is not supported yet.
function Base.print(io::IO, s::VectorBackedUTF8String)
    error("Not yet implemented.")
end

# Text width computation is not supported yet.
function Base.textwidth(s::VectorBackedUTF8String)
    error("Not yet implemented.")
end

# Conversion to and from other string representations is not supported yet.
function Base.string(x::VectorBackedUTF8String)
    error("Not yet implemented.")
end

function Base.convert(::Type{VectorBackedUTF8String}, x::String)
    error("Not yet implemented.")
end

function Base.convert(::Type{String}, x::VectorBackedUTF8String)
    error("Not yet implemented.")
end

function Base.String(x::VectorBackedUTF8String)
    error("Not yet implemented.")
end

function Base.Symbol(x::VectorBackedUTF8String)
    error("Not yet implemented.")
end
10 changes: 8 additions & 2 deletions src/csv.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using DataStructures
using Mmap

# `true` when `Missing` is a subtype of `T`, i.e. `T` admits `missing` values.
ismissingtype(T) = Missing <: T
# NOTE(review): this calls `missingtype`, which is not defined in the visible part
# of this file; `ismissingtype` may have been intended — confirm.
ismissingeltype(T) = missingtype(eltype(T))
Expand Down Expand Up @@ -74,6 +75,11 @@ Read CSV from `file`. Returns a tuple of 2 elements:
"""
function csvread(file::String, delim=','; kwargs...)
    # `_csvread_f` returns more than two values; callers only get the first two
    result = _csvread_f(file, delim; kwargs...)
    return result[1:2]
end

# Parse CSV from an open file stream by memory-mapping it and wrapping the
# mapped bytes in a `VectorBackedUTF8String`.
function csvread(file::IOStream, delim=','; kwargs...)
    text = VectorBackedUTF8String(Mmap.mmap(file))
    return _csvread(text, delim; kwargs...)
end

# Parse CSV from an arbitrary `IO`: read everything that remains into a `String`
# and parse that.
function csvread(buffer::IO, delim=','; kwargs...)
    contents = String(read(buffer))
    return _csvread(contents, delim; kwargs...)
end
Expand All @@ -93,8 +99,8 @@ function _csvread_f(file::AbstractString, delim=','; kwargs...)
end
else # Otherwise just try to read the file
return open(file, "r") do io
data = read(io)
_csvread_internal(String(data), delim; filename=file, kwargs...)
data = Mmap.mmap(io)
_csvread_internal(VectorBackedUTF8String(data), delim; filename=file, kwargs...)
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion src/field.jl
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ function tryparsenext(s::StringToken{T}, str, i, len, opts) where {T}
end

# Extract code units `i:j` of `str` as an owned `String`. The explicit `String`
# conversion matters when `str` is not itself a `String`, since slicing such a
# value need not produce one.
@inline function _substring(::Type{String}, str, i, j)
    return String(str[i:j])
end

@inline function _substring(::Type{T}, str, i, j) where {T<:SubString}
Expand Down
18 changes: 9 additions & 9 deletions src/utf8optimizations.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@inline function eatwhitespaces(str::String, i=1, len=lastindex(str))
@inline function eatwhitespaces(str::Union{VectorBackedUTF8String, String}, i=1, len=lastindex(str))
while i<=len
@inbounds b = codeunit(str, i)

Expand All @@ -11,7 +11,7 @@
return i
end

@inline function eatnewlines(str::String, i=1, len=lastindex(str))
@inline function eatnewlines(str::Union{VectorBackedUTF8String, String}, i=1, len=lastindex(str))
count = 0
while i<=len
@inbounds b = codeunit(str, i)
Expand Down Expand Up @@ -41,7 +41,7 @@ end
return i, count
end

@inline function tryparsenext_base10_digit(T,str::String,i, len)
@inline function tryparsenext_base10_digit(T,str::Union{VectorBackedUTF8String, String},i, len)
i > len && @goto error
@inbounds b = codeunit(str,i)
diff = b-0x30
Expand All @@ -54,7 +54,7 @@ end

# `true` when byte `b` is an ASCII decimal digit, i.e. lies in '0' (0x30) through '9' (0x39).
# Non-short-circuiting `&` is used to combine the two range checks.
@inline _isdigit(b::UInt8) = ( (0x30 <= b) & (b <= 0x39) )

@inline function parse_uint_and_stop(str::String, i, len, n::T) where {T <: Integer}
@inline function parse_uint_and_stop(str::Union{VectorBackedUTF8String, String}, i, len, n::T) where {T <: Integer}
ten = T(10)
# specialize handling of the first digit so we can return an error
max_without_overflow = div(typemax(T)-9,10) # the larg
Expand Down Expand Up @@ -83,7 +83,7 @@ end
return n, true, i
end

@inline function read_digits(str::String, i, len)
@inline function read_digits(str::Union{VectorBackedUTF8String, String}, i, len)
# slurp up extra digits
while i <= len
@inbounds b = codeunit(str, i)
Expand All @@ -95,22 +95,22 @@ end
return i
end

@inline function _is_e(str::String, i)
@inline function _is_e(str::Union{VectorBackedUTF8String, String}, i)
@inbounds b = codeunit(str, i)
return (b==0x65) | (b==0x45)
end

@inline function _is_negative(str::String, i)
@inline function _is_negative(str::Union{VectorBackedUTF8String, String}, i)
@inbounds b = codeunit(str, i)
return b==0x2d
end

@inline function _is_positive(str::String, i)
@inline function _is_positive(str::Union{VectorBackedUTF8String, String}, i)
@inbounds b = codeunit(str, i)
return b==0x2b
end

@inline function tryparsenext(::Numeric{F}, str::String, i, len) where {F<:AbstractFloat}
@inline function tryparsenext(::Numeric{F}, str::Union{VectorBackedUTF8String, String}, i, len) where {F<:AbstractFloat}
R = Nullable{F}

i>len && @goto error
Expand Down
2 changes: 2 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -550,3 +550,5 @@ end
rm(fngz)
end
end

include("test_vectorbackedstrings.jl")
43 changes: 43 additions & 0 deletions test/test_vectorbackedstrings.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using Test
using TextParse: VectorBackedUTF8String

@testset "VectorBackedStrings" begin
    bytes = UInt8['T', 'e', 's', 't']
    s = VectorBackedUTF8String(bytes)

    # equality compares buffer contents, not buffer identity
    @test s == VectorBackedUTF8String(copy(bytes))

    # pointers refer to the wrapped buffer
    @test pointer(s) == pointer(bytes)
    @test pointer(s, 2) == pointer(bytes, 2)

    # code unit interface
    @test ncodeunits(s) == length(bytes)
    @test codeunit(s) <: UInt8
    @test codeunit(s, 2) == UInt8('e')

    # index helpers
    @test thisind(s, 2) == 2
    @test isvalid(s, 2) == true

    # character iteration, including the end-of-string sentinel
    @test iterate(s) == ('T', 2)
    @test iterate(s, 2) == ('e', 3)
    @test iterate(s, 5) == nothing

    # operations that are deliberately stubbed out must throw
    @test_throws ErrorException s == "Test"
    @test_throws ErrorException "Test" == s
    @test_throws ErrorException hash(s, UInt(1))
    @test_throws ErrorException print(s)
    @test_throws ErrorException textwidth(s)
    @test_throws ErrorException string(s)
    @test_throws ErrorException convert(VectorBackedUTF8String, "foo")
    @test_throws ErrorException convert(String, s)
    @test_throws ErrorException String(s)
    @test_throws ErrorException Symbol(s)
end

0 comments on commit 74682a9

Please sign in to comment.