From e39e0c1a340484d4a68e50f4876549f9da3bfbf2 Mon Sep 17 00:00:00 2001 From: aquatiko Date: Mon, 21 Jan 2019 23:05:04 +0530 Subject: [PATCH] updated remove_pattern and tests added --- src/preprocessing.jl | 15 +++++++++++---- test/preprocessing.jl | 7 ++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/preprocessing.jl b/src/preprocessing.jl index 6606cdfe..1070807f 100644 --- a/src/preprocessing.jl +++ b/src/preprocessing.jl @@ -254,11 +254,14 @@ function remove_patterns(s::AbstractString, rex::Regex) v=codeunits(s) for m in eachmatch(rex, s) len = m.match.offset-ibegin+1 + next = nextind(s, lastindex(m.match)+m.match.offset) if len > 0 Base.write_sub(iob, v, ibegin, len) - write(iob, ' ') + if next != length(s)+1 + write(iob, ' ') + end end - ibegin = nextind(s, lastindex(m.match)+m.match.offset) + ibegin = next end len = length(v) - ibegin + 1 (len > 0) && Base.write_sub(iob, v, ibegin, len) @@ -272,17 +275,21 @@ function remove_patterns(s::SubString{T}, rex::Regex) where T <: String ibegin = 1 for m in eachmatch(rex, s) len = m.match.offset-ibegin+1 + next = nextind(s, lastindex(m.match)+m.match.offset) if len > 0 Base.write_sub(iob, data, ibegin+ioffset, len) - write(iob, ' ') + if next != length(s)+1 + write(iob, ' ') + end end - ibegin = nextind(s, lastindex(m.match)+m.match.offset) + ibegin = next end len = lastindex(s) - ibegin + 1 (len > 0) && Base.write_sub(iob, data, ibegin+ioffset, len) String(take!(iob)) end + remove_patterns!(d::FileDocument, rex::Regex) = error("FileDocument cannot be modified") function remove_patterns!(d::StringDocument, rex::Regex) diff --git a/test/preprocessing.jl b/test/preprocessing.jl index 69f977cc..d9d5ddc1 100644 --- a/test/preprocessing.jl +++ b/test/preprocessing.jl @@ -93,7 +93,7 @@ #Tests strip_punctuation regex conditions str = Document("These punctuations should be removed [-.,:;,!?'\"[](){}|\`#\$%@^&*_+<>") - answer = Document("These punctuations should be removed ") + answer = 
Document("These punctuations should be removed ") prepare!(str, strip_punctuation) @test isequal(str.text, answer.text) @@ -101,4 +101,9 @@ answer = Document("Intel tm Core i5 3300k is a geat CPU ") #tests old implementation prepare!(str, strip_punctuation) @test isequal(str.text, answer.text) + + #Tests no whitespace at end or beginning + doc = Document(" this is sample text ") + prepare!(doc, strip_whitespace) + @test isequal(doc.text, "this is sample text") end