Skip to content

Commit

Permalink
Merge pull request #121 from aquatiko/add-regex
Browse files Browse the repository at this point in the history
changed remove_patterns
  • Loading branch information
aviks authored Mar 29, 2019
2 parents 7935a3f + e39e0c1 commit 53799b2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 5 deletions.
15 changes: 11 additions & 4 deletions src/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,14 @@ function remove_patterns(s::AbstractString, rex::Regex)
v=codeunits(s)
for m in eachmatch(rex, s)
len = m.match.offset-ibegin+1
next = nextind(s, lastindex(m.match)+m.match.offset)
if len > 0
Base.write_sub(iob, v, ibegin, len)
write(iob, ' ')
if next != length(s)+1
write(iob, ' ')
end
end
ibegin = nextind(s, lastindex(m.match)+m.match.offset)
ibegin = next
end
len = length(v) - ibegin + 1
(len > 0) && Base.write_sub(iob, v, ibegin, len)
Expand All @@ -272,17 +275,21 @@ function remove_patterns(s::SubString{T}, rex::Regex) where T <: String
ibegin = 1
for m in eachmatch(rex, s)
len = m.match.offset-ibegin+1
next = nextind(s, lastindex(m.match)+m.match.offset)
if len > 0
Base.write_sub(iob, data, ibegin+ioffset, len)
write(iob, ' ')
if next != length(s)+1
write(iob, ' ')
end
end
ibegin = nextind(s, lastindex(m.match)+m.match.offset)
ibegin = next
end
len = lastindex(s) - ibegin + 1
(len > 0) && Base.write_sub(iob, data, ibegin+ioffset, len)
String(take!(iob))
end


# Pattern removal is rejected for a FileDocument: this method always raises an
# error whose message states that a FileDocument cannot be modified.
function remove_patterns!(d::FileDocument, rex::Regex)
    error("FileDocument cannot be modified")
end

function remove_patterns!(d::StringDocument, rex::Regex)
Expand Down
7 changes: 6 additions & 1 deletion test/preprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,17 @@

#Tests strip_punctuation regex conditions
str = Document("These punctuations should be removed [-.,:;,!?'\"[](){}|\`#\$%@^&*_+<>")
answer = Document("These punctuations should be removed ")
answer = Document("These punctuations should be removed ")
prepare!(str, strip_punctuation)
@test isequal(str.text, answer.text)

str = Document("Intel(tm) Core i5-3300k, is a geat CPU! ")
answer = Document("Intel tm Core i5 3300k is a geat CPU ") #tests old implementation
prepare!(str, strip_punctuation)
@test isequal(str.text, answer.text)

#Tests no whitespace at end or beginning
doc = Document(" this is sample text ")
prepare!(doc, strip_whitespace)
@test isequal(doc.text, "this is sample text")
end

0 comments on commit 53799b2

Please sign in to comment.