Skip to content

Commit

Permalink
Fixed #69, added dlib.text
Browse files Browse the repository at this point in the history
  • Loading branch information
gecko0307 committed Sep 28, 2015
1 parent 7e2bff8 commit 1fa26ac
Show file tree
Hide file tree
Showing 7 changed files with 906 additions and 561 deletions.
1 change: 1 addition & 0 deletions dlib/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ public
import dlib.image;
import dlib.math;
import dlib.xml;
import dlib.text;
}
253 changes: 253 additions & 0 deletions dlib/text/lexer.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
/*
Copyright (c) 2015 Timur Gafarov
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

module dlib.text.lexer;

import std.stdio;
import std.algorithm;
import std.ascii;

import dlib.core.memory;
import dlib.container.array;
import dlib.text.utf8;

/*
 * Returns a newly allocated array holding the same code points as b.
 * The storage is obtained through New, not the GC
 * (presumably the caller releases it with the matching Delete - confirm).
 */
dchar[] copyBuffer(dchar[] b)
{
    dchar[] duplicate = New!(dchar[])(b.length);
    for (size_t idx = 0; idx < b.length; idx++)
    {
        duplicate[idx] = b[idx];
    }
    return duplicate;
}

/*
 * Returns true when both buffers have the same length and hold
 * the same code points in the same order.
 */
bool buffEq(dchar[] b1, dchar[] b2)
{
    // Built-in array equality compares lengths first, then element by element.
    return b1 == b2;
}

/*
 * General-purpose lexical analyzer.
 * Breaks the input string into a stream of lexemes according to a given
 * dictionary of delimiters. Delimiters themselves are returned as lexemes;
 * when several delimiters match at the same position, the longest one wins.
 * Assumes UTF-8 input.
 * Carriage returns are dropped, so \r\n reads as a single \n.
 * Every other ASCII whitespace character except \n is replaced with a space.
 */
class Lexer
{
    string input;
    string[] delims;
    size_t maxDelimLength = 0;
    UTF8Decoder utf8dec;

    /*
     * input  - UTF-8 text to tokenize.
     * delims - delimiter dictionary. The array is sorted in place by
     *          code point count (ascending), so the caller's array
     *          is reordered as well.
     */
    this(string input, string[] delims)
    {
        this.input = input;
        this.delims = delims;

        if (delims.length)
        {
            sort!("count(a) < count(b)")(this.delims);
            // After sorting, the last entry is the longest delimiter.
            maxDelimLength = count(delims[$-1]);
        }

        this.utf8dec = UTF8Decoder(input);
    }

    /*
     * Decodes the next code point from the input.
     * The decoder's status values (UTF8_END, UTF8_ERROR) are passed
     * through, cast to dchar.
     */
    dchar getNextChar()
    {
        return cast(dchar)utf8dec.decodeNext();
    }

    /*
     * Returns the decoder's end-of-stream flag.
     */
    bool eos()
    {
        return utf8dec.eos();
    }

    /*
     * Returns true if c is one of the characters in std.ascii.whitespace.
     */
    static bool isWhitespace(dchar c)
    {
        foreach(w; std.ascii.whitespace)
        {
            if (c == w)
            {
                return true;
            }
        }
        return false;
    }

    /*
     * Returns the number of leading code points of s1 that are equal to
     * the corresponding code points of the UTF-8 string s2.
     * Counting stops at the first mismatch, at the end of s2,
     * or at a decoding error in s2.
     */
    uint prefixCompare(dchar[] s1, string s2)
    {
        auto dec = UTF8Decoder(s2);
        uint pos = 0;
        foreach(dchar c; s1)
        {
            int g = dec.decodeNext();
            if (g == UTF8_ERROR || g == UTF8_END)
                return pos;

            if (c != cast(dchar)g)
                return pos;

            pos++;
        }

        return pos;
    }

    // Lookahead window: up to max(maxDelimLength, 1) not yet classified code points.
    DynamicArray!dchar tmp;
    // Code points of the non-delimiter lexeme collected so far.
    DynamicArray!dchar buffer;
    // Whether the lookahead window should be refilled on the next pass.
    bool fillTmp = true;

    /*
     * Returns the next lexeme: either a delimiter from the dictionary
     * or the run of text between two delimiters.
     * Returns an empty (null) array when the input is exhausted.
     * Non-empty results are produced by copyBuffer, i.e. each one is
     * a separate allocation.
     */
    dchar[] getLexeme()
    {
        while (true)
        {
            if (eos())
                fillTmp = false;

            if (fillTmp)
                fillLookahead();

            // Nothing left to classify: the decoder reported end of input
            // or an error. Hand out the pending text, if any, instead of
            // returning an empty lexeme while text is still buffered.
            if (!tmp.length)
                return flushBuffer();

            // Find the longest delimiter that fully matches
            // the start of the lookahead window.
            uint matched = 0;
            foreach (d; delims)
            {
                uint n = prefixCompare(tmp.data, d);
                if (n == count(d) && n > matched)
                    matched = n;
            }

            if (matched)
            {
                // Text preceding the delimiter goes out first;
                // the delimiter stays in the window for the next call.
                if (buffer.length)
                    return flushBuffer();

                dchar[] output = copyBuffer(tmp.data[0..matched]);
                tmp.removeLeft(matched);
                fillTmp = true;
                return output;
            }

            // No delimiter starts here: move one code point
            // into the pending text and slide the window.
            buffer.append(tmp.data[0]);
            tmp.removeLeft(1);
            fillTmp = true;
        }
    }

    /*
     * Tops up the lookahead window to the length of the longest delimiter
     * (at least one code point, so that input is still consumed when
     * there are no delimiters). Skipped carriage returns do not count
     * towards the window size. Stops early when the decoder reports
     * the end of input or an error.
     */
    private void fillLookahead()
    {
        size_t wanted = maxDelimLength ? maxDelimLength : 1;

        while (tmp.length < wanted)
        {
            int c = getNextChar();

            if (cast(dchar)c == '\r') // ignore carriage return
                continue;

            // Normalize whitespace, but keep line feeds intact.
            if (cast(dchar)c != '\n' && isWhitespace(cast(dchar)c))
                c = ' ';

            if (c == UTF8_ERROR || c == UTF8_END)
                break;

            tmp.append(cast(dchar)c);
        }
    }

    /*
     * Returns a copy of the pending text and clears it.
     * Returns null when there is no pending text.
     */
    private dchar[] flushBuffer()
    {
        if (!buffer.length)
            return null;

        dchar[] output = copyBuffer(buffer.data);
        buffer.free();
        return output;
    }
}

36 changes: 36 additions & 0 deletions dlib/text/package.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
Copyright (c) 2015 Timur Gafarov
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

module dlib.text;

public
{
import dlib.text.utf8;
import dlib.text.lexer;
}

Loading

0 comments on commit 1fa26ac

Please sign in to comment.