diff --git a/dlib/package.d b/dlib/package.d index 108d862c..29cd458c 100644 --- a/dlib/package.d +++ b/dlib/package.d @@ -39,4 +39,5 @@ public import dlib.image; import dlib.math; import dlib.xml; + import dlib.text; } \ No newline at end of file diff --git a/dlib/text/lexer.d b/dlib/text/lexer.d new file mode 100644 index 00000000..1c0e50d5 --- /dev/null +++ b/dlib/text/lexer.d @@ -0,0 +1,253 @@ +/* +Copyright (c) 2015 Timur Gafarov + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/
+
+module dlib.text.lexer;
+
+import std.stdio;
+import std.algorithm;
+import std.ascii;
+
+import dlib.core.memory;
+import dlib.container.array;
+import dlib.text.utf8;
+
+/*
+ * Returns a freshly allocated (New) element-by-element copy of b.
+ * The result has the same length as b.
+ */
+dchar[] copyBuffer(dchar[] b)
+{
+    auto res = New!(dchar[])(b.length);
+    foreach(i, c; b)
+        res[i] = c;
+    return res;
+}
+
+/*
+ * Returns true when b1 and b2 have the same length and equal elements.
+ */
+bool buffEq(dchar[] b1, dchar[] b2)
+{
+    if (b1.length != b2.length)
+        return false;
+    foreach(i, c; b1)
+        if (c != b2[i])
+            return false;
+    return true;
+}
+
+/*
+ * General-purpose lexical analyzer.
+ * Breaks the input string to a stream of lexemes according to a given dictionary.
+ * Assumes UTF-8 input.
+ * Treats \r\n as a single \n (every '\r' is dropped; any other ASCII
+ * whitespace except '\n' is normalized to a single space character).
+ *
+ * Delimiters are returned as lexemes of their own; the text between two
+ * delimiters is returned as one lexeme.
+ */
+class Lexer
+{
+    string input;
+    string[] delims;
+    // Length, in code points, of the longest delimiter. Also the size of
+    // the look-ahead window kept in tmp.
+    size_t maxDelimLength = 0;
+    UTF8Decoder utf8dec;
+
+    /*
+     * input  - UTF-8 text to split.
+     * delims - delimiter dictionary. The slice is stored and sorted in place
+     *          by code point count, so the caller's array is reordered too.
+     */
+    this(string input, string[] delims)
+    {
+        this.input = input;
+        this.delims = delims;
+
+        if (delims.length)
+        {
+            sort!("count(a) < count(b)")(this.delims);
+            maxDelimLength = count(delims[$-1]);
+        }
+
+        this.utf8dec = UTF8Decoder(input);
+    }
+
+    /*
+     * Releases the look-ahead window and the pending-lexeme buffer.
+     * Without this, deleting a Lexer that was not drained to the end of its
+     * input would leave both arrays allocated.
+     */
+    ~this()
+    {
+        tmp.free();
+        buffer.free();
+    }
+
+    // Decodes the next code point. UTF8_END / UTF8_ERROR come back cast to dchar.
+    dchar getNextChar()
+    {
+        return cast(dchar)utf8dec.decodeNext();
+    }
+
+    // True when the decoder has consumed the whole input string.
+    bool eos()
+    {
+        return utf8dec.eos();
+    }
+
+    // True when c is one of the characters in std.ascii.whitespace.
+    static bool isWhitespace(dchar c)
+    {
+        foreach(w; std.ascii.whitespace)
+        {
+            if (c == w)
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /*
+     * Returns the number of leading code points of s1 that match s2,
+     * comparing position by position. Stops at the first mismatch, at the
+     * end of either argument, or when s2 fails to decode.
+     */
+    uint prefixCompare(dchar[] s1, string s2)
+    {
+        auto dec = UTF8Decoder(s2);
+        uint pos = 0;
+        foreach(dchar c; s1)
+        {
+            int g = dec.decodeNext();
+            if (g == UTF8_ERROR || g == UTF8_END)
+                return pos;
+
+            if (c != cast(dchar)g)
+                return pos;
+
+            pos++;
+        }
+
+        return pos;
+    }
+
+    // Look-ahead window: up to maxDelimLength decoded, normalized characters.
+    DynamicArray!dchar tmp;
+    // Characters of the non-delimiter lexeme collected so far.
+    DynamicArray!dchar buffer;
+    // False once the end of input was seen; the window is then only drained.
+    bool fillTmp = true;
+
+    /*
+     * Returns the next lexeme as a newly allocated array (see copyBuffer),
+     * or an empty array when there is nothing more to return.
+     *
+     * An empty result is also returned when the decoder reports an error
+     * while the look-ahead window is empty and input remains.
+     */
+    dchar[] getLexeme()
+    {
+        bool ready = false;
+        dchar[] output;
+
+        while(!ready)
+        {
+            if (eos())
+            {
+                fillTmp = false;
+
+                if (!tmp.length)
+                {
+                    // Nothing left to scan: flush whatever was collected.
+                    if (buffer.length)
+                    {
+                        output = copyBuffer(buffer.data);
+                        buffer.free();
+                        ready = true;
+                    }
+
+                    break;
+                }
+            }
+
+            if (fillTmp)
+            {
+                // Top the window up to maxDelimLength characters. A skipped
+                // '\r' must not use up a slot: otherwise the window can end
+                // up empty in the middle of the input and the caller sees a
+                // premature end of the lexeme stream.
+                while (tmp.length < maxDelimLength)
+                {
+                    int c = utf8dec.decodeNext();
+
+                    if (c == UTF8_ERROR || c == UTF8_END)
+                        break;
+
+                    if (c == '\r') // ignore carriage return
+                        continue;
+
+                    if (c != '\n' && isWhitespace(cast(dchar)c))
+                        c = ' ';
+
+                    tmp.append(cast(dchar)c);
+                }
+
+                if (tmp.length == 0)
+                {
+                    // Input ran out while filling (e.g. a trailing '\r'):
+                    // go round again so the end-of-input branch above can
+                    // flush the pending buffer instead of dropping it.
+                    if (eos())
+                        continue;
+
+                    ready = true;
+                    break;
+                }
+            }
+
+            // Longest delimiter that matches the start of the window in full.
+            uint pos = 0;
+            foreach(d; delims)
+            {
+                uint newPos = prefixCompare(tmp.data, d);
+                if (newPos == count(d) && newPos > pos)
+                    pos = newPos;
+            }
+
+            if (pos)
+            {
+                if (buffer.length)
+                {
+                    // Text precedes the delimiter: return the text first and
+                    // leave the delimiter in the window for the next call.
+                    output = copyBuffer(buffer.data);
+                    buffer.free();
+                    ready = true;
+                }
+                else
+                {
+                    output = copyBuffer(tmp.data[0..pos]);
+                    tmp.removeLeft(pos);
+                    fillTmp = true;
+                    ready = true;
+                }
+            }
+            else
+            {
+                // No delimiter here: move one character into the lexeme.
+                buffer.append(tmp.data[0]);
+                tmp.removeLeft(1);
+                fillTmp = true;
+            }
+        }
+
+        return output;
+    }
+}
+
diff --git a/dlib/text/package.d b/dlib/text/package.d
new file mode 100644
index 00000000..0748b5d9
--- /dev/null
+++ b/dlib/text/package.d
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2015 Timur Gafarov
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of
the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/ + +module dlib.text; + +public +{ + import dlib.text.utf8; + import dlib.text.lexer; +} + diff --git a/dlib/text/utf8.d b/dlib/text/utf8.d new file mode 100644 index 00000000..5e6b4051 --- /dev/null +++ b/dlib/text/utf8.d @@ -0,0 +1,125 @@ +/* +Copyright (c) 2015 Timur Gafarov + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
+*/
+
+module dlib.text.utf8;
+
+/*
+ * Simple and pretty fast UTF-8 decoder
+ */
+
+// Sentinel results. Both are negative, so they cannot be confused with a
+// decoded byte or code point.
+enum UTF8_END = -1;   // no more input
+enum UTF8_ERROR = -2; // malformed, overlong, surrogate or out-of-range sequence
+
+/*
+ * Decodes a UTF-8 string one code point at a time.
+ *
+ * Usage: construct from a string, then call decodeNext() until it returns
+ * UTF8_END. UTF8_ERROR reports a bad sequence; the bytes read while trying
+ * to decode it stay consumed.
+ */
+struct UTF8Decoder
+{
+    size_t index = 0;  // offset of the next unread byte in input
+    int character = 0; // number of sequences decodeNext has started
+    int b = 0;         // offset at which the most recent sequence started
+    string input;
+
+    // Returns the next byte (0 to 255) and advances, or UTF8_END at the end.
+    int get()
+    {
+        if (index >= input.length)
+            return UTF8_END;
+        auto c = input[index] & 0xFF;
+        index++;
+        return c;
+    }
+
+    // Reads one byte and returns its low six bits when it is a continuation
+    // byte (10xxxxxx), UTF8_ERROR otherwise. The byte is consumed either way.
+    int cont()
+    {
+        int c = get();
+        return ((c & 0xC0) == 0x80) ? (c & 0x3F): UTF8_ERROR;
+    }
+
+    this(string str)
+    {
+        index = 0;
+        character = 0;
+        b = 0;
+        input = str;
+    }
+
+    /*
+     * Decodes the next code point.
+     * Returns the code point (0 to 1114111), UTF8_END when the input is
+     * exhausted, or UTF8_ERROR for an invalid lead byte, a missing
+     * continuation byte, an overlong encoding, a surrogate (55296 to 57343)
+     * or a value above 1114111.
+     */
+    int decodeNext()
+    {
+        int c; // the first byte of the character
+        int r; // the result
+
+        if (index >= input.length)
+            return index == input.length ? UTF8_END : UTF8_ERROR;
+
+        // index is size_t while b is int: the narrowing has to be explicit.
+        b = cast(int)index;
+        character++;
+        c = get();
+
+        // Zero continuation (0 to 127)
+        if ((c & 0x80) == 0)
+            return c;
+
+        // One continuation (128 to 2047)
+        if ((c & 0xE0) == 0xC0)
+        {
+            int c1 = cont();
+            if (c1 >= 0)
+            {
+                r = ((c & 0x1F) << 6) | c1;
+                return r >= 128 ? r : UTF8_ERROR;
+            }
+        }
+        // Two continuation (2048 to 55295 and 57344 to 65535)
+        else if ((c & 0xF0) == 0xE0)
+        {
+            int c1 = cont();
+            int c2 = cont();
+            if ((c1 | c2) >= 0)
+            {
+                r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
+                return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
+            }
+        }
+        // Three continuation (65536 to 1114111)
+        else if ((c & 0xF8) == 0xF0)
+        {
+            int c1 = cont();
+            int c2 = cont();
+            int c3 = cont();
+            if ((c1 | c2 | c3) >= 0)
+            {
+                // The assembled bits already are the code point; no offset
+                // is added. Values below 65536 are overlong encodings.
+                r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
+                return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
+            }
+        }
+
+        return UTF8_ERROR;
+    }
+
+    // True when every byte of input has been consumed.
+    bool eos()
+    {
+        return (index >= input.length);
+    }
+}
+
diff --git a/dlib/xml/document.d b/dlib/xml/document.d
index df2b4830..d5a60ac9 100644
--- a/dlib/xml/document.d
+++ b/dlib/xml/document.d
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2013-2014 Timur Gafarov
+Copyright (c) 2015 Timur Gafarov
 
 Boost Software License - Version 1.0 - August 17th, 2003
 
@@ -28,375 +28,480 @@ DEALINGS IN THE SOFTWARE.
module dlib.xml.document; -private +import std.stdio; +import std.conv; +import dlib.core.memory; +import dlib.core.compound; +import dlib.container.array; +import dlib.container.stack; +import dlib.text.lexer; +import dlib.text.utf8; +import dlib.xml.node; + +/* + * GC-free parser for a subset of XML. + * Has the following limitations: + * - supports only ASCII and UTF-8 encodings + * - doesn't support DOCTYPE and some other special tags + */ + +string[] xmlDelims = +[ + "<", ">", "", "=", "", "\"", + "", "", + "\"", "'", " ", "\n", "\r" +]; + +enum XmlToken { - import std.stdio; - import std.file; - import std.conv; - import std.utf; - - import dlib.container.stack; - import dlib.xml.lexer; - import dlib.xml.node; + TagOpen, + TagClose, + TagName, + Assignment, + Quote, + PropValue } -final class XmlDocument +dchar[] emptyStr; + +class XmlDocument { - private XmlNode prolog = null; - XmlNode root = null; - dstring type; - - @property dstring xmlVersion() + XmlNode prolog = null; + XmlNode root; + + this() { - if (prolog is null) - return "1.0"; - else - { - if ("version"d in prolog.properties) - return prolog.properties["version"]; - else - return "1.1"; - } + root = New!XmlNode(emptyStr); + } + + ~this() + { + Delete(root); + if (prolog) + Delete(prolog); } +} - @property XmlNode rootNode() +dchar[] errMsg(string s1, dchar[] s2) +{ + DynamicArray!dchar arr; + UTF8Decoder dec = UTF8Decoder(s1); + int d; + do { - if (root.children) - return root.children[0]; - else - return null; + d = dec.decodeNext(); + if (d != UTF8_END && d != UTF8_ERROR) + arr.append(cast(dchar)d); + } while (d != UTF8_END && d != UTF8_ERROR); + + arr.append('\"'); + + foreach(c; s2) + { + arr.append(c); } - enum XmlToken + arr.append('\"'); + + dchar[] res = copyBuffer(arr.data); + arr.free(); + + return res; +} + +XmlDocument parseXML(string text) +{ + auto res = parseXMLUnmanaged(text); + if (res[0]) + { + return res[0]; + } + else { - TagOpen, - TagClose, - TagName, - Assignment, - 
PropValue + string errStr = to!string(res[1]); + Delete(res[1]); + throw new Exception(errStr); } +} + +Compound!(XmlDocument, dchar[]) parseXMLUnmanaged(string text) +{ + XmlDocument doc = New!XmlDocument(); + Lexer lex = New!Lexer(text, xmlDelims); + Stack!XmlNode nodeStack; + + nodeStack.push(doc.root); + + XmlToken expect = XmlToken.TagOpen; + + bool tagOpening = false; + bool xmlPrologDeclaration = false; + bool comment = false; + bool cdata = false; + bool lastCharWasWhitespace = false; - this(string text) + dchar[] tmpPropName; + DynamicArray!dchar tmpPropValue; + + bool finished = false; + + bool failed = false; + dchar[] errStr; + void error(string text, dchar[] t) { - this(text.toUTF32); + errStr = errMsg(text, t); + failed = true; } - this(dstring text) + dchar[] token; + while(!finished) { - Lexer lex = new Lexer(text); - lex.addDelimiters( - [ - "<", ">", "", "=", "", "\"", "&", - "", "" - ]); - - Stack!XmlNode nodeStack; - root = new XmlNode(""); - nodeStack.push(root); - - XmlToken expect = XmlToken.TagOpen; - bool tagOpening = false; - bool xmlPrologDeclaration = false; - bool ampersand = false; - bool doctype = false; - bool comment = false; - bool cdata = false; - dstring tmpPropName; - dstring token; - - do + token = lex.getLexeme(); + + if (!token.length) + break; + + switch(token) { - token = lex.getLexeme(); + case "": + if (comment) break; + if (cdata) + cdata = false; + else { - case "": - if (comment) break; - if (cdata) - cdata = false; - else - throw new Exception("Unexpected token \'" ~ to!string(token) ~ "\'"); - break; + case "": + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (comment) + comment = false; + else + { + error("Unexpected token ", token); + finished = true; + } + break; - case "-->": - if (cdata) - { - XmlNode node = new XmlNode("", nodeStack.top); - node.text = token; - } - else - if (comment) - comment = false; - else - throw new 
Exception("Unexpected token \'" ~ to!string(token) ~ "\'"); - break; + case "<": + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (expect == XmlToken.TagOpen) + { + expect = XmlToken.TagName; + tagOpening = true; + } + else + { + error("Unexpected token ", token); + finished = true; + } + break; - case "<": - if (comment) break; - if (cdata) - { - XmlNode node = new XmlNode("", nodeStack.top); - node.text = token; - } - else if (expect == XmlToken.TagOpen) - { - expect = XmlToken.TagName; - tagOpening = true; - } - else - throw new Exception("Unexpected token \'" ~ to!string(token) ~ "\'"); - break; - - case ">": - if (comment) break; - if (cdata) - { - XmlNode node = new XmlNode("", nodeStack.top); - node.text = token; - } - else if (expect == XmlToken.TagClose && !xmlPrologDeclaration) - { - expect = XmlToken.TagOpen; - if (doctype) - doctype = false; - } - else - throw new Exception("Unexpected token \'" ~ to!string(token) ~ "\'"); - break; + case ">": + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (expect == XmlToken.TagClose && !xmlPrologDeclaration) + { + expect = XmlToken.TagOpen; + } + else + { + error("Unexpected token ", token); + finished = true; + } + break; - case "": - if (comment) break; - if (cdata) - { - XmlNode node = new XmlNode("", nodeStack.top); - node.text = token; - } - else if (expect == XmlToken.TagClose && !xmlPrologDeclaration) - { - expect = XmlToken.TagOpen; - nodeStack.pop(); - } - else - throw new Exception("Unexpected token \'" ~ to!string(token) ~ "\'"); - break; - - case "": - if (comment) break; - if (cdata) - { - XmlNode node = new XmlNode("", nodeStack.top); - node.text = token; - } - else if (expect == XmlToken.TagClose && xmlPrologDeclaration) - { - expect = XmlToken.TagOpen; - xmlPrologDeclaration = false; - nodeStack.pop(); - } - break; + 
case "": + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (expect == XmlToken.TagClose && !xmlPrologDeclaration) + { + expect = XmlToken.TagOpen; + nodeStack.pop(); + } + else + { + error("Unexpected token ", token); + finished = true; + } + break; + + case "": + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (expect == XmlToken.TagClose && xmlPrologDeclaration) + { + expect = XmlToken.TagOpen; + xmlPrologDeclaration = false; + nodeStack.pop(); + } + break; + + case "=": + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (expect == XmlToken.Assignment) + { + expect = XmlToken.Quote; + } + else + { + error("Unexpected token ", token); + finished = true; + } + break; + + case "\"": + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + } + else if (expect == XmlToken.Quote) + { + expect = XmlToken.PropValue; + } + else if (expect == XmlToken.PropValue) + { + expect = XmlToken.TagClose; + nodeStack.top.properties[copyBuffer(tmpPropName)] = copyBuffer(tmpPropValue.data); + tmpPropValue.free(); + } + else + { + error("Unexpected token ", token); + finished = true; + } + break; - if (cdata) + default: + if (comment) break; + if (cdata) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = copyBuffer(token); + break; + } + + if (token != " " && token != "\n") + lastCharWasWhitespace = false; + + if (token == " " || token == "\n") + { + if (expect == XmlToken.TagOpen) + { + if (nodeStack.top.children.length) { - XmlNode node = new XmlNode("", nodeStack.top); - node.text = token; + if (nodeStack.top.children.data[$-1].text == " ") + break; } - else if (token.isWhitespace || token == "\n") + else if 
(!nodeStack.top.text.length) + break; + else if (nodeStack.top.text[$-1] == ' ') + break; + + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + node.text = New!(dchar[])(1); + node.text[0] = ' '; + } + else if (expect == XmlToken.PropValue) + { + if (!lastCharWasWhitespace) { - if (expect == XmlToken.TagOpen) - { - if (nodeStack.top.children.length) - { - if (nodeStack.top.children[$-1].text == " ") - break; - } - else if (!nodeStack.top.text.length) - break; - else if (nodeStack.top.text[$-1] == ' ') - break; - - XmlNode node = new XmlNode("", nodeStack.top); - node.text = " "; - } + tmpPropValue.append(' '); + lastCharWasWhitespace = true; } - else if (expect == XmlToken.TagName) + } + } + else if (expect == XmlToken.TagName) + { + expect = XmlToken.TagClose; + if (xmlPrologDeclaration) + { + if (tagOpening) { - expect = XmlToken.TagClose; - - if (xmlPrologDeclaration) + if (doc.prolog is null) { - if (tagOpening) + if (token == "xml") { - if (prolog is null) - { - if (token == "xml") - { - prolog = new XmlNode(token); - nodeStack.push(prolog); - tagOpening = false; - } - else - throw new Exception("Illegal XML prolog"); - } - else - throw new Exception("More than one XML prolog is not allowed"); + doc.prolog = New!XmlNode(copyBuffer(token)); + nodeStack.push(doc.prolog); + tagOpening = false; } else { - nodeStack.pop(); + error("Illegal XML prolog", emptyStr); + finished = true; } } - else if (token == "!DOCTYPE") - { - //writeln("!DOCTYPE"); - expect = XmlToken.TagClose; - doctype = true; - } else { - if (tagOpening) - { - XmlNode node = new XmlNode(token, nodeStack.top); - nodeStack.push(node); - tagOpening = false; - } - else - { - if (token == nodeStack.top.name) - nodeStack.pop(); - else - throw new Exception("Mismatched tag"); - } + error("More than one XML prolog is not allowed", emptyStr); + finished = true; } } - else if (expect == XmlToken.TagOpen) + else { - XmlNode node = new XmlNode("", nodeStack.top); - if (ampersand) - { - if (token[0] == '#' 
&& token.length > 1) - { - if (token[1] == 'x') - node.text ~= cast(dchar)hexCharacterCode(token[2..$]); - else - node.text ~= cast(dchar)to!uint(token[1..$-1]); - } - ampersand = false; - } - else node.text = token; + nodeStack.pop(); } - else if (expect == XmlToken.TagClose) + } + else if (tagOpening) + { + XmlNode node = New!XmlNode(copyBuffer(token), nodeStack.top); + nodeStack.push(node); + tagOpening = false; + } + else + { + if (buffEq(token, nodeStack.top.name)) + nodeStack.pop(); + else { - if (doctype) - { - expect = XmlToken.TagClose; - type = token; - } - else - { - expect = XmlToken.Assignment; - tmpPropName = token; - } + error("Mismatched tag", emptyStr); + finished = true; } - else if (expect == XmlToken.PropValue) + } + } + else if (expect == XmlToken.TagOpen) + { + XmlNode node = New!XmlNode(emptyStr, nodeStack.top); + if (token[0] == '&') + { + if (token[1] == '#' && token.length > 2) { - expect = XmlToken.TagClose; - dstring val; - if (token[0] == '\"' && token[$-1] == '\"') - val = token[1..$-1]; + dchar c = '?'; + if (token[2] == 'x') + { + int code = hexCharacterCode(token[3..$]); + if (code == -1) + { + error("Failed to parse character reference ", token); + finished = true; + } + else + c = cast(dchar)code; + } else - val = token; - nodeStack.top.addProperty(tmpPropName, val); + c = cast(dchar)to!uint(token[2..$-1]); + + node.appendText(c); } - else - throw new Exception("Unexpected token \'" ~ - to!string(token) ~ "\', expected " ~ to!string(expect)); - break; + } + else + node.text = copyBuffer(token); + } + else if (expect == XmlToken.TagClose) + { + expect = XmlToken.Assignment; + + if (tmpPropName.length) + Delete(tmpPropName); + tmpPropName = copyBuffer(token); + } + else if (expect == XmlToken.PropValue) + { + tmpPropValue.append(token); } - } + else + { + error("Unexpected token ", token); + finished = true; + } + break; } - while (token.length); + + Delete(token); + } + + if (tmpPropName.length) + Delete(tmpPropName); + 
tmpPropValue.free(); + + nodeStack.free(); + Delete(lex); + + if (failed) + { + Delete(doc); + doc = null; + return compound(doc, errStr); } + else + return compound(doc, emptyStr); } -int hexCharacterCode(dstring input) +int hexCharacterCode(dchar[] input) { int res; foreach(c; input) @@ -415,18 +520,8 @@ int hexCharacterCode(dstring input) case ';': return res; default: - throw new Exception("Expected hex digit in character reference, found \'" ~ to!char(c) ~ "\'"); + return -1; } } return res; } - -/+ - // usage: - XmlDocument doc = new XmlDocument(readText("test1.xml")); - auto b = doc.rootNode.getChildByName("b"); - if (b) - { - writeln(b[0].getText); - } -+/ diff --git a/dlib/xml/lexer.d b/dlib/xml/lexer.d deleted file mode 100644 index fb8454fa..00000000 --- a/dlib/xml/lexer.d +++ /dev/null @@ -1,201 +0,0 @@ -/* -Copyright (c) 2013 Timur Gafarov - -Boost Software License - Version 1.0 - August 17th, 2003 - -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - -module dlib.xml.lexer; - -private -{ - import std.stdio; - import std.string; - import std.ascii; - import std.conv; - import std.algorithm; -} - -bool matches(T)(T input, T[] list) -{ - foreach(v; list) - if (input == to!T(v)) return true; - return false; -} - -bool isWhitespace(T)(T[] input) -{ - if (input.length==0) return false; - else if (matches!(T)(input[0], to!(T[])(std.ascii.whitespace))) return true; - return false; -} - -final class Lexer -{ - private: - dstring source; - int pos = 0; - dstring[] delimiters; - - - static dstring[] stddelimiters = - [ - "==","!=","<=",">=","+=","-=","*=","/=", - "++","--","||","&&","<<",">>","<>", - "+","-","*","/","%","=","|","^","~","<",">","!", - "(",")","{","}","[","]", - ";",":",",","@","#","$","&", - "\\","\"","\'" - ]; - - public: - dstring singleLineComment = ""; - dstring[] stringLiteralQuote = ["\""]; - - this(dstring src) - { - source = src; - } - - void addDelimiter(dstring op) - { - delimiters ~= op; - sort!("a.length > b.length")(delimiters); - } - - void addDelimiters(dstring[] op = stddelimiters) - { - delimiters ~= op; - sort!("a.length > b.length")(delimiters); - } - - dstring getLexeme() - { - dstring result; - bool commentSingleLine = false; - bool stringLiteral = false; - dstring tempStringLiteral = ""; - bool satisfied = false; - - while(!satisfied) - { - dstring lexeme = getLexemeUnfiltered(); - if (!lexeme.length) satisfied = true; - else if (lexeme == "\n") - { - if (!stringLiteral) - { - commentSingleLine = false; - result = lexeme; - satisfied = true; - } - else tempStringLiteral ~= lexeme; - } - else if (lexeme == singleLineComment) { if (!stringLiteral) commentSingleLine = true; } - else if (!commentSingleLine) - { - if 
(matches(lexeme, stringLiteralQuote)) - { - tempStringLiteral ~= lexeme; - if (stringLiteral) - { - if (lexeme[0] == tempStringLiteral[0]) - { - result = tempStringLiteral; - stringLiteral = false; - satisfied = true; - } - } - else stringLiteral = true; - } - else - { - if (stringLiteral) - tempStringLiteral ~= lexeme; - else //if (!lexeme.isWhitespace) - { - result = lexeme; - satisfied = true; - } - } - } - } - return result; - } - - private: - dstring getLexemeUnfiltered() - { - dstring temp; - while (pos < source.length) - { - dstring forw = matchForward(pos, delimiters); - if (source[pos] == '\n') - { - if (!temp.length) { temp ~= '\n'; pos++; } - break; - } - else if (matches(source[pos], to!(dstring)(std.ascii.whitespace))) - { - if (!temp.length) { temp ~= source[pos]; pos++; } - break; - } - else if (forw.length > 0) - { - if (!temp.length) - { - temp ~= forw; - pos += forw.length; - break; - } - else break; - } - else - { - temp ~= source[pos]; - pos++; - } - } - return temp; - } - - dstring matchForward(size_t start, dstring[] list) - { - foreach(v; list) - { - dstring forward = getForward(start, v.length); - if (forward == v) - return forward; - } - return ""; - } - - dstring getForward(size_t position, size_t num) - { - if (position + num < source.length) - return source[position..position+num]; - else - return source[position..$]; - } -} diff --git a/dlib/xml/node.d b/dlib/xml/node.d index 5fd9d0cd..3cff1a68 100644 --- a/dlib/xml/node.d +++ b/dlib/xml/node.d @@ -1,5 +1,5 @@ /* -Copyright (c) 2013 Timur Gafarov +Copyright (c) 2015 Timur Gafarov Boost Software License - Version 1.0 - August 17th, 2003 @@ -28,21 +28,22 @@ DEALINGS IN THE SOFTWARE. 
module dlib.xml.node; -private -{ - import std.stdio; -} +import std.stdio; +import std.conv; +import dlib.core.memory; +import dlib.container.array; +import dlib.container.dict; +import dlib.text.lexer; -final class XmlNode +class XmlNode { XmlNode parent; - XmlNode[] children; - dstring text = ""d; - - dstring name; - dstring[dstring] properties; + DynamicArray!XmlNode children; + dchar[] name; + dchar[] text; + Dict!(dchar[], dchar[]) properties; - this(dstring name, XmlNode parent = null) + this(dchar[] name, XmlNode parent = null) { this.name = name; this.parent = parent; @@ -50,41 +51,72 @@ final class XmlNode { parent.addChild(this); } + this.properties = New!(Dict!(dchar[], dchar[])); } - - XmlNode[] getChildByName(dstring name) + + ~this() { - XmlNode[] res; - foreach(n; children) + if (text.length) + Delete(text); + if (name.length) + Delete(name); + foreach(k, v; properties) + { + Delete(k); + Delete(v); + } + Delete(properties); + foreach(c; children) { - if (n.name == name) - res ~= n; + Delete(c); } - return res; + children.free(); } void addChild(XmlNode node) { - children ~= node; + children.append(node); } - - void addProperty(dstring name, dstring value) + + void appendText(dchar c) { - properties[name] = value; + dchar[] newText = New!(dchar[])(text.length+1); + foreach(i, v; text) + newText[i] = v; + newText[$-1] = c; + if (text.length) + Delete(text); + text = newText; } - - dstring getText() + + dchar[] getTextUnmanaged() { - dstring res = text; + DynamicArray!dchar res; + res.append(text); foreach(n; children) { - dstring txt = n.getText(); - res ~= txt; + dchar[] t = n.getTextUnmanaged(); + if (t.length) + { + res.append(t); + Delete(t); + } } - return res; + dchar[] output = copyBuffer(res.data); + res.free(); + return output; + } + + // Warning! Causes GC allocation! 
+ string getText() + { + dchar[] t = getTextUnmanaged(); + string s = to!string(t); + Delete(t); + return s; } - private void printProperties(dstring indent = "") + void printProperties(dstring indent = "") { if (properties.length) { @@ -92,22 +124,26 @@ final class XmlNode writeln(indent, k, " = ", v); } } - + + // Warning! Causes GC allocation! void print(dstring indent = "") { printProperties(indent); foreach(n; children) { - dstring nm = n.name; + auto nm = n.name; if (nm.length) writeln(indent, "tag: ", nm); else writeln(indent, "tag: "); - dstring txt = n.getText; + dchar[] txt = n.getTextUnmanaged(); if (txt.length) - writeln(indent, "text: ", n.getText); + { + writeln(indent, "text: ", txt); + Delete(txt); + } n.print(indent ~ " "); }