zenXML
Straightforward C++ XML Processing
zenxml_parser.h
00001 // **************************************************************************
00002 // * This file is part of the zenXML project. It is distributed under the   *
00003 // * Boost Software License, Version 1.0. See accompanying file             *
00004 // * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt.       *
00005 // * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de)                         *
00006 // **************************************************************************
00007 
00008 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432
00009 #define ZEN_XML_PARSER_HEADER_81248670213764583021432
00010 
00011 #include <cstdio>
00012 #include "string_tools.h"
00013 #include "zenxml_dom.h"
00014 #include "zenxml_error.h"
00015 
00016 namespace zen
00017 {
00023 
00024 
// Convert an XML document into its textual representation.
//
// doc:       document to serialize (XML declaration attributes plus the element tree below its root)
// lineBreak: string appended after the XML declaration and after each element line; defaults to "\r\n"
// indent:    string repeated once per nesting level in front of each element; defaults to four spaces
// returns:   the complete XML text; names and values are written with '&', '<', '>', quotes and
//            control characters escaped
00030 std::string serialize(const XmlDoc& doc,
00031                       const std::string& lineBreak = "\r\n",
00032                       const std::string& indent = "    "); //throw ()
00033 
00035 struct XmlParsingError : public XmlError
00036 {
00037     XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {}
00039     size_t row;
00041     size_t col;
00042 };
00043 
00044 
00046 
// Parse XML text into a document object.
//
// stream: the complete XML input; a leading UTF-8 byte order mark is skipped
// doc:    receives the declaration attributes (version/encoding/standalone) and the parsed root element
// throws: XmlParsingError carrying the row/column at which the input was rejected
00051 void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError
00052 
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 
00063 
00064 
00065 
00066 
00067 
00068 
00069 
00070 
00071 
00072 
00073 
00074 
00075 
00076 
00077 
00078 
00079 
00080 
00081 
00082 //---------------------------- implementation ----------------------------
00083 namespace implemenation
00084 {
// Escape a string for inclusion in XML text.
//
// The five XML special characters & ' < > " are replaced by their predefined entities.
// Every other byte for which pred returns true is written as a two-digit upper-case
// hexadecimal character reference "&#xHH;". All remaining bytes are copied unchanged.
//
// str:  raw input
// pred: unary function taking an unsigned char; returns true if the value shall be encoded as hex
// returns: the escaped string
template <class Predicate>
std::string normalize(const std::string& str, Predicate pred)
{
    //table lookup instead of sprintf: cannot fail, is locale-independent and has no format/argument type mismatch
    static const char hexDigits[] = "0123456789ABCDEF";

    std::string output;
    output.reserve(str.size()); //lower bound: escaping can only make the result longer

    for (const char rawChar : str)
    {
        const unsigned char c = static_cast<unsigned char>(rawChar);

        switch (c)
        {
            case '&':
                output += "&amp;";
                break;
            case '\'':
                output += "&apos;";
                break;
            case '<':
                output += "&lt;";
                break;
            case '>':
                output += "&gt;";
                break;
            case '\"':
                output += "&quot;";
                break;
            default:
                if (pred(c))
                {
                    output += "&#x";
                    output += hexDigits[(c >> 4) & 0x0F];
                    output += hexDigits[c & 0x0F];
                    output += ';';
                }
                else
                    output += rawChar;
                break;
        }
    }

    return output;
}
00115 
00116 inline
00117 std::string normalizeName(const std::string& str)
00118 {
00119     return normalize(str, [](unsigned char ch) { return cStringIsWhiteSpace(ch) || ch == '=' || ch == '/'; });
00120 }
00121 
00122 inline
00123 std::string normalizeValue(const std::string& str)
00124 {
00125     return normalize(str, [](unsigned char ch) { return ch < 32; });
00126 }
00127 
00128 
00129 namespace
00130 {
// Reverse the escaping applied by normalize().
//
// Recognized sequences:
//   - the predefined entities &amp; &apos; &lt; &gt; &quot;
//   - character references of the exact form "&#xHH;" with two hexadecimal digits (either case)
// Any other '&' is copied to the output unchanged, together with the text following it.
//
// str: escaped input
// returns: the unescaped string
std::string denormalize(const std::string& str)
{
    struct Entity
    {
        const char* text;
        char value;
    };
    static const Entity entities[] =
    {
        { "&amp;",  '&'  },
        { "&apos;", '\'' },
        { "&lt;",   '<'  },
        { "&gt;",   '>'  },
        { "&quot;", '\"' },
    };

    //numeric value of a single hexadecimal digit, -1 if ch is not a hex digit
    auto hexValue = [](char ch) -> int
    {
        if ('0' <= ch && ch <= '9')
            return ch - '0';
        if ('a' <= ch && ch <= 'f')
            return ch - 'a' + 10;
        if ('A' <= ch && ch <= 'F')
            return ch - 'A' + 10;
        return -1;
    };

    const size_t length = str.size();

    std::string output;
    output.reserve(length); //unescaping never makes the result longer

    for (size_t i = 0; i < length; ++i)
    {
        const char c = str[i];

        if (c != '&')
        {
            output += c;
            continue;
        }

        bool entityFound = false;
        for (const Entity& entity : entities)
        {
            const size_t entityLen = std::char_traits<char>::length(entity.text);

            //compare() clips the substring at the end of str, so a truncated entity never matches
            if (str.compare(i, entityLen, entity.text) == 0)
            {
                output += entity.value;
                i += entityLen - 1; //the loop increment skips the last character of the entity
                entityFound = true;
                break;
            }
        }
        if (entityFound)
            continue;

        if (length - i >= 6 &&
            str[i + 1] == '#' &&
            str[i + 2] == 'x' &&
            str[i + 5] == ';')
        {
            //parse the digits by hand: sscanf("%02X") skips whitespace, accepts a sign and stops
            //at the first non-hex character, so malformed input like "&#x1G;" would be decoded
            const int high = hexValue(str[i + 3]);
            const int low  = hexValue(str[i + 4]);
            if (high >= 0 && low >= 0)
            {
                output += static_cast<char>(static_cast<unsigned char>(high * 16 + low));
                i += 5;
                continue;
            }
        }

        //'&' that does not start a known sequence: keep it literally
        output += c;
    }

    return output;
}
00187 
00188 
00189 void serialize(const XmlElement& element, std::string& stream,
00190                const std::string& lineBreak,
00191                const std::string& indent,
00192                size_t indentLevel)
00193 {
00194     const std::string nameFmt = normalizeName(element.getNameAs<std::string>());
00195 
00196     for (size_t i = 0; i < indentLevel; ++i)
00197         stream += indent;
00198 
00199     stream += '<' + nameFmt;
00200 
00201     auto attr = element.getAttributes();
00202     for (auto iter = attr.first; iter != attr.second; ++iter)
00203         stream += ' ' + normalizeName(iter->first) + "=\"" + normalizeValue(iter->second) + "\"";
00204 
00205     //no support for mixed-mode content
00206     auto iterPair = element.getChildren();
00207     if (iterPair.first != iterPair.second) //structured element
00208     {
00209         stream += '>' + lineBreak;
00210 
00211         std::for_each(iterPair.first, iterPair.second,
00212         [&](const XmlElement& el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); });
00213 
00214         for (size_t i = 0; i < indentLevel; ++i)
00215             stream += indent;
00216         stream += "</" + nameFmt + '>' + lineBreak;
00217     }
00218     else
00219     {
00220         std::string value;
00221         element.getValue(value);
00222 
00223         if (!value.empty()) //value element
00224             stream += '>' + normalizeValue(value) + "</" + nameFmt + '>' + lineBreak;
00225         else //empty element
00226             stream += "/>" + lineBreak;
00227     }
00228 }
00229 
00230 std::string serialize(const XmlDoc& doc,
00231                       const std::string& lineBreak,
00232                       const std::string& indent)
00233 {
00234     std::string version = doc.getVersionAs<std::string>();
00235     if (!version.empty())
00236         version = " version=\"" + normalizeValue(version) + "\"";
00237 
00238     std::string encoding = doc.getEncodingAs<std::string>();
00239     if (!encoding.empty())
00240         encoding = " encoding=\"" + normalizeValue(encoding) + "\"";
00241 
00242     std::string standalone = doc.getStandaloneAs<std::string>();
00243     if (!standalone.empty())
00244         standalone = " standalone=\"" + normalizeValue(standalone) + "\"";
00245 
00246     std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak;
00247     serialize(doc.root(), output, lineBreak, indent, 0);
00248     return output;
00249 }
00250 }
00251 }
00252 
00253 inline
00254 std::string serialize(const XmlDoc& doc,
00255                       const std::string& lineBreak,
00256                       const std::string& indent) { return implemenation::serialize(doc, lineBreak, indent); }
00257 
00258 /*
00259 Grammar for XML parser
00260 -------------------------------
00261 document-expression:
00262         <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
00263         root-expression
00264 
00265 root-expression:
00266         element-expression
00267 
00268 element-list-expression:
00269         element-expression
00270         element-expression element-list-expression
00271 
00272 element-expression:
00273         <string attribute-expression/>
00274         <string attribute-expression> pm-expression </string>
00275 
00276 attribute-expression:
00277         string="string"
00278         string="string" attribute-expression
00279 
00280 pm-expression:
00281         string
00282     element-list-expression
00283 */
00284 
00285 class XmlParser
00286 {
00287 public:
00288     XmlParser(const std::string& stream) :
00289         scn(stream),
00290         tk(scn.nextToken()) {}
00291 
00292     void parse(XmlDoc& doc) //throw XmlParsingError
00293     {
00294         //declaration (optional)
00295         if (token().type == Token::TK_DECL_BEGIN)
00296         {
00297             nextToken();
00298 
00299             while (token().type == Token::TK_TEXT)
00300             {
00301                 std::string attribName = token().text;
00302                 nextToken();
00303 
00304                 consumeToken(Token::TK_EQUAL);
00305 
00306                 if (token().type != Token::TK_QUOTE)
00307                     throw XmlParsingError(scn.position().first, scn.position().second);
00308                 std::string attribValue = scn.extractRawText();
00309                 nextToken();
00310 
00311                 consumeToken(Token::TK_QUOTE);
00312 
00313                 if (attribName == "version")
00314                     doc.setVersion(attribValue);
00315                 else if (attribName == "encoding")
00316                     doc.setEncoding(attribValue);
00317                 else if (attribName == "standalone")
00318                     doc.setStandalone(attribValue);
00319             }
00320 
00321             consumeToken(Token::TK_DECL_END);
00322         }
00323 
00324         XmlDoc dummy;
00325         parseChildElements(dummy.root());
00326 
00327         auto iterPair = dummy.root().getChildren();
00328         if (iterPair.first != iterPair.second)
00329             doc.root().swap(*iterPair.first);
00330 
00331         consumeToken(Token::TK_END);
00332     };
00333 
00334 private:
00335     XmlParser(const XmlParser&);
00336     XmlParser& operator=(const XmlParser&);
00337 
00338     struct Token
00339     {
00340         enum Type
00341         {
00342             TK_LESS,
00343             TK_GREATER,
00344             TK_LESS_SLASH,
00345             TK_SLASH_GREATER,
00346             TK_EQUAL,
00347             TK_QUOTE,
00348             TK_DECL_BEGIN,
00349             TK_DECL_END,
00350             TK_TEXT,
00351             TK_END
00352         };
00353 
00354         Token(Type t) : type(t) {}
00355 
00356         Type type;
00357         std::string text; //if type == TK_TEXT
00358     };
00359 
00360     class Scanner
00361     {
00362     public:
00363         Scanner(const std::string& phrase) : stream(phrase), pos(stream.begin())
00364         {
00365             if (zen::startsWith(phrase, BYTE_ORDER_MARK_UTF8))
00366                 pos += 3;
00367 
00368             tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN));
00369             tokens.push_back(std::make_pair("?>",    Token::TK_DECL_END));
00370             tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH));
00371             tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER));
00372             tokens.push_back(std::make_pair("<" , Token::TK_LESS));
00373             tokens.push_back(std::make_pair(">" , Token::TK_GREATER));
00374             tokens.push_back(std::make_pair("=" , Token::TK_EQUAL));
00375             tokens.push_back(std::make_pair("\"", Token::TK_QUOTE));
00376             tokens.push_back(std::make_pair("\'", Token::TK_QUOTE));
00377         }
00378 
00379         Token nextToken()
00380         {
00381             //skip whitespace
00382             pos = std::find_if(pos, stream.end(), [](char c) { return !cStringIsWhiteSpace(c); });
00383 
00384             if (pos == stream.end())
00385                 return Token(Token::TK_END);
00386 
00387             for (TokenList::const_iterator i = tokens.begin(); i != tokens.end(); ++i)
00388                 if (startsWith(pos, i->first))
00389                 {
00390                     pos += i->first.size();
00391                     return Token(i->second);
00392                 }
00393 
00394             auto textEnd = std::find_if(pos, stream.end(), [](char c)
00395             {
00396                 return c == '<'  ||
00397                        c == '>'  ||
00398                        c == '\'' ||
00399                        c == '\"' ||
00400                        c == '\"' ||
00401                        c == '='  ||
00402                        c == '/'  ||
00403                        cStringIsWhiteSpace(c);
00404             });
00405 
00406             ptrdiff_t letterCount = textEnd - pos;
00407             if (letterCount != 0)
00408             {
00409                 Token out(Token::TK_TEXT);
00410                 out.text = implemenation::denormalize(std::string(&*pos, letterCount));
00411                 pos = textEnd;
00412                 return out;
00413             }
00414 
00415             throw XmlParsingError(position().first, position().second); //cannot arrive here
00416         }
00417 
00418         std::string extractRawText()
00419         {
00420             auto iter = std::find_if(pos, stream.end(), [](char c)
00421             {
00422                 return c == '<'  ||
00423                        c == '>'  ||
00424                        c == '\'' ||
00425                        c == '\"';
00426             });
00427             std::string output(&*pos, iter - pos);
00428             pos = iter;
00429             return implemenation::denormalize(output);
00430         }
00431 
00432         std::pair<size_t, size_t> position() const //current (row/col) beginning with 1
00433         {
00434             //seek last line break
00435             std::string::const_iterator iter = pos;
00436             while (iter != stream.begin() && *iter != '\n')
00437                 --iter;
00438 
00439             return std::make_pair(std::count(stream.begin(), pos, '\n') + 1, pos - iter);
00440         }
00441 
00442     private:
00443         Scanner(const Scanner&);
00444         Scanner& operator=(const Scanner&);
00445 
00446         bool startsWith(std::string::const_iterator iter, const std::string& prefix) const
00447         {
00448             if (stream.end() - iter < static_cast<int>(prefix.size()))
00449                 return false;
00450             return std::equal(prefix.begin(), prefix.end(), iter);
00451         }
00452 
00453         typedef std::vector<std::pair<std::string, Token::Type> > TokenList;
00454         TokenList tokens;
00455 
00456         const std::string stream;
00457         std::string::const_iterator pos;
00458     };
00459 
00460     void parseChildElements(XmlElement& parent)
00461     {
00462         while (token().type == Token::TK_LESS)
00463         {
00464             consumeToken(Token::TK_LESS);
00465 
00466             if (token().type != Token::TK_TEXT)
00467                 throw XmlParsingError(scn.position().first, scn.position().second);
00468             std::string elementName = token().text;
00469             nextToken();
00470 
00471             XmlElement& newElement = parent.addChild(elementName);
00472 
00473             parseAttributes(newElement);
00474 
00475             if (token().type == Token::TK_SLASH_GREATER) //empty element
00476             {
00477                 nextToken();
00478                 continue;
00479             }
00480 
00481             if (token().type != Token::TK_GREATER)
00482                 throw XmlParsingError(scn.position().first, scn.position().second);
00483             std::string elementValue = scn.extractRawText();
00484             nextToken();
00485 
00486             //no support for mixed-mode content
00487             if (token().type == Token::TK_LESS) //structured element
00488                 parseChildElements(newElement);
00489             else //value element
00490                 newElement.setValue(elementValue);
00491 
00492 
00493             consumeToken(Token::TK_LESS_SLASH);
00494 
00495             if (token().type != Token::TK_TEXT ||
00496                 elementName != token().text)
00497                 throw XmlParsingError(scn.position().first, scn.position().second);
00498             nextToken();
00499 
00500             consumeToken(Token::TK_GREATER);
00501         }
00502     };
00503 
00504     void parseAttributes(XmlElement& element)
00505     {
00506         while (token().type == Token::TK_TEXT)
00507         {
00508             std::string attribName = token().text;
00509             nextToken();
00510 
00511             consumeToken(Token::TK_EQUAL);
00512 
00513             if (token().type != Token::TK_QUOTE)
00514                 throw XmlParsingError(scn.position().first, scn.position().second);
00515             std::string attribValue = scn.extractRawText();
00516             nextToken();
00517 
00518             consumeToken(Token::TK_QUOTE);
00519 
00520             element.setAttribute(attribName, attribValue);
00521         }
00522     }
00523 
00524 
00525     const Token& token() const { return tk; }
00526     void nextToken() { tk = scn.nextToken(); }
00527     void consumeToken(Token::Type t)
00528     {
00529         if (token().type != t)
00530             throw XmlParsingError(scn.position().first, scn.position().second);
00531         nextToken();
00532     }
00533 
00534     Scanner scn;
00535     Token tk;
00536 };
00537 
00538 
00539 inline
00540 void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError
00541 {
00542     XmlParser parser(stream);
00543     parser.parse(doc);  //throw XmlParsingError
00544 }
00545 }
00546 
00547 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432
 All Classes Namespaces Functions Variables