// zenXML
// Straightforward C++ XML Processing
00001 // ************************************************************************** 00002 // * This file is part of the zenXML project. It is distributed under the * 00003 // * Boost Software License, Version 1.0. See accompanying file * 00004 // * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt. * 00005 // * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de) * 00006 // ************************************************************************** 00007 00008 #ifndef ZEN_XML_PARSER_HEADER_81248670213764583021432 00009 #define ZEN_XML_PARSER_HEADER_81248670213764583021432 00010 00011 #include <cstdio> 00012 #include "string_tools.h" 00013 #include "zenxml_dom.h" 00014 #include "zenxml_error.h" 00015 00016 namespace zen 00017 { 00023 00024 00030 std::string serialize(const XmlDoc& doc, 00031 const std::string& lineBreak = "\r\n", 00032 const std::string& indent = " "); //throw () 00033 00035 struct XmlParsingError : public XmlError 00036 { 00037 XmlParsingError(size_t rowNo, size_t colNo) : row(rowNo), col(colNo) {} 00039 size_t row; 00041 size_t col; 00042 }; 00043 00044 00046 00051 void parse(const std::string& stream, XmlDoc& doc); //throw XmlParsingError 00052 00053 00054 00055 00056 00057 00058 00059 00060 00061 00062 00063 00064 00065 00066 00067 00068 00069 00070 00071 00072 00073 00074 00075 00076 00077 00078 00079 00080 00081 00082 //---------------------------- implementation ---------------------------- 00083 namespace implemenation 00084 { 00085 template <class Predicate> 00086 std::string normalize(const std::string& str, Predicate pred) //pred: unary function taking a unsigned char, return true if value shall be encoded as hex 00087 { 00088 std::string output; 00089 std::for_each(str.begin(), str.end(), 00090 [&](unsigned char c) 00091 { 00092 if (c == '&') 00093 output += "&"; 00094 else if (c == '\'') 00095 output += "'"; 00096 else if (c == '<') 00097 output += "<"; 00098 else if (c == '>') 00099 output += ">"; 00100 else if (c == '\"') 00101 
output += """; 00102 else if (pred(c)) 00103 { 00104 char buffer[20]; 00105 int charsWritten = std::sprintf(buffer, "&#x%02X;", c); 00106 if (charsWritten > 0) 00107 output += std::string(buffer, charsWritten); 00108 } 00109 else 00110 output += c; 00111 }); 00112 00113 return output; 00114 } 00115 00116 inline 00117 std::string normalizeName(const std::string& str) 00118 { 00119 return normalize(str, [](unsigned char ch) { return cStringIsWhiteSpace(ch) || ch == '=' || ch == '/'; }); 00120 } 00121 00122 inline 00123 std::string normalizeValue(const std::string& str) 00124 { 00125 return normalize(str, [](unsigned char ch) { return ch < 32; }); 00126 } 00127 00128 00129 namespace 00130 { 00131 std::string denormalize(const std::string& str) 00132 { 00133 std::string output; 00134 for (auto iter = str.begin(); iter != str.end(); ++iter) 00135 { 00136 const char c = *iter; 00137 00138 if (c == '&') 00139 { 00140 auto checkEntity = [&](const char* placeholder, char realVal) -> bool 00141 { 00142 size_t strLen = cStringLength(placeholder); 00143 00144 if (str.end() - iter >= static_cast<int>(strLen) && std::equal(iter, iter + strLen, placeholder)) 00145 { 00146 output += realVal; 00147 iter += strLen - 1; 00148 return true; 00149 } 00150 return false; 00151 }; 00152 00153 if (checkEntity("&", '&')) 00154 continue; 00155 if (checkEntity("'", '\'')) 00156 continue; 00157 if (checkEntity("<", '<')) 00158 continue; 00159 if (checkEntity(">", '>')) 00160 continue; 00161 if (checkEntity(""", '\"')) 00162 continue; 00163 00164 if (str.end() - iter >= 6 && 00165 iter[1] == '#' && 00166 iter[2] == 'x' && 00167 iter[5] == ';') 00168 { 00169 int tmp = 0; 00170 if (::sscanf(&iter[3], "%02X", &tmp) > 0) 00171 { 00172 output += static_cast<unsigned char>(tmp); 00173 iter += 5; 00174 continue; 00175 } 00176 } 00177 00178 //unexpected char! 
00179 output += c; 00180 } 00181 else 00182 output += c; 00183 }; 00184 00185 return output; 00186 } 00187 00188 00189 void serialize(const XmlElement& element, std::string& stream, 00190 const std::string& lineBreak, 00191 const std::string& indent, 00192 size_t indentLevel) 00193 { 00194 const std::string nameFmt = normalizeName(element.getNameAs<std::string>()); 00195 00196 for (size_t i = 0; i < indentLevel; ++i) 00197 stream += indent; 00198 00199 stream += '<' + nameFmt; 00200 00201 auto attr = element.getAttributes(); 00202 for (auto iter = attr.first; iter != attr.second; ++iter) 00203 stream += ' ' + normalizeName(iter->first) + "=\"" + normalizeValue(iter->second) + "\""; 00204 00205 //no support for mixed-mode content 00206 auto iterPair = element.getChildren(); 00207 if (iterPair.first != iterPair.second) //structured element 00208 { 00209 stream += '>' + lineBreak; 00210 00211 std::for_each(iterPair.first, iterPair.second, 00212 [&](const XmlElement& el) { serialize(el, stream, lineBreak, indent, indentLevel + 1); }); 00213 00214 for (size_t i = 0; i < indentLevel; ++i) 00215 stream += indent; 00216 stream += "</" + nameFmt + '>' + lineBreak; 00217 } 00218 else 00219 { 00220 std::string value; 00221 element.getValue(value); 00222 00223 if (!value.empty()) //value element 00224 stream += '>' + normalizeValue(value) + "</" + nameFmt + '>' + lineBreak; 00225 else //empty element 00226 stream += "/>" + lineBreak; 00227 } 00228 } 00229 00230 std::string serialize(const XmlDoc& doc, 00231 const std::string& lineBreak, 00232 const std::string& indent) 00233 { 00234 std::string version = doc.getVersionAs<std::string>(); 00235 if (!version.empty()) 00236 version = " version=\"" + normalizeValue(version) + "\""; 00237 00238 std::string encoding = doc.getEncodingAs<std::string>(); 00239 if (!encoding.empty()) 00240 encoding = " encoding=\"" + normalizeValue(encoding) + "\""; 00241 00242 std::string standalone = doc.getStandaloneAs<std::string>(); 00243 if 
(!standalone.empty()) 00244 standalone = " standalone=\"" + normalizeValue(standalone) + "\""; 00245 00246 std::string output = "<?xml" + version + encoding + standalone + "?>" + lineBreak; 00247 serialize(doc.root(), output, lineBreak, indent, 0); 00248 return output; 00249 } 00250 } 00251 } 00252 00253 inline 00254 std::string serialize(const XmlDoc& doc, 00255 const std::string& lineBreak, 00256 const std::string& indent) { return implemenation::serialize(doc, lineBreak, indent); } 00257 00258 /* 00259 Grammar for XML parser 00260 ------------------------------- 00261 document-expression: 00262 <?xml version="1.0" encoding="UTF-8" standalone="yes"?> 00263 root-expression: 00264 00265 root-expression: 00266 element-expression: 00267 00268 element-list-expression: 00269 element-expression 00270 element-expression element-list-expression 00271 00272 element-expression: 00273 <string attribute-expression/> 00274 <string attribute-expression> pm-expression </string> 00275 00276 attribute-expression: 00277 string="string" 00278 string="string" attribute-expression 00279 00280 pm-expression: 00281 string 00282 element-list-expression 00283 */ 00284 00285 class XmlParser 00286 { 00287 public: 00288 XmlParser(const std::string& stream) : 00289 scn(stream), 00290 tk(scn.nextToken()) {} 00291 00292 void parse(XmlDoc& doc) //throw XmlParsingError 00293 { 00294 //declaration (optional) 00295 if (token().type == Token::TK_DECL_BEGIN) 00296 { 00297 nextToken(); 00298 00299 while (token().type == Token::TK_TEXT) 00300 { 00301 std::string attribName = token().text; 00302 nextToken(); 00303 00304 consumeToken(Token::TK_EQUAL); 00305 00306 if (token().type != Token::TK_QUOTE) 00307 throw XmlParsingError(scn.position().first, scn.position().second); 00308 std::string attribValue = scn.extractRawText(); 00309 nextToken(); 00310 00311 consumeToken(Token::TK_QUOTE); 00312 00313 if (attribName == "version") 00314 doc.setVersion(attribValue); 00315 else if (attribName == "encoding") 
00316 doc.setEncoding(attribValue); 00317 else if (attribName == "standalone") 00318 doc.setStandalone(attribValue); 00319 } 00320 00321 consumeToken(Token::TK_DECL_END); 00322 } 00323 00324 XmlDoc dummy; 00325 parseChildElements(dummy.root()); 00326 00327 auto iterPair = dummy.root().getChildren(); 00328 if (iterPair.first != iterPair.second) 00329 doc.root().swap(*iterPair.first); 00330 00331 consumeToken(Token::TK_END); 00332 }; 00333 00334 private: 00335 XmlParser(const XmlParser&); 00336 XmlParser& operator=(const XmlParser&); 00337 00338 struct Token 00339 { 00340 enum Type 00341 { 00342 TK_LESS, 00343 TK_GREATER, 00344 TK_LESS_SLASH, 00345 TK_SLASH_GREATER, 00346 TK_EQUAL, 00347 TK_QUOTE, 00348 TK_DECL_BEGIN, 00349 TK_DECL_END, 00350 TK_TEXT, 00351 TK_END 00352 }; 00353 00354 Token(Type t) : type(t) {} 00355 00356 Type type; 00357 std::string text; //if type == TK_TEXT 00358 }; 00359 00360 class Scanner 00361 { 00362 public: 00363 Scanner(const std::string& phrase) : stream(phrase), pos(stream.begin()) 00364 { 00365 if (zen::startsWith(phrase, BYTE_ORDER_MARK_UTF8)) 00366 pos += 3; 00367 00368 tokens.push_back(std::make_pair("<?xml", Token::TK_DECL_BEGIN)); 00369 tokens.push_back(std::make_pair("?>", Token::TK_DECL_END)); 00370 tokens.push_back(std::make_pair("</", Token::TK_LESS_SLASH)); 00371 tokens.push_back(std::make_pair("/>", Token::TK_SLASH_GREATER)); 00372 tokens.push_back(std::make_pair("<" , Token::TK_LESS)); 00373 tokens.push_back(std::make_pair(">" , Token::TK_GREATER)); 00374 tokens.push_back(std::make_pair("=" , Token::TK_EQUAL)); 00375 tokens.push_back(std::make_pair("\"", Token::TK_QUOTE)); 00376 tokens.push_back(std::make_pair("\'", Token::TK_QUOTE)); 00377 } 00378 00379 Token nextToken() 00380 { 00381 //skip whitespace 00382 pos = std::find_if(pos, stream.end(), [](char c) { return !cStringIsWhiteSpace(c); }); 00383 00384 if (pos == stream.end()) 00385 return Token(Token::TK_END); 00386 00387 for (TokenList::const_iterator i = 
tokens.begin(); i != tokens.end(); ++i) 00388 if (startsWith(pos, i->first)) 00389 { 00390 pos += i->first.size(); 00391 return Token(i->second); 00392 } 00393 00394 auto textEnd = std::find_if(pos, stream.end(), [](char c) 00395 { 00396 return c == '<' || 00397 c == '>' || 00398 c == '\'' || 00399 c == '\"' || 00400 c == '\"' || 00401 c == '=' || 00402 c == '/' || 00403 cStringIsWhiteSpace(c); 00404 }); 00405 00406 ptrdiff_t letterCount = textEnd - pos; 00407 if (letterCount != 0) 00408 { 00409 Token out(Token::TK_TEXT); 00410 out.text = implemenation::denormalize(std::string(&*pos, letterCount)); 00411 pos = textEnd; 00412 return out; 00413 } 00414 00415 throw XmlParsingError(position().first, position().second); //cannot arrive here 00416 } 00417 00418 std::string extractRawText() 00419 { 00420 auto iter = std::find_if(pos, stream.end(), [](char c) 00421 { 00422 return c == '<' || 00423 c == '>' || 00424 c == '\'' || 00425 c == '\"'; 00426 }); 00427 std::string output(&*pos, iter - pos); 00428 pos = iter; 00429 return implemenation::denormalize(output); 00430 } 00431 00432 std::pair<size_t, size_t> position() const //current (row/col) beginning with 1 00433 { 00434 //seek last line break 00435 std::string::const_iterator iter = pos; 00436 while (iter != stream.begin() && *iter != '\n') 00437 --iter; 00438 00439 return std::make_pair(std::count(stream.begin(), pos, '\n') + 1, pos - iter); 00440 } 00441 00442 private: 00443 Scanner(const Scanner&); 00444 Scanner& operator=(const Scanner&); 00445 00446 bool startsWith(std::string::const_iterator iter, const std::string& prefix) const 00447 { 00448 if (stream.end() - iter < static_cast<int>(prefix.size())) 00449 return false; 00450 return std::equal(prefix.begin(), prefix.end(), iter); 00451 } 00452 00453 typedef std::vector<std::pair<std::string, Token::Type> > TokenList; 00454 TokenList tokens; 00455 00456 const std::string stream; 00457 std::string::const_iterator pos; 00458 }; 00459 00460 void 
parseChildElements(XmlElement& parent) 00461 { 00462 while (token().type == Token::TK_LESS) 00463 { 00464 consumeToken(Token::TK_LESS); 00465 00466 if (token().type != Token::TK_TEXT) 00467 throw XmlParsingError(scn.position().first, scn.position().second); 00468 std::string elementName = token().text; 00469 nextToken(); 00470 00471 XmlElement& newElement = parent.addChild(elementName); 00472 00473 parseAttributes(newElement); 00474 00475 if (token().type == Token::TK_SLASH_GREATER) //empty element 00476 { 00477 nextToken(); 00478 continue; 00479 } 00480 00481 if (token().type != Token::TK_GREATER) 00482 throw XmlParsingError(scn.position().first, scn.position().second); 00483 std::string elementValue = scn.extractRawText(); 00484 nextToken(); 00485 00486 //no support for mixed-mode content 00487 if (token().type == Token::TK_LESS) //structured element 00488 parseChildElements(newElement); 00489 else //value element 00490 newElement.setValue(elementValue); 00491 00492 00493 consumeToken(Token::TK_LESS_SLASH); 00494 00495 if (token().type != Token::TK_TEXT || 00496 elementName != token().text) 00497 throw XmlParsingError(scn.position().first, scn.position().second); 00498 nextToken(); 00499 00500 consumeToken(Token::TK_GREATER); 00501 } 00502 }; 00503 00504 void parseAttributes(XmlElement& element) 00505 { 00506 while (token().type == Token::TK_TEXT) 00507 { 00508 std::string attribName = token().text; 00509 nextToken(); 00510 00511 consumeToken(Token::TK_EQUAL); 00512 00513 if (token().type != Token::TK_QUOTE) 00514 throw XmlParsingError(scn.position().first, scn.position().second); 00515 std::string attribValue = scn.extractRawText(); 00516 nextToken(); 00517 00518 consumeToken(Token::TK_QUOTE); 00519 00520 element.setAttribute(attribName, attribValue); 00521 } 00522 } 00523 00524 00525 const Token& token() const { return tk; } 00526 void nextToken() { tk = scn.nextToken(); } 00527 void consumeToken(Token::Type t) 00528 { 00529 if (token().type != t) 00530 throw 
XmlParsingError(scn.position().first, scn.position().second); 00531 nextToken(); 00532 } 00533 00534 Scanner scn; 00535 Token tk; 00536 }; 00537 00538 00539 inline 00540 void parse(const std::string& stream, XmlDoc& doc) //throw XmlParsingError 00541 { 00542 XmlParser parser(stream); 00543 parser.parse(doc); //throw XmlParsingError 00544 } 00545 } 00546 00547 #endif //ZEN_XML_PARSER_HEADER_81248670213764583021432