zenXML
Straightforward C++ XML Processing
|
00001 // ************************************************************************** 00002 // * This file is part of the zenXML project. It is distributed under the * 00003 // * Boost Software License, Version 1.0. See accompanying file * 00004 // * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt. * 00005 // * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de) * 00006 // ************************************************************************** 00007 00008 #ifndef STRING_UTF8_HEADER_01832479146991573473545 00009 #define STRING_UTF8_HEADER_01832479146991573473545 00010 00011 #include <iterator> 00012 #include "loki/TypeManip.h" 00013 #include "string_tools.h" 00014 #include "assert_static.h" 00015 00016 namespace zen 00017 { 00018 //Example: std::string tmp = toUtf8<std::string>(L"abc"); 00019 template <class CharString, class WideString> 00020 CharString wideToUtf8(const WideString& str); 00021 00022 //Example: std::wstring tmp = utf8To<std::wstring>("abc"); 00023 template <class WideString, class CharString> 00024 WideString utf8ToWide(const CharString& str); 00025 00026 const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF"; 00027 00028 //convert any(!) "string-like" object into a UTF8 encoded std::string 00029 template <class String> std::string toStdString(const String& str); 00030 //convert a UTF8 encoded std::string to any(!) string-class 00031 template <class String> String stdStringTo(const std::string& str); 00032 00033 00034 00035 00036 00037 00038 00039 00040 00041 00042 00043 00044 00045 00046 00047 00048 00049 00050 00051 00052 00053 00054 00055 00056 00057 00058 00059 00060 00061 00062 //----------------------- implementation ---------------------------------- 00063 namespace implementation 00064 { 00065 typedef unsigned int CodePoint; 00066 00067 const CodePoint CODE_POINT_MAX = 0x10ffff; 00068 00069 const CodePoint HIGH_SURROGATE = 0xd800; 00070 const CodePoint HIGH_SURROGATE_MAX = 0xdbff; 00071 00072 const CodePoint LOW_SURROGATE = 0xdc00; 00073 const CodePoint LOW_SURROGATE_MAX = 0xdfff; 00074 00075 00076 template <class OutputIterator> inline 00077 OutputIterator codePointToUtf16(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-16 00078 { 00079 typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type 00080 00081 assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16 00082 assert(cp <= CODE_POINT_MAX); 00083 00084 if (cp < 0x10000) 00085 *result++ = static_cast<Char16>(cp); 00086 else 00087 { 00088 cp -= 0x10000; 00089 *result++ = static_cast<Char16>((cp >> 10) + HIGH_SURROGATE); 00090 *result++ = static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE); 00091 } 00092 return result; 00093 } 00094 00095 00096 template <class CharIterator, class Function> inline 00097 Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter 00098 { 00099 assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2); 00100 typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type 00101 00102 for ( ; first != last; ++first) 00103 { 00104 CodePoint cp = static_cast<Char16>(*first); 00105 if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX) 00106 { 00107 if (++first == last) 00108 { 00109 assert(false); //low surrogate expected 00110 break; 00111 } 00112 assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected 00113 cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000; 00114 } 00115 else 00116 assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected 00117 00118 f(cp); 00119 } 00120 return f; 00121 } 00122 00123 00124 template <class OutputIterator> inline 00125 OutputIterator codePointToUtf8(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-8 00126 { 00127 typedef unsigned char Char8; 00128 00129 assert(cp <= CODE_POINT_MAX); 00130 00131 if (cp < 0x80) 00132 *result++ = static_cast<Char8>(cp); 00133 else if (cp < 0x800) 00134 { 00135 *result++ = static_cast<Char8>((cp >> 6 )| 0xc0); 00136 *result++ = static_cast<Char8>((cp & 0x3f )| 0x80); 00137 } 00138 else if (cp < 0x10000) 00139 { 00140 *result++ = static_cast<Char8>((cp >> 12 )| 0xe0); 00141 *result++ = static_cast<Char8>(((cp >> 6) & 0x3f )| 0x80); 00142 *result++ = static_cast<Char8>((cp & 0x3f )| 0x80); 00143 } 00144 else 00145 { 00146 *result++ = static_cast<Char8>((cp >> 18 )| 0xf0); 00147 *result++ = static_cast<Char8>(((cp >> 12) & 0x3f )| 0x80); 00148 *result++ = static_cast<Char8>(((cp >> 6) & 0x3f )| 0x80); 00149 *result++ = static_cast<Char8>((cp & 0x3f )| 0x80); 00150 } 00151 return result; 00152 } 00153 00154 00155 inline 00156 size_t getUtf8Len(unsigned char ch) 00157 { 00158 if (ch < 0x80) 00159 return 1; 00160 if (ch >> 5 == 0x6) 00161 return 2; 00162 if (ch >> 4 == 0xe) 00163 return 3; 00164 if (ch >> 3 == 0x1e) 00165 return 4; 00166 00167 assert(false); //no valid begin of UTF8 encoding 00168 return 1; 00169 } 00170 00171 00172 template <class CharIterator, class Function> inline 00173 Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter 00174 { 00175 assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1); 00176 typedef unsigned char Char8; 00177 00178 for ( ; first != last; ++first) 00179 { 00180 auto getChar = [&](Char8& ch ) -> bool 00181 { 00182 if (++first == last) 00183 { 00184 assert(false); //low surrogate expected 00185 return false; 00186 } 00187 ch = static_cast<Char8>(*first); 00188 assert(ch >> 6 == 0x2); 00189 return true; 00190 }; 00191 00192 CodePoint cp = static_cast<Char8>(*first); 00193 switch (getUtf8Len(cp)) 00194 { 00195 case 1: 00196 break; 00197 case 2: 00198 { 00199 cp = (cp & 0x1f) << 6; 00200 Char8 ch; 00201 if (!getChar(ch)) continue; 00202 cp += ch & 0x3f; 00203 } 00204 break; 00205 case 3: 00206 { 00207 cp = (cp & 0xf) << 12; 00208 Char8 ch; 00209 if (!getChar(ch)) continue; 00210 cp += (ch & 0x3f) << 6; 00211 if (!getChar(ch)) continue; 00212 cp += ch & 0x3f; 00213 00214 } 00215 break; 00216 case 4: 00217 { 00218 cp = (cp & 0x7) << 18; 00219 Char8 ch; 00220 if (!getChar(ch)) continue; 00221 cp += (ch & 0x3f) << 12; 00222 if (!getChar(ch)) continue; 00223 cp += (ch & 0x3f) << 6; 00224 if (!getChar(ch)) continue; 00225 cp += ch & 0x3f; 00226 } 00227 break; 00228 default: 00229 assert(false); 00230 } 00231 f(cp); 00232 } 00233 return f; 00234 } 00235 00236 00237 template <class String> 00238 class AppendStringIterator: public std::iterator<std::output_iterator_tag, void, void, void, void> 00239 { 00240 public: 00241 explicit AppendStringIterator (String& x) : str(x) {} 00242 AppendStringIterator& operator= (typename String::value_type value) { str += value; return *this; } 00243 AppendStringIterator& operator* () { return *this; } 00244 AppendStringIterator& operator++ () { return *this; } 00245 AppendStringIterator operator++ (int) { return *this; } 00246 private: 00247 String& str; 00248 }; 00249 00250 00251 template <class WideString, class CharString> inline 00252 WideString utf8ToWide(const CharString& str, Loki::Int2Type<2>) //windows: convert utf8 to utf16 wchar_t 00253 { 00254 WideString output; 00255 utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), 00256 [&](CodePoint cp) { codePointToUtf16(cp, AppendStringIterator<WideString>(output)); }); 00257 return output; 00258 } 00259 00260 00261 template <class WideString, class CharString> inline 00262 WideString utf8ToWide(const CharString& str, Loki::Int2Type<4>) //other OS: convert utf8 to utf32 wchar_t 00263 { 00264 WideString output; 00265 utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str), 00266 [&](CodePoint cp) { output += cp; }); 00267 return output; 00268 } 00269 00270 00271 template <class CharString, class WideString> inline 00272 CharString wideToUtf8(const WideString& str, Loki::Int2Type<2>) //windows: convert utf16-wchar_t to utf8 00273 { 00274 CharString output; 00275 utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str), 00276 [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); }); 00277 return output; 00278 } 00279 00280 00281 template <class CharString, class WideString> inline 00282 CharString wideToUtf8(const WideString& str, Loki::Int2Type<4>) //other OS: convert utf32-wchar_t to utf8 00283 { 00284 CharString output; 00285 std::for_each(strBegin(str), strBegin(str) + strLength(str), 00286 [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); }); 00287 return output; 00288 } 00289 } 00290 00291 00292 template <class WideString, class CharString> inline 00293 WideString utf8ToWide(const CharString& str) 00294 { 00295 assert_static((Loki::IsSameType<typename StringTraits<CharString>::CharType, char >::value)); 00296 assert_static((Loki::IsSameType<typename StringTraits<WideString>::CharType, wchar_t>::value)); 00297 00298 return implementation::utf8ToWide<WideString>(str, Loki::Int2Type<sizeof(wchar_t)>()); 00299 } 00300 00301 00302 template <class CharString, class WideString> inline 00303 CharString wideToUtf8(const WideString& str) 00304 { 00305 assert_static((Loki::IsSameType<typename StringTraits<CharString>::CharType, char >::value)); 00306 assert_static((Loki::IsSameType<typename StringTraits<WideString>::CharType, wchar_t>::value)); 00307 00308 return implementation::wideToUtf8<CharString>(str, Loki::Int2Type<sizeof(wchar_t)>()); 00309 } 00310 00311 00312 //------------------------------------------------------------------------------------------- 00313 template <class String> inline 00314 std::string toStdString(const String& str, wchar_t) { return wideToUtf8<std::string>(str); } //convert wide character string to UTF8 00315 00316 template <class String> inline 00317 std::string toStdString(const String& str, char) { return cvrtString<std::string>(str); } //directly process string without UTF8 conversion 00318 00319 template <class String> inline 00320 std::string toStdString(const String& str) { return toStdString(str, typename StringTraits<String>::CharType()); } 00321 //------------------------------------------------------------------------------------------- 00322 00323 template <class String> inline 00324 String stdStringTo(const std::string& str, wchar_t) { return utf8ToWide<String>(str); } //convert UTF8 to wide character string 00325 00326 template <class String> inline 00327 String stdStringTo(const std::string& str, char) { return cvrtString<String>(str); } //directly process string without UTF8 conversion 00328 00329 template <class String> inline 00330 String stdStringTo(const std::string& str) { return stdStringTo<String>(str, typename StringTraits<String>::CharType()); } 00331 } 00332 00333 #endif //STRING_UTF8_HEADER_01832479146991573473545