zenXML
Straightforward C++ XML Processing
string_utf8.h
00001 // **************************************************************************
00002 // * This file is part of the zenXML project. It is distributed under the   *
00003 // * Boost Software License, Version 1.0. See accompanying file             *
00004 // * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt.       *
00005 // * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de)                         *
00006 // **************************************************************************
00007 
00008 #ifndef STRING_UTF8_HEADER_01832479146991573473545
00009 #define STRING_UTF8_HEADER_01832479146991573473545
00010 
00011 #include <iterator>
00012 #include "loki/TypeManip.h"
00013 #include "string_tools.h"
00014 #include "assert_static.h"
00015 
00016 namespace zen
00017 {
//Convert a wchar_t-based string to a UTF-8 encoded char-based string.
//Whether the input is treated as UTF-16 or UTF-32 is selected via sizeof(wchar_t).
//Example: std::string tmp = wideToUtf8<std::string>(L"abc");
template <class CharString, class WideString>
CharString wideToUtf8(const WideString& str);

//Convert a UTF-8 encoded char-based string to a wchar_t-based string.
//Whether the output is UTF-16 or UTF-32 is selected via sizeof(wchar_t).
//Example: std::wstring tmp = utf8ToWide<std::wstring>("abc");
template <class WideString, class CharString>
WideString utf8ToWide(const CharString& str);

//the three bytes 0xEF 0xBB 0xBF (UTF-8 byte order mark), followed by the terminating null character
const char BYTE_ORDER_MARK_UTF8[] = "\xEF\xBB\xBF";

//convert any(!) "string-like" object into a UTF8 encoded std::string
//note: char-based input is passed through without conversion, only wchar_t-based input is re-encoded
template <class String> std::string toStdString(const String& str);
//convert a UTF8 encoded std::string to any(!) string-class
//note: char-based targets receive the bytes without conversion, only wchar_t-based targets are re-encoded
template <class String> String      stdStringTo(const std::string& str);
00032 
00033 
00034 
00035 
00036 
00037 
00038 
00039 
00040 
00041 
00042 
00043 
00044 
00045 
00046 
00047 
00048 
00049 
00050 
00051 
00052 
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 
00061 
00062 //----------------------- implementation ----------------------------------
00063 namespace implementation
00064 {
00065 typedef unsigned int CodePoint;
00066 
00067 const CodePoint CODE_POINT_MAX     = 0x10ffff;
00068 
00069 const CodePoint HIGH_SURROGATE     = 0xd800;
00070 const CodePoint HIGH_SURROGATE_MAX = 0xdbff;
00071 
00072 const CodePoint LOW_SURROGATE      = 0xdc00;
00073 const CodePoint LOW_SURROGATE_MAX  = 0xdfff;
00074 
00075 
00076 template <class OutputIterator> inline
00077 OutputIterator codePointToUtf16(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-16
00078 {
00079     typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type
00080 
00081     assert(cp < HIGH_SURROGATE || LOW_SURROGATE_MAX < cp); //code points [0xd800, 0xdfff] are not allowed for UTF-16
00082     assert(cp <= CODE_POINT_MAX);
00083 
00084     if (cp < 0x10000)
00085         *result++ = static_cast<Char16>(cp);
00086     else
00087     {
00088         cp -= 0x10000;
00089         *result++ = static_cast<Char16>((cp >> 10) + HIGH_SURROGATE);
00090         *result++ = static_cast<Char16>((cp & 0x3ff) + LOW_SURROGATE);
00091     }
00092     return result;
00093 }
00094 
00095 
00096 template <class CharIterator, class Function> inline
00097 Function utf16ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
00098 {
00099     assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 2);
00100     typedef unsigned short Char16; //this isn't necessarily 16 bit, but all we need is an unsigned type
00101 
00102     for ( ; first != last; ++first)
00103     {
00104         CodePoint cp = static_cast<Char16>(*first);
00105         if (HIGH_SURROGATE <= cp && cp <= HIGH_SURROGATE_MAX)
00106         {
00107             if (++first == last)
00108             {
00109                 assert(false); //low surrogate expected
00110                 break;
00111             }
00112             assert(LOW_SURROGATE <= static_cast<Char16>(*first) && static_cast<Char16>(*first) <= LOW_SURROGATE_MAX); //low surrogate expected
00113             cp = ((cp - HIGH_SURROGATE) << 10) + static_cast<Char16>(*first) - LOW_SURROGATE + 0x10000;
00114         }
00115         else
00116             assert(cp < LOW_SURROGATE || LOW_SURROGATE_MAX < cp); //NO low surrogate expected
00117 
00118         f(cp);
00119     }
00120     return f;
00121 }
00122 
00123 
00124 template <class OutputIterator> inline
00125 OutputIterator codePointToUtf8(CodePoint cp, OutputIterator result) //http://en.wikipedia.org/wiki/UTF-8
00126 {
00127     typedef unsigned char Char8;
00128 
00129     assert(cp <= CODE_POINT_MAX);
00130 
00131     if (cp < 0x80)
00132         *result++ = static_cast<Char8>(cp);
00133     else if (cp < 0x800)
00134     {
00135         *result++ = static_cast<Char8>((cp >> 6   )| 0xc0);
00136         *result++ = static_cast<Char8>((cp & 0x3f )| 0x80);
00137     }
00138     else if (cp < 0x10000)
00139     {
00140         *result++ = static_cast<Char8>((cp >> 12         )| 0xe0);
00141         *result++ = static_cast<Char8>(((cp >> 6) & 0x3f )| 0x80);
00142         *result++ = static_cast<Char8>((cp & 0x3f        )| 0x80);
00143     }
00144     else
00145     {
00146         *result++ = static_cast<Char8>((cp >> 18          )| 0xf0);
00147         *result++ = static_cast<Char8>(((cp >> 12) & 0x3f )| 0x80);
00148         *result++ = static_cast<Char8>(((cp >> 6)  & 0x3f )| 0x80);
00149         *result++ = static_cast<Char8>((cp & 0x3f         )| 0x80);
00150     }
00151     return result;
00152 }
00153 
00154 
00155 inline
00156 size_t getUtf8Len(unsigned char ch)
00157 {
00158     if (ch < 0x80)
00159         return 1;
00160     if (ch >> 5 == 0x6)
00161         return 2;
00162     if (ch >> 4 == 0xe)
00163         return 3;
00164     if (ch >> 3 == 0x1e)
00165         return 4;
00166 
00167     assert(false); //no valid begin of UTF8 encoding
00168     return 1;
00169 }
00170 
00171 
00172 template <class CharIterator, class Function> inline
00173 Function utf8ToCodePoint(CharIterator first, CharIterator last, Function f) //f is a unary function taking a CodePoint as single parameter
00174 {
00175     assert_static(sizeof(typename std::iterator_traits<CharIterator>::value_type) == 1);
00176     typedef unsigned char Char8;
00177 
00178     for ( ; first != last; ++first)
00179     {
00180         auto getChar = [&](Char8& ch ) -> bool
00181         {
00182             if (++first == last)
00183             {
00184                 assert(false); //low surrogate expected
00185                 return false;
00186             }
00187             ch = static_cast<Char8>(*first);
00188             assert(ch >> 6 == 0x2);
00189             return true;
00190         };
00191 
00192         CodePoint cp = static_cast<Char8>(*first);
00193         switch (getUtf8Len(cp))
00194         {
00195             case 1:
00196                 break;
00197             case 2:
00198             {
00199                 cp = (cp & 0x1f) << 6;
00200                 Char8 ch;
00201                 if (!getChar(ch)) continue;
00202                 cp += ch & 0x3f;
00203             }
00204             break;
00205             case 3:
00206             {
00207                 cp = (cp & 0xf) << 12;
00208                 Char8 ch;
00209                 if (!getChar(ch)) continue;
00210                 cp += (ch & 0x3f) << 6;
00211                 if (!getChar(ch)) continue;
00212                 cp += ch & 0x3f;
00213 
00214             }
00215             break;
00216             case 4:
00217             {
00218                 cp = (cp & 0x7) << 18;
00219                 Char8 ch;
00220                 if (!getChar(ch)) continue;
00221                 cp += (ch & 0x3f) << 12;
00222                 if (!getChar(ch)) continue;
00223                 cp += (ch & 0x3f) << 6;
00224                 if (!getChar(ch)) continue;
00225                 cp += ch & 0x3f;
00226             }
00227             break;
00228             default:
00229                 assert(false);
00230         }
00231         f(cp);
00232     }
00233     return f;
00234 }
00235 
00236 
//Output iterator that appends every value assigned through it to a string via operator+=.
//Dereferencing and incrementing are no-ops; only the assignment has an effect.
//The iterator traits are declared as member typedefs rather than inherited from std::iterator (deprecated in C++17),
//and the target is held by pointer so the iterator stays copy-assignable, as the iterator requirements demand.
template <class String>
class AppendStringIterator
{
public:
    typedef std::output_iterator_tag iterator_category;
    typedef void value_type;
    typedef void difference_type;
    typedef void pointer;
    typedef void reference;

    explicit AppendStringIterator (String& x) : str(&x) {} //x must outlive the iterator and all of its copies
    AppendStringIterator& operator= (typename String::value_type value) { *str += value; return *this; }
    AppendStringIterator& operator*  ()    { return *this; }
    AppendStringIterator& operator++ ()    { return *this; }
    AppendStringIterator  operator++ (int) { return *this; }
private:
    String* str; //never null: always initialized from a reference
};
00249 
00250 
00251 template <class WideString, class CharString> inline
00252 WideString utf8ToWide(const CharString& str, Loki::Int2Type<2>) //windows: convert utf8 to utf16 wchar_t
00253 {
00254     WideString output;
00255     utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
00256     [&](CodePoint cp) { codePointToUtf16(cp, AppendStringIterator<WideString>(output)); });
00257     return output;
00258 }
00259 
00260 
00261 template <class WideString, class CharString> inline
00262 WideString utf8ToWide(const CharString& str, Loki::Int2Type<4>) //other OS: convert utf8 to utf32 wchar_t
00263 {
00264     WideString output;
00265     utf8ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
00266     [&](CodePoint cp) { output += cp; });
00267     return output;
00268 }
00269 
00270 
00271 template <class CharString, class WideString> inline
00272 CharString wideToUtf8(const WideString& str, Loki::Int2Type<2>) //windows: convert utf16-wchar_t to utf8
00273 {
00274     CharString output;
00275     utf16ToCodePoint(strBegin(str), strBegin(str) + strLength(str),
00276     [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); });
00277     return output;
00278 }
00279 
00280 
00281 template <class CharString, class WideString> inline
00282 CharString wideToUtf8(const WideString& str, Loki::Int2Type<4>) //other OS: convert utf32-wchar_t to utf8
00283 {
00284     CharString output;
00285     std::for_each(strBegin(str), strBegin(str) + strLength(str),
00286     [&](CodePoint cp) { codePointToUtf8(cp, AppendStringIterator<CharString>(output)); });
00287     return output;
00288 }
00289 }
00290 
00291 
00292 template <class WideString, class CharString> inline
00293 WideString utf8ToWide(const CharString& str)
00294 {
00295     assert_static((Loki::IsSameType<typename StringTraits<CharString>::CharType, char   >::value));
00296     assert_static((Loki::IsSameType<typename StringTraits<WideString>::CharType, wchar_t>::value));
00297 
00298     return implementation::utf8ToWide<WideString>(str, Loki::Int2Type<sizeof(wchar_t)>());
00299 }
00300 
00301 
00302 template <class CharString, class WideString> inline
00303 CharString wideToUtf8(const WideString& str)
00304 {
00305     assert_static((Loki::IsSameType<typename StringTraits<CharString>::CharType, char   >::value));
00306     assert_static((Loki::IsSameType<typename StringTraits<WideString>::CharType, wchar_t>::value));
00307 
00308     return implementation::wideToUtf8<CharString>(str, Loki::Int2Type<sizeof(wchar_t)>());
00309 }
00310 
00311 
00312 //-------------------------------------------------------------------------------------------
00313 template <class String> inline
00314 std::string toStdString(const String& str, wchar_t) { return wideToUtf8<std::string>(str); } //convert wide character string to UTF8
00315 
00316 template <class String> inline
00317 std::string toStdString(const String& str, char) { return cvrtString<std::string>(str); } //directly process string without UTF8 conversion
00318 
00319 template <class String> inline
00320 std::string toStdString(const String& str) { return toStdString(str, typename StringTraits<String>::CharType()); }
00321 //-------------------------------------------------------------------------------------------
00322 
00323 template <class String> inline
00324 String stdStringTo(const std::string& str, wchar_t) { return utf8ToWide<String>(str); } //convert UTF8 to wide character string
00325 
00326 template <class String> inline
00327 String stdStringTo(const std::string& str, char) { return cvrtString<String>(str); } //directly process string without UTF8 conversion
00328 
00329 template <class String> inline
00330 String stdStringTo(const std::string& str) { return stdStringTo<String>(str, typename StringTraits<String>::CharType()); }
00331 }
00332 
00333 #endif //STRING_UTF8_HEADER_01832479146991573473545
 All Classes Namespaces Files Functions Variables