zenXML
Straightforward C++ XML Processing
string_tools.h
00001 // **************************************************************************
00002 // * This file is part of the zenXML project. It is distributed under the   *
00003 // * Boost Software License, Version 1.0. See accompanying file             *
00004 // * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt.       *
00005 // * Copyright (C) 2011 ZenJu (zhnmju123 AT gmx.de)                         *
00006 // **************************************************************************
00007 
00008 #ifndef STRING_TOOLS_HEADER_213458973046
00009 #define STRING_TOOLS_HEADER_213458973046
00010 
#include <cstddef> //size_t
#include <cctype>  //isspace
#include <cwctype> //iswspace
#include <cwchar>  //swprintf
#include <cstdio>  //sprintf
#include <cstdlib> //strtod
#include <cfloat>  //DBL_MAX_10_EXP
#include <algorithm>
#include <cassert>
#include <sstream>
#include <functional>
#include <vector>
#include "loki/TypeManip.h"
#include "loki/EmptyType.h"
#include "loki/TypeTraits.h"
#include "assert_static.h"
//MSVC only: specialize Loki's IsCustomUnsignedInt/IsCustomSignedInt for the compiler-specific __int64 types
#ifdef _MSC_VER
template <>
struct Loki::IsCustomUnsignedInt<unsigned __int64>
{
    enum { value = 1 };
};

template <>
struct Loki::IsCustomSignedInt<signed __int64>
{
    enum { value = 1 };
};
#endif
00029 
00030 
//enhance arbitrary string class with useful non-member functions:
00032 namespace zen
00033 {
//character-level helpers working on C, which is a character type such as char or wchar_t
template <class C>
size_t cStringLength(const C* str); //strlen()

template <class C>
bool cStringIsWhiteSpace(C ch);

template <class C>
bool cStringIsDigit(C ch);
00037 
00038 //uniform access to string-like types: classes and character arrays
00039 /*
00040 strBegin():
00041         std::wstring str(L"dummy");
00042         char array[] = "dummy";
00043         const wchar_t* iter  = strBegin(str);   //returns str.c_str()
00044         const char*    iter2 = strBegin(array); //returns array
00045 
00046 strLength():
00047         strLength(str);   //equals str.size()
00048         strLength(array); //equals cStringLength(array)
00049 
00050 StringTraits<>:
00051         StringTraits<std::wstring>::CharType  //equals wchar_t
00052         StringTraits<wchar_t[5]>  ::CharType  //equals wchar_t
        StringTraits<const wchar_t*>::isStringLike; //equals "true"
00054         StringTraits<const int*>    ::isStringLike; //equals "false"
00055         StringTraits<std::wstring>::isStringClass  //equals "true"
00056         StringTraits<wchar_t[5]>  ::isStringClass  //equals "false"
00057 */
00058 
//both S and T can be strings or char/wchar_t arrays or simple char/wchar_t
template <class S, class T>
bool startsWith(const S& str, const T& prefix);

template <class S, class T>
bool endsWith(const S& str, const T& postfix);

template <class S, class T>
S afterLast(const S& str, const T& ch); //returns the whole string if ch not found

template <class S, class T>
S beforeLast(const S& str, const T& ch); //returns empty string if ch not found

template <class S, class T>
S afterFirst(const S& str, const T& ch); //returns empty string if ch not found

template <class S, class T>
S beforeFirst(const S& str, const T& ch); //returns the whole string if ch not found

template <class S, class T>
std::vector<S> split(const S& str, const T& delimiter);

template <class S>
void truncate(S& str, size_t newLen);

template <class S, class T, class U>
void replace(S& str, const T& old, const U& replacement, bool replaceAll = true);

template <class S>
void trim(S& str, bool fromLeft = true, bool fromRight = true);

//high-performance conversion from numbers to strings
template <class S, class Num>
S toString(const Num& number);

template <class Num, class S>
Num toNumber(const S& str);

//string to string conversion: converts string-like type into compatible target string class
template <class T, class S>
T cvrtString(const S& str);
00078 
00079 
00080 
00081 
00082 
00083 
00084 
00085 
00086 
00087 
00088 
00089 
00090 
00091 
00092 
00093 
00094 
00095 
00096 
00097 
00098 
00099 
00100 
00101 
00102 
00103 
00104 
00105 
00106 
00107 
00108 
00109 
00110 
00111 
00112 
00113 
00114 //---------------------- implementation ----------------------
00115 
00116 template <class C> inline
00117 size_t cStringLength(const C* str) //strlen()
00118 {
00119     assert_static((Loki::IsSameType<C, char>::value || Loki::IsSameType<C, wchar_t>::value));
00120     size_t len = 0;
00121     while (*str++ != 0)
00122         ++len;
00123     return len;
00124 }
00125 
00126 
00127 template <> inline
00128 bool cStringIsWhiteSpace(char ch)
00129 {
00130     //caveat 1: std::isspace() takes an int, but expects an unsigned char
00131     //caveat 2: some parts of UTF-8 chars are erroneously seen as whitespace, e.g. the a0 from "\xec\x8b\xa0" (MSVC)
00132     return static_cast<unsigned char>(ch) < 128 &&
00133            std::isspace(static_cast<unsigned char>(ch)) != 0;
00134 }
00135 
00136 template <> inline bool cStringIsWhiteSpace(unsigned char ch) { return cStringIsWhiteSpace<char>(ch); }
00137 template <> inline bool cStringIsWhiteSpace(signed char ch)   { return cStringIsWhiteSpace<char>(ch); }
00138 template <> inline bool cStringIsWhiteSpace(wchar_t ch)       { return std::iswspace(ch) != 0; }
00139 
00140 template <> inline
00141 bool cStringIsDigit(char ch)
00142 {
00143     return std::isdigit(static_cast<unsigned char>(ch)) != 0; //caveat: takes an int, but expects an unsigned char
00144 }
00145 
00146 
00147 template <>
00148 inline
00149 bool cStringIsDigit(wchar_t ch)
00150 {
00151     return std::iswdigit(ch) != 0;
00152 }
00153 
00154 namespace implementation
00155 {
//strip one array dimension: T[N] -> T; all other types are passed through unchanged
template <class T>
struct UnArray
{
    typedef T NonArrayType;
};

template <class T, int N>
struct UnArray<T[N]>
{
    typedef T NonArrayType;
};
00161 
//strip one level of pointer: T* -> T; all other types are passed through unchanged
template <class T>
struct UnPointer
{
    typedef T NonPtrType;
};

template <class T>
struct UnPointer<T*>
{
    typedef T NonPtrType;
};
00167 
//strip a reference: T& -> T; all other types are passed through unchanged
template <class T>
struct UnReference
{
    typedef T NonRefType;
};

template <class T>
struct UnReference<T&>
{
    typedef T NonRefType;
};
00173 
00174 
//compile-time check: Result is true if T declares a member type called value_type
template<typename T>
class HasValueType
{
    typedef char Detected   [1];
    typedef char NotDetected[2];

    template <typename U> class TypeTag {};

    //this overload is only viable if U::value_type exists...
    template <class U> static Detected&    probe(TypeTag<typename U::value_type>*);
    //...otherwise the catch-all is selected
    template <class U> static NotDetected& probe(...);

public:
    enum
    {
        Result = sizeof(probe<T>(NULL)) == sizeof(Detected)
    };
};
00191 
00192 
//compile-time check: Result is true if class type T has members named c_str AND length
//primary template (isClassType == false): Result is always false
template<typename T, bool isClassType>
class HasStringMembers
{
public:
    enum { Result = false };
};

template<typename T>
class HasStringMembers<T, true>
{
    typedef char Present[1];
    typedef char Absent [2];

    //detect presence of member functions (without specific restriction on return type, within T or one of its base classes)
    template <typename U, U t> class MemberProbe {};

    //supplies data members carrying the names we are looking for
    struct Decoy
    {
        int c_str;
        int length;
    };

    template <class U>
    struct Merged : public U, public Decoy {}; //U must be a class-type!

    //we don't know the exact declaration of the member attribute (may be in base class), but we know what NOT to expect:
    //if the name resolves to the Decoy member, U itself does not have it
    template <class U> static Absent&  checkCstr(MemberProbe<int Decoy::*, &Merged<U>::c_str>*);
    template <class U> static Present& checkCstr(...);

    template <class U> static Absent&  checkLength(MemberProbe<int Decoy::*, &Merged<U>::length>*);
    template <class U> static Present& checkLength(...);

public:
    enum
    {
        Result = sizeof(checkCstr  <T>(NULL)) == sizeof(Present) &&
                 sizeof(checkLength<T>(NULL)) == sizeof(Present)
    };
};
00229 
00230 template <class S, bool isStringClass> struct StringTraits2 { typedef Loki::EmptyType Result; }; //"StringTraits2": fix some VS bug with namespace and partial template specialization
00231 
00232 template <class S> struct StringTraits2<S, true> { typedef typename S::value_type Result; };
00233 template <> struct StringTraits2<char,    false> { typedef char    Result; };
00234 template <> struct StringTraits2<wchar_t, false> { typedef wchar_t Result; };
00235 }
00236 
00237 template <class S>
00238 struct StringTraits
00239 {
00240 private:
00241     typedef typename implementation::UnReference<S>::NonRefType NonRefType;
00242     typedef typename Loki::TypeTraits<NonRefType>::NonConstType UndecoratedType;
00243 
00244     typedef typename implementation::UnArray<UndecoratedType>::NonArrayType NonArrayType;
00245     typedef typename implementation::UnPointer<NonArrayType>::NonPtrType    NonPtrType;
00246     typedef typename Loki::TypeTraits<NonPtrType>::NonConstType             NonConstValType; //handle "const char* const"
00247 public:
00248     enum
00249     {
00250         isStringClass = implementation::HasStringMembers<UndecoratedType, implementation::HasValueType<UndecoratedType>::Result>::Result
00251     };
00252 
00253     typedef typename implementation::StringTraits2<NonConstValType, isStringClass>::Result CharType;
00254 
00255     enum
00256     {
00257         isStringLike = Loki::IsSameType<CharType, char>::value || Loki::IsSameType<CharType, wchar_t>::value
00258     };
00259 };
00260 
00261 
00262 template <class S> inline
00263 const typename StringTraits<S>::CharType* strBegin(const S& str, typename S::value_type dummy = 0) { return str.c_str(); } //SFINAE: T must be a "string"
00264 
00265 template <class Char>
00266 inline const typename StringTraits<Char>::CharType* strBegin(const Char* str)   { return str; }
00267 inline const char*    strBegin(const char& ch)    { return &ch; }
00268 inline const wchar_t* strBegin(const wchar_t& ch) { return &ch; }
00269 
00270 
//string classes: number of characters as reported by length()
template <class S> inline
size_t strLength(const S& str, typename S::value_type dummy = 0) //SFINAE: S must be a "string"
{
    return str.length();
}

//character pointers and arrays: count characters up to the terminating zero
template <class Char> inline
size_t strLength(const Char* str)
{
    return cStringLength(str);
}

//single characters always have length 1
inline
size_t strLength(char)
{
    return 1;
}

inline
size_t strLength(wchar_t)
{
    return 1;
}
00278 
00279 
00280 template <class S, class T> inline
00281 bool startsWith(const S& str, const T& prefix)
00282 {
00283     assert_static(StringTraits<S>::isStringLike);
00284     assert_static(StringTraits<T>::isStringLike);
00285 
00286     const size_t pfLength = strLength(prefix);
00287     if (strLength(str) < pfLength)
00288         return false;
00289 
00290     return std::equal(strBegin(str), strBegin(str) + pfLength,
00291                       strBegin(prefix));
00292 }
00293 
00294 
00295 template <class S, class T> inline
00296 bool endsWith(const S& str, const T& postfix)
00297 {
00298     assert_static(StringTraits<S>::isStringLike);
00299     assert_static(StringTraits<T>::isStringLike);
00300 
00301     size_t strLen = strLength(str);
00302     size_t pfLen  = strLength(postfix);
00303     if (strLen < pfLen)
00304         return false;
00305 
00306     typedef typename StringTraits<S>::CharType CharType;
00307 
00308     const CharType* cmpBegin = strBegin(str) + strLen - pfLen;
00309     return std::equal(cmpBegin, cmpBegin + pfLen,
00310                       strBegin(postfix));
00311 }
00312 
00313 
00314 // get all characters after the last occurence of ch
00315 // (returns the whole string if ch not found)
00316 template <class S, class T> inline
00317 S afterLast(const S& str, const T& ch)
00318 {
00319     assert_static(StringTraits<T>::isStringLike);
00320 
00321     const size_t pos = str.rfind(ch);
00322     if (pos != S::npos)
00323     {
00324         size_t chLen = strLength(ch);
00325         return S(str.c_str() + pos + chLen, str.length() - pos - chLen);
00326     }
00327     else
00328         return str;
00329 }
00330 
00331 
00332 // get all characters before the last occurence of ch
00333 // (returns empty string if ch not found)
00334 template <class S, class T> inline
00335 S beforeLast(const S& str, const T& ch)
00336 {
00337     assert_static(StringTraits<T>::isStringLike);
00338 
00339     const size_t pos = str.rfind(ch);
00340     if (pos != S::npos)
00341         return S(str.c_str(), pos); //data is non-empty string in this context: else ch would not have been found!
00342     else
00343         return S();
00344 }
00345 
00346 
00347 //returns empty string if ch not found
00348 template <class S, class T> inline
00349 S afterFirst(const S& str, const T& ch)
00350 {
00351     assert_static(StringTraits<T>::isStringLike);
00352 
00353     const size_t pos = str.find(ch);
00354     if (pos != S::npos)
00355     {
00356         size_t chLen = strLength(ch);
00357         return S(str.c_str() + pos + chLen, str.length() - pos - chLen);
00358     }
00359     else
00360         return S();
00361 
00362 }
00363 
00364 
00365 //returns the whole string if ch not found
00366 template <class S, class T> inline
00367 S beforeFirst(const S& str, const T& ch)
00368 {
00369     assert_static(StringTraits<T>::isStringLike);
00370 
00371     const size_t pos = str.find(ch);
00372     if (pos != S::npos)
00373         return S(str.c_str(), pos); //data is non-empty string in this context: else ch would not have been found!
00374     else
00375         return str;
00376 }
00377 
00378 
00379 template <class S, class T> inline
00380 std::vector<S> split(const S& str, const T& delimiter)
00381 {
00382     assert_static(StringTraits<T>::isStringLike);
00383 
00384     std::vector<S> output;
00385     size_t bockStart = 0;
00386     size_t delimLen = strLength(delimiter);
00387     if (delimLen != 0)
00388     {
00389         for (size_t blockEnd = str.find(delimiter, bockStart);
00390              blockEnd != S::npos;
00391              bockStart = blockEnd + delimLen, blockEnd = str.find(delimiter, bockStart))
00392         {
00393             output.push_back(S(str.c_str() + bockStart, blockEnd - bockStart));
00394         }
00395     }
00396     output.push_back(S(str.c_str() + bockStart, str.length() - bockStart));
00397     return output;
00398 }
00399 
00400 
//shorten str to at most newLen characters; shorter strings are left untouched
template <class S> inline
void truncate(S& str, size_t newLen)
{
    if (str.length() <= newLen)
        return; //already short enough: never enlarge the string

    str.resize(newLen);
}
00407 
00408 
00409 template <class S, class T, class U> inline
00410 void replace(S& str, const T& old, const U& replacement, bool replaceAll)
00411 {
00412     assert_static(StringTraits<T>::isStringLike);
00413     assert_static(StringTraits<U>::isStringLike);
00414 
00415     size_t pos = 0;
00416     size_t oldLen = strLength(old);
00417     size_t repLen = strLength(replacement);
00418     while ((pos = str.find(old, pos)) != S::npos)
00419     {
00420         str.replace(pos, oldLen, replacement);
00421         pos += repLen; //move past the string that was replaced
00422 
00423         if (!replaceAll)
00424             break;
00425     }
00426 }
00427 
00428 
//remove whitespace from the begin and/or the end of str
template <class S> inline
void trim(S& str, bool fromLeft, bool fromRight)
{
    assert(fromLeft || fromRight);

    typedef typename S::value_type CharType;

    const CharType* first = str.c_str();
    const CharType* last  = str.c_str() + str.length();

    if (fromRight)
        for (; first != last && cStringIsWhiteSpace(last[-1]); --last) {}

    if (fromLeft)
        for (; first != last && cStringIsWhiteSpace(*first); ++first) {}

    const size_t trimmedLen = last - first;
    if (trimmedLen == str.length())
        return; //nothing to strip

    if (first == str.c_str())
        str.resize(trimmedLen); //begin is unchanged: cutting off the end is sufficient
    else
        str = S(first, trimmedLen); //minor inefficiency: in case "str" is not shared, we could save an allocation and do a memory move only
}
00456 
00457 
namespace implementation
{
//generic case: construct target type T from the character range of the source
template <class S, class T>
struct CnvtStringToString
{
    T convert(const S& src) const
    {
        return T(strBegin(src), strLength(src));
    }
};

//perf: we don't need a deep copy if string types match
template <class S>
struct CnvtStringToString<S, S>
{
    const S& convert(const S& src) const
    {
        return src;
    }
};
}
00472 
00473 template <class T, class S> inline
00474 T cvrtString(const S& str) { return implementation::CnvtStringToString<S, T>().convert(str); }
00475 
00476 
00477 namespace implementation
00478 {
//category of a numeric type: selects the specialization of the conversion templates
enum NumberType
{
    NUM_TYPE_SIGNED_INT,     //signed integers
    NUM_TYPE_UNSIGNED_INT,   //unsigned integers
    NUM_TYPE_FLOATING_POINT, //floating point numbers
    NUM_TYPE_OTHER           //everything else
};
00486 
00487 
00488 template <class S, class Num, NumberType>
00489 struct CvrtNumberToString
00490 {
00491     S convert(const Num& number) const //default number to string conversion using streams: convenient, but SLOW, SLOW, SLOW!!!! (~ factor of 20)
00492     {
00493         typedef typename StringTraits<S>::CharType CharType;
00494 
00495         std::basic_ostringstream<CharType> ss;
00496         ss << number;
00497         return cvrtString<S>(ss.str());
00498     }
00499 };
00500 
00501 
00502 template <class S, class Num>
00503 struct CvrtNumberToString<S, Num, NUM_TYPE_FLOATING_POINT>
00504 {
00505     S convert(const Num& number) const { return convertFloat(number, typename StringTraits<S>::CharType()); }
00506 
00507 private:
00508     S convertFloat(const Num& number, char) const
00509     {
00510         char buffer[50];
00511         int charsWritten = std::sprintf(buffer, "%f", static_cast<double>(number));
00512         return charsWritten > 0 ? S(buffer, charsWritten) : S();
00513     }
00514     S convertFloat(const Num& number, wchar_t) const
00515     {
00516         wchar_t buffer[50];
00517 #ifdef __MINGW32__
00518         int charsWritten = ::swprintf(buffer, L"%f", static_cast<double>(number)); //MinGW does not comply to the C standard!
00519 #else
00520         int charsWritten = std::swprintf(buffer, 50, L"%f", static_cast<double>(number));
00521 #endif
00522         return charsWritten > 0 ? S(buffer, charsWritten) : S();
00523     }
00524 };
00525 
00526 /*
00527 perf: integer to string: (executed 10 mio. times)
00528         std::stringstream - 14796 ms
00529         std::sprintf      -  3086 ms
00530         hand coded        -   778 ms
00531 */
00532 
//write the decimal representation of non-negative n, preceded by '-' if hasMinus is set
template <class S, class Num> inline
S formatInteger(Num n, bool hasMinus)
{
    assert(n >= 0);

    S digits; //filled in reverse order: least significant digit first
    for (;;)
    {
        digits += '0' + n % 10;
        n /= 10;
        if (n == 0)
            break;
    }

    if (hasMinus)
        digits += '-'; //ends up in front after reversing

    std::reverse(digits.begin(), digits.end());
    return digits;
}
00550 
00551 template <class S, class Num>
00552 struct CvrtNumberToString<S, Num, NUM_TYPE_SIGNED_INT>
00553 {
00554     S convert(const Num& number) const { return formatInteger<S>(number < 0 ? -number : number, number < 0); }
00555 };
00556 
00557 template <class S, class Num>
00558 struct CvrtNumberToString<S, Num, NUM_TYPE_UNSIGNED_INT>
00559 {
00560     S convert(const Num& number) const { return formatInteger<S>(number, false); }
00561 };
00562 
00563 //--------------------------------------------------------------------------------
00564 
00565 template <class S, class Num, NumberType>
00566 struct CvrtStringToNumber
00567 {
00568     Num convert(const S& str) const //default string to number conversion using streams: convenient, but SLOW
00569     {
00570         typedef typename StringTraits<S>::CharType CharType;
00571         Num number = 0;
00572         std::basic_istringstream<CharType>(cvrtString<std::basic_string<CharType> >(str)) >> number;
00573         return number;
00574     }
00575 };
00576 
00577 
00578 template <class S, class Num>
00579 struct CvrtStringToNumber<S, Num, NUM_TYPE_FLOATING_POINT>
00580 {
00581     Num convert(const S& str) const { return convertFloat(strBegin(str)); }
00582 
00583 private:
00584     Num convertFloat(const char*    str) const { return std::strtod(str, NULL); }
00585     Num convertFloat(const wchar_t* str) const { return std::wcstod(str, NULL); }
00586 };
00587 
00588 template <class Num, class S>
00589 Num extractInteger(const S& str, bool& hasMinusSign) //very fast conversion to integers: slightly faster than std::atoi, but more importantly: generic
00590 {
00591     typedef typename StringTraits<S>::CharType CharType;
00592 
00593     const CharType* first = strBegin(str);
00594     const CharType* last  = first + strLength(str);
00595 
00596     while (first != last && cStringIsWhiteSpace(*first)) //skip leading whitespace
00597         ++first;
00598 
00599     hasMinusSign = false; //handle minus sign
00600     if (first != last)
00601     {
00602         if (*first == '-')
00603         {
00604             hasMinusSign = true;
00605             ++first;
00606         }
00607         else if (*first == '+')
00608             ++first;
00609     }
00610 
00611     Num number = 0;
00612     for (const CharType* iter = first; iter != last; ++iter)
00613     {
00614         const CharType c = *iter;
00615         if ('0' <= c && c <= '9')
00616         {
00617             number *= 10;
00618             number += c - '0';
00619         }
00620         else
00621         {
00622             assert(std::find_if(iter, last, std::not1(std::ptr_fun(&cStringIsWhiteSpace<CharType>))) == last); //rest of string should contain whitespace only
00623             break;
00624         }
00625     }
00626     return number;
00627 }
00628 
00629 
00630 template <class S, class Num>
00631 struct CvrtStringToNumber<S, Num, NUM_TYPE_SIGNED_INT>
00632 {
00633     Num convert(const S& str) const
00634     {
00635         bool hasMinusSign = false; //handle minus sign
00636         const Num number = extractInteger<Num>(str, hasMinusSign);
00637         return hasMinusSign ? -number : number;
00638     }
00639 };
00640 
00641 
00642 template <class S, class Num>
00643 struct CvrtStringToNumber<S, Num, NUM_TYPE_UNSIGNED_INT>
00644 {
00645     Num convert(const S& str) const //very fast conversion to integers: slightly faster than std::atoi, but more importantly: generic
00646     {
00647         bool hasMinusSign = false; //handle minus sign
00648         const Num number = extractInteger<Num>(str, hasMinusSign);
00649         if (hasMinusSign)
00650         {
00651             assert(false);
00652             return 0U;
00653         }
00654         return number;
00655     }
00656 };
00657 }
00658 
00659 
00660 template <class S, class Num>
00661 inline
00662 S toString(const Num& number) //convert number to string the C++ way
00663 {
00664     using namespace implementation;
00665     return CvrtNumberToString<S, Num,
00666            Loki::TypeTraits<Num>::isSignedInt   ? NUM_TYPE_SIGNED_INT :
00667            Loki::TypeTraits<Num>::isUnsignedInt ? NUM_TYPE_UNSIGNED_INT :
00668            Loki::TypeTraits<Num>::isFloat       ? NUM_TYPE_FLOATING_POINT :
00669            NUM_TYPE_OTHER
00670            >().convert(number);
00671 }
00672 
00673 
00674 template <class Num, class S>
00675 inline
00676 Num toNumber(const S& str) //convert string to number the C++ way
00677 {
00678     using namespace implementation;
00679     return CvrtStringToNumber<S, Num,
00680            Loki::TypeTraits<Num>::isSignedInt   ? NUM_TYPE_SIGNED_INT :
00681            Loki::TypeTraits<Num>::isUnsignedInt ? NUM_TYPE_UNSIGNED_INT :
00682            Loki::TypeTraits<Num>::isFloat       ? NUM_TYPE_FLOATING_POINT :
00683            NUM_TYPE_OTHER
00684            >().convert(str);
00685 }
00686 
00687 }
00688 
00689 #endif //STRING_TOOLS_HEADER_213458973046
 All Classes Namespaces Files Functions Variables