/***************************************************************************** file : $Id: unicode.h,v 1.6 2006/10/02 15:24:40 nils Exp $ description : unicode string type, utf8, utf16 and utf32 encoding ------------------------------------------------------------------------ copyright : (C) 2006 by Nils Springob, Aachen, GERMANY email : nils.springob@nicai-systems.de project : nicai-systems library *****************************************************************************/ #ifndef _NICAI_UNICODE_H_ #define _NICAI_UNICODE_H_ #include "boost/regex/pending/unicode_iterator.hpp" #include #include namespace nicai { typedef char utf8_char; typedef boost::uint16_t utf16_char; typedef boost::uint32_t utf32_char; typedef std::basic_string utf8_raw_string; typedef std::basic_string utf16_raw_string; typedef std::basic_string utf32_raw_string; class utf8_traits { public: typedef utf8_char char_type; typedef utf8_raw_string string_type; typedef string_type::iterator string_iterator; typedef string_type::const_iterator string_const_iterator; typedef std::back_insert_iterator back_inserter; typedef boost::u8_to_u32_iterator decode_iterator; typedef boost::u8_to_u32_iterator decode_const_iterator; typedef boost::utf8_output_iterator push_encode_iterator; }; class utf16_traits { public: typedef utf16_char char_type; typedef utf16_raw_string string_type; typedef string_type::iterator string_iterator; typedef string_type::const_iterator string_const_iterator; typedef std::back_insert_iterator back_inserter; typedef boost::u16_to_u32_iterator decode_iterator; typedef boost::u16_to_u32_iterator decode_const_iterator; typedef boost::utf16_output_iterator push_encode_iterator; }; class utf32_traits { public: typedef utf32_char char_type; typedef utf32_raw_string string_type; typedef string_type::iterator string_iterator; typedef string_type::const_iterator string_const_iterator; typedef std::back_insert_iterator back_inserter; typedef string_iterator decode_iterator; typedef string_const_iterator decode_const_iterator; typedef back_inserter push_encode_iterator; }; template class unicode_string { public: typedef T traits_type; typedef utf32_char value_type; typedef typename traits_type::char_type char_type; typedef typename traits_type::string_type string_type; typedef typename string_type::size_type size_type; typedef typename string_type::difference_type difference_type; typedef typename traits_type::decode_iterator iterator; typedef typename traits_type::decode_const_iterator const_iterator; typedef typename traits_type::push_encode_iterator pushback_iterator; typedef std::reverse_iterator reverse_iterator; typedef std::reverse_iterator const_reverse_iterator; BOOST_STATIC_CONSTANT(size_type, npos = -1); private: string_type raw_string; typedef typename traits_type::back_inserter raw_back_inserter; typedef typename traits_type::string_iterator raw_string_iterator; public: /** iterator access methods */ const_iterator begin() const { return const_iterator(raw_string.begin()); } const_iterator end() const { return const_iterator(raw_string.end()); } iterator begin() { return iterator(raw_string.begin()); } iterator end() { return iterator(raw_string.end()); } const_reverse_iterator rbegin() const { return end(); } const_reverse_iterator rend() const { return begin(); } pushback_iterator pushback_end() { return pushback_iterator(back_inserter(raw_string)); } /** append methods */ unicode_string & append (const unicode_string & s2) { raw_string.append(s2.raw_string); return *this; } template unicode_string & append(typename unicode_string::const_iterator begin2, typename unicode_string::const_iterator end2) { std::copy(begin2, end2, pushback_end()); return *this; } unicode_string & append(const_iterator begin2, const_iterator end2) { raw_string.append(begin2.base(), end2.base()); return *this; } template unicode_string & append(const unicode_string & s2) { return append(s2.begin(), s2.end()); } unicode_string & append(size_type n, utf32_char & uc) { while(n--) push_back(uc); return *this; } /** constructors */ unicode_string() { } explicit unicode_string(const string_type & s2) { raw_string=s2; } unicode_string(const unicode_string & s2) { raw_string=s2.raw_string; } template unicode_string(const unicode_string & s2) { append(s2); } unicode_string(size_type n, utf32_char & uc) { while(n--) push_back(uc); } template unicode_string (typename unicode_string::const_iterator begin2, typename unicode_string::const_iterator end2) { append(begin2, end2); } /** assign */ unicode_string & assign(const unicode_string & s2) { raw_string=s2.raw_string; return *this; } template unicode_string & assign(const unicode_string & s2) { clear(); return append(s2); } unicode_string & assign(size_type n, utf32_char & uc) { clear(); while(n--) push_back(uc); return *this; } template unicode_string & assign(typename unicode_string::const_iterator begin2, typename unicode_string::const_iterator end2) { clear(); return append(begin2, end2); } /** access */ const string_type & raw() const { return raw_string; } bool empty() const { return raw_string.empty(); } size_type size() const { return std::distance(begin(), end()); } void swap(const unicode_string & s2) { raw_string.swap(s2.raw_string); } void clear() { raw_string.clear(); } void push_back (utf32_char & uc) { pushback_end()=uc; } /** insert */ void insert(size_type pos, const unicode_string & s2) { iterator it; std::advance(it, pos); raw_string.insert(it.base(), s2.raw_string); } void insert(size_type pos, utf32_char & uc) { unicode_string s2; s2.append(uc); insert(pos, s2); } template void insert(size_type pos, const unicode_string & s2) { unicode_string us(s2); insert(pos, us); } /** compare */ int compare (const unicode_string & s2) const { return raw_string.compare(s2.raw_string); } template int compare(typename unicode_string::const_iterator begin2, typename unicode_string::const_iterator end2) const { const_iterator begin1(begin()), end1(end()); while (begin1!=end1 && begin2!=end2) { if (*begin1<*begin2) return -1; if (*begin1>*begin2) return 1; begin1++, begin2++; } if (begin1!=end1) return 1; if (begin2!=end2) return -1; return 0; } template int compare(const unicode_string & s2) const { return compare(s2.begin(), s2.end()); } /** erase */ unicode_string & erase(size_type pos=0, size_type n=npos) { iterator first; std::advance(first, pos); if (n==npos) erase(first); else { iterator last(first); std::advance(last, n); erase(first, last); } return *this; } iterator erase(const iterator & first, const iterator & last) { raw_string.erase(first.base(), last.base()); return first; } iterator erase(const iterator & first) { raw_string.erase(first.base()); return first; } /** replace */ template unicode_string & replace (iterator begin1, iterator end1, typename unicode_string::cont_iterator begin2, typename unicode_string::const_iterator end2) { unicode_string u2(begin2, end2); raw_string.replace(raw_string_iterator(begin1.base()), raw_string_iterator(end1.base()), u2.begin().base(), u2.end().base()); return *this; } template unicode_string & replace (iterator begin1, size_type n, const unicode_string & s2) { iterator end1(begin1); std::advance(end1, n); unicode_string u2(s2); raw_string.replace(raw_string_iterator(begin1.base()), raw_string_iterator(end1.base()), u2.begin().base(), u2.end().base()); return *this; } template unicode_string & replace (iterator begin1, iterator end1, const unicode_string & s2) { unicode_string u2(s2); raw_string.replace(raw_string_iterator(begin1.base()), raw_string_iterator(end1.base()), u2.begin().base(), u2.end().base()); return *this; } template unicode_string & replace (size_type pos, size_type n, const unicode_string & s2) { iterator begin1(begin()); std::advance(begin1, pos); iterator end1(begin1); std::advance(end1, n); replace(begin1, end1, s2.begin(), s2.end()); } template unicode_string & replace (size_type pos, size_type n, const unicode_string & s2, typename unicode_string::size_type pos2, typename unicode_string::size_type n2) { iterator begin1(begin()); std::advance(begin1, pos); iterator end1(begin1); std::advance(end1, n); typename unicode_string::iterator begin2(begin()); std::advance(begin2, pos); typename unicode_string::iterator end2(begin2); std::advance(end2, n); replace(begin1, end1, begin2, end2); } /** substr */ unicode_string substr (size_type pos, size_type n) { iterator first; std::advance(first, pos); iterator last(first); std::advance(last, n); return unicode_string(string_type(first.base(), last.base())); } /** find will return an iterator. This is in contrast to basic_string which returns a position */ template const_iterator find(Iterator begin2, Iterator end2, const_iterator begin1=begin(), const_iterator end1=end()) const { return std::search(begin1, end1, begin2, end2); } template iterator find(Iterator begin2, Iterator end2, iterator begin1=begin(), iterator end1=end()) { return std::search(begin1, end1, begin2, end2); } template const_iterator find(const unicode_string & us) const { return find(us.begin(), us.end(), begin(), end()); } template iterator find(const unicode_string & us) { return find(us.begin(), us.end(), begin(), end()); } template const_iterator find(const unicode_string & us, const_iterator start) const { return find(us.begin(), us.end(), start, end()); } template iterator find(const unicode_string & us, iterator start) { return find(us.begin(), us.end(), start, end()); } template const_iterator find(const unicode_string & us, size_type pos) const { const_iterator start = begin(); advance(start, pos); return find(us.begin(), us.end(), start, end()); } template iterator find(const unicode_string & us, size_type pos) { iterator start = begin(); advance(start, pos); return find(us.begin(), us.end(), start, end()); } /** rfind will return an iterator. This is in contrast to basic_string which returns a position */ /*TODO*/ /** OPERATORS */ unicode_string & operator= (const unicode_string & s2) { return assign(s2); } template unicode_string & operator= (const unicode_string & s2) { return assign(s2); } unicode_string & operator= (utf32_char uc) { return assign(1, uc); } template unicode_string & operator+= (const unicode_string & s2) { return append(s2); } unicode_string & operator+= (utf32_char uc) { push_back(uc); return *this; } template bool operator==(const unicode_string & rhs) const { return compare(rhs)==0; } template bool operator!=(const unicode_string & rhs) const { return compare(rhs)!=0; } template bool operator<(const unicode_string & rhs) const { return compare(rhs)<0; } template bool operator<=(const unicode_string & rhs) const { return compare(rhs)<=0; } template bool operator>(const unicode_string & rhs) const { return compare(rhs)>0; } template bool operator>=(const unicode_string & rhs) const { return compare(rhs)>=0; } template unicode_string operator+ (const unicode_string & rhs) const { return unicode_string(*this).append(rhs); } unicode_string operator+ (utf32_char uc) const { unicode_string res(*this); res.push_back(uc); return res; } }; typedef unicode_string utf8_string; typedef unicode_string utf16_string; typedef unicode_string utf32_string; } // namespace #endif