/***************************************************************************** file : $Id: encodings.h,v 1.6 2006/10/02 15:24:40 nils Exp $ description : latin1, windows_1252 and cp437 encoding ------------------------------------------------------------------------ copyright : (C) 2006 by Nils Springob, Aachen, GERMANY based on John Maddock's unicode_iterator.hpp email : nils.springob@nicai-systems.de project : nicai-systems library *****************************************************************************/ #ifndef _NICAI_ENCODINGS_H_ #define _NICAI_ENCODINGS_H_ #include "boost/regex/pending/unicode_iterator.hpp" #include #include namespace nicai { typedef char utf8_char; typedef boost::uint16_t utf16_char; typedef boost::uint32_t utf32_char; typedef std::basic_string utf8_raw_string; typedef std::basic_string utf16_raw_string; typedef std::basic_string utf32_raw_string; template class u32_to_charencoded_iterator : public boost::iterator_facade, const U8Type, ::boost::bidirectional_traversal_tag, const U8Type> { private: typedef boost::iterator_facade, const U8Type, ::boost::bidirectional_traversal_tag, const U8Type> base_type; #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) typedef typename std::iterator_traits::value_type base_value_type; BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8); #endif typename base_type::reference dereference() const { return Encoding::encode(*m_position); } bool equal(const u32_to_charencoded_iterator& that) const { return(m_position == that.m_position); } void increment() { ++m_position; } void decrement() { --m_position; } friend class boost::iterator_core_access; public: BaseIterator base()const { return m_position; } // construct: u32_to_charencoded_iterator() : m_position() { } u32_to_charencoded_iterator(BaseIterator b) : m_position(b) { } private: mutable typename base_type::value_type m_val; BaseIterator m_position; }; template class charencoded_to_u32_iterator : public boost::iterator_facade, const U32Type, std::input_iterator_tag, const U32Type> { private: typedef boost::iterator_facade, const U32Type, std::input_iterator_tag, const U32Type> base_type; // special values for pending iterator reads: BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu); #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) typedef typename std::iterator_traits::value_type base_value_type; BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8); BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); #endif typename base_type::reference dereference()const { return Encoding::decode(*m_position); } bool equal(const charencoded_to_u32_iterator& that)const { return m_position == that.m_position; } void increment() { ++m_position; } void decrement() { --m_position; } friend class boost::iterator_core_access; public: BaseIterator base()const { return m_position; } charencoded_to_u32_iterator() : m_position() { } charencoded_to_u32_iterator(BaseIterator b) : m_position(b) { } private: mutable typename base_type::value_type m_val; BaseIterator m_position; }; template class charencoded_output_iterator : public boost::iterator_facade, U32Type, std::output_iterator_tag, const charencoded_output_iterator &> { private: typedef boost::iterator_facade, U32Type, std::output_iterator_tag, const charencoded_output_iterator &> base_type; // special values for pending iterator reads: #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) typedef typename std::iterator_traits::value_type base_value_type; //BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8); BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); #endif public: friend class boost::iterator_core_access; charencoded_output_iterator(const BaseIterator& b) : m_position(b){} const charencoded_output_iterator& dereference() const { return *this; } void operator=(boost::uint32_t val)const { push(val); } void increment() { } BaseIterator base()const { return m_position; } private: void push(boost::uint32_t c)const { *m_position++ = Encoding::encode(c); } mutable BaseIterator m_position; }; class latin1_encoding { public: static utf32_char decode(char c) { return (static_cast(c)); } static char encode(utf32_char c) { return (c>255)?'?':static_cast(c); } }; class windows_1252_encoding { private: static utf32_char decode_undefined() {return '?';} static char encode_undefined() {return '?';} public: static utf32_char decode(char c) { switch (c) { case 0x80: return 0x20AC; case 0x81: return decode_undefined(); case 0x82: return 0x201A; case 0x83: return 0x0192; case 0x84: return 0x201E; case 0x85: return 0x2026; case 0x86: return 0x2020; case 0x87: return 0x2021; case 0x88: return 0x02C6; case 0x89: return 0x2030; case 0x8a: return 0x0160; case 0x8b: return 0x2039; case 0x8c: return 0x0152; case 0x8d: return decode_undefined(); case 0x8e: return 0x017D; case 0x8f: return decode_undefined(); case 0x90: return decode_undefined(); case 0x91: return 0x2018; case 0x92: return 0x2019; case 0x93: return 0x201C; case 0x94: return 0x201D; case 0x95: return 0x2022; case 0x96: return 0x2013; case 0x97: return 0x2014; case 0x98: return 0x02DC; case 0x99: return 0x2122; case 0x9a: return 0x0161; case 0x9b: return 0x203A; case 0x9c: return 0x0153; case 0x9d: return decode_undefined(); case 0x9e: return 0x017E; case 0x9f: return 0x0178; default: return (static_cast(c)); } } static char encode(utf32_char c) { switch (c) { case 0x20AC: return 0x80; // undefined; case 0x201A: return 0x82; case 0x0192: return 0x83; case 0x201E: return 0x84; case 0x2026: return 0x85; case 0x2020: return 0x86; case 0x2021: return 0x87; case 0x02C6: return 0x88; case 0x2030: return 0x89; case 0x0160: return 0x8a; case 0x2039: return 0x8b; case 0x0152: return 0x8c; // undefined; case 0x017D: return 0x8e; // undefined; // undefined; case 0x2018: return 0x91; case 0x2019: return 0x92; case 0x201C: return 0x93; case 0x201D: return 0x94; case 0x2022: return 0x95; case 0x2013: return 0x96; case 0x2014: return 0x97; case 0x02DC: return 0x98; case 0x2122: return 0x99; case 0x0161: return 0x9a; case 0x203A: return 0x9b; case 0x0153: return 0x9c; // undefined; case 0x017E: return 0x9e; case 0x0178: return 0x9f; } return ((c>255)||(c>=0x80)&&(c<0xa0))?encode_undefined():static_cast(c); } }; class cp437_encoding { private: static utf32_char decode_undefined() {return '?';} static char encode_undefined() {return '?';} public: static utf32_char decode(char c) { switch (c) { case 0x80: return 0x00C7; case 0x81: return 0x00FC; case 0x82: return 0x00E9; case 0x83: return 0x00E2; case 0x84: return 0x00E4; case 0x85: return 0x00E0; case 0x86: return 0x00E5; case 0x87: return 0x00E7; case 0x88: return 0x00EA; case 0x89: return 0x00EB; case 0x8a: return 0x00E8; case 0x8b: return 0x00EF; case 0x8c: return 0x00EE; case 0x8d: return 0x00EC; case 0x8e: return 0x00C4; case 0x8f: return 0x00C5; case 0x90: return 0x00C9; case 0x91: return 0x00E6; case 0x92: return 0x00C6; case 0x93: return 0x00F4; case 0x94: return 0x00F6; case 0x95: return 0x00F2; case 0x96: return 0x00FB; case 0x97: return 0x00F9; case 0x98: return 0x00FF; case 0x99: return 0x00D6; case 0x9a: return 0x00DC; case 0x9b: return 0x00A2; case 0x9c: return 0x00A3; case 0x9d: return 0x00A5; case 0x9e: return 0x20A7; case 0x9f: return 0x0192; case 0xa0: return 0x00e1; case 0xa1: return 0x00ed; case 0xa2: return 0x00f3; case 0xa3: return 0x00fa; case 0xa4: return 0x00f1; case 0xa5: return 0x00d1; case 0xa6: return 0x00aa; case 0xa7: return 0x00ba; case 0xa8: return 0x00bf; case 0xa9: return 0x2310; case 0xaa: return 0x00ac; case 0xab: return 0x00bd; case 0xac: return 0x00bc; case 0xad: return 0x00a1; case 0xae: return 0x00ab; case 0xaf: return 0x00bb; case 0xb0: return 0x2591; case 0xb1: return 0x2592; case 0xb2: return 0x2593; case 0xb3: return 0x2502; case 0xb4: return 0x2524; case 0xb5: return 0x2561; case 0xb6: return 0x2562; case 0xb7: return 0x2556; case 0xb8: return 0x2555; case 0xb9: return 0x2563; case 0xba: return 0x2551; case 0xbb: return 0x2557; case 0xbc: return 0x255d; case 0xbd: return 0x255c; case 0xbe: return 0x255b; case 0xbf: return 0x2510; case 0xc0: return 0x2514; case 0xc1: return 0x2534; case 0xc2: return 0x252c; case 0xc3: return 0x251c; case 0xc4: return 0x2500; case 0xc5: return 0x253c; case 0xc6: return 0x255e; case 0xc7: return 0x255f; case 0xc8: return 0x255a; case 0xc9: return 0x2554; case 0xca: return 0x2569; case 0xcb: return 0x2566; case 0xcc: return 0x2560; case 0xcd: return 0x2550; case 0xce: return 0x256c; case 0xcf: return 0x2567; case 0xd0: return 0x2568; case 0xd1: return 0x2564; case 0xd2: return 0x2565; case 0xd3: return 0x2559; case 0xd4: return 0x2558; case 0xd5: return 0x2552; case 0xd6: return 0x2553; case 0xd7: return 0x256b; case 0xd8: return 0x256a; case 0xd9: return 0x2518; case 0xda: return 0x250c; case 0xdb: return 0x2588; case 0xdc: return 0x2584; case 0xdd: return 0x258c; case 0xde: return 0x2590; case 0xdf: return 0x2580; case 0xe0: return 0x03b1; case 0xe1: return 0x00df; case 0xe2: return 0x0393; case 0xe3: return 0x03c0; case 0xe4: return 0x03a3; case 0xe5: return 0x03c3; case 0xe6: return 0x00b5; case 0xe7: return 0x03c4; case 0xe8: return 0x03a6; case 0xe9: return 0x0398; case 0xea: return 0x03a9; case 0xeb: return 0x03b4; case 0xec: return 0x221e; case 0xed: return 0x03c6; case 0xee: return 0x03b5; case 0xef: return 0x2229; case 0xf0: return 0x2261; case 0xf1: return 0x00b1; case 0xf2: return 0x2265; case 0xf3: return 0x2264; case 0xf4: return 0x2320; case 0xf5: return 0x2321; case 0xf6: return 0x00f7; case 0xf7: return 0x2248; case 0xf8: return 0x00b0; case 0xf9: return 0x2219; case 0xfa: return 0x00b7; case 0xfb: return 0x221a; case 0xfc: return 0x207f; case 0xfd: return 0x00b2; case 0xfe: return 0x25a0; case 0xff: return 0x00a0; default: return (static_cast(c)); } } static char encode(utf32_char c) { switch (c) { case 0x00C7: return 0x80; case 0x00FC: return 0x81; case 0x00E9: return 0x82; case 0x00E2: return 0x83; case 0x00E4: return 0x84; case 0x00E0: return 0x85; case 0x00E5: return 0x86; case 0x00E7: return 0x87; case 0x00EA: return 0x88; case 0x00EB: return 0x89; case 0x00E8: return 0x8a; case 0x00EF: return 0x8b; case 0x00EE: return 0x8c; case 0x00EC: return 0x8d; case 0x00C4: return 0x8e; case 0x00C5: return 0x8f; case 0x00C9: return 0x90; case 0x00E6: return 0x91; case 0x00C6: return 0x92; case 0x00F4: return 0x93; case 0x00F6: return 0x94; case 0x00F2: return 0x95; case 0x00FB: return 0x96; case 0x00F9: return 0x97; case 0x00FF: return 0x98; case 0x00D6: return 0x99; case 0x00DC: return 0x9a; case 0x00A2: return 0x9b; case 0x00A3: return 0x9c; case 0x00A5: return 0x9d; case 0x20A7: return 0x9e; case 0x0192: return 0x9f; case 0x00e1: return 0xa0; case 0x00ed: return 0xa1; case 0x00f3: return 0xa2; case 0x00fa: return 0xa3; case 0x00f1: return 0xa4; case 0x00d1: return 0xa5; case 0x00aa: return 0xa6; case 0x00ba: return 0xa7; case 0x00bf: return 0xa8; case 0x2310: return 0xa9; case 0x00ac: return 0xaa; case 0x00bd: return 0xab; case 0x00bc: return 0xac; case 0x00a1: return 0xad; case 0x00ab: return 0xae; case 0x00bb: return 0xaf; case 0x2591: return 0xb0; case 0x2592: return 0xb1; case 0x2593: return 0xb2; case 0x2502: return 0xb3; case 0x2524: return 0xb4; case 0x2561: return 0xb5; case 0x2562: return 0xb6; case 0x2556: return 0xb7; case 0x2555: return 0xb8; case 0x2563: return 0xb9; case 0x2551: return 0xba; case 0x2557: return 0xbb; case 0x255d: return 0xbc; case 0x255c: return 0xbd; case 0x255b: return 0xbe; case 0x2510: return 0xbf; case 0x2514: return 0xc0; case 0x2534: return 0xc1; case 0x252c: return 0xc2; case 0x251c: return 0xc3; case 0x2500: return 0xc4; case 0x253c: return 0xc5; case 0x255e: return 0xc6; case 0x255f: return 0xc7; case 0x255a: return 0xc8; case 0x2554: return 0xc9; case 0x2569: return 0xca; case 0x2566: return 0xcb; case 0x2560: return 0xcc; case 0x2550: return 0xcd; case 0x256c: return 0xce; case 0x2567: return 0xcf; case 0x2568: return 0xd0; case 0x2564: return 0xd1; case 0x2565: return 0xd2; case 0x2559: return 0xd3; case 0x2558: return 0xd4; case 0x2552: return 0xd5; case 0x2553: return 0xd6; case 0x256b: return 0xd7; case 0x256a: return 0xd8; case 0x2518: return 0xd9; case 0x250c: return 0xda; case 0x2588: return 0xdb; case 0x2584: return 0xdc; case 0x258c: return 0xdd; case 0x2590: return 0xde; case 0x2580: return 0xdf; case 0x03b1: return 0xe0; case 0x00df: return 0xe1; case 0x0393: return 0xe2; case 0x03c0: return 0xe3; case 0x03a3: return 0xe4; case 0x03c3: return 0xe5; case 0x00b5: return 0xe6; case 0x03c4: return 0xe7; case 0x03a6: return 0xe8; case 0x0398: return 0xe9; case 0x03a9: return 0xea; case 0x03b4: return 0xeb; case 0x221e: return 0xec; case 0x03c6: return 0xed; case 0x03b5: return 0xee; case 0x2229: return 0xef; case 0x2261: return 0xf0; case 0x00b1: return 0xf1; case 0x2265: return 0xf2; case 0x2264: return 0xf3; case 0x2320: return 0xf4; case 0x2321: return 0xf5; case 0x00f7: return 0xf6; case 0x2248: return 0xf7; case 0x00b0: return 0xf8; case 0x2219: return 0xf9; case 0x00b7: return 0xfa; case 0x221a: return 0xfb; case 0x207f: return 0xfc; case 0x00b2: return 0xfd; case 0x25a0: return 0xfe; case 0x00a0: return 0xff; } return (c>0x0080)?encode_undefined():static_cast(c); } }; template class charencoded_traits { public: typedef char char_type; typedef std::string string_type; typedef string_type::iterator string_iterator; typedef string_type::const_iterator string_const_iterator; typedef std::back_insert_iterator back_inserter; typedef charencoded_to_u32_iterator decode_iterator; typedef charencoded_to_u32_iterator decode_const_iterator; typedef charencoded_output_iterator push_encode_iterator; }; typedef charencoded_traits latin1_traits; typedef charencoded_traits windows_1252_traits; typedef charencoded_traits cp437_traits; } // namespace #endif