checked.h

00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00027 
00028 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030 
00031 #include "core.h"
00032 #include <stdexcept>
00033 
00034 namespace utf8
00035 {
00036     // Exceptions that may be thrown from the library functions.
// Exception that carries a 32-bit code point value rejected by the library.
class invalid_code_point : public std::exception {
public:
    // Remembers the offending value so a handler can inspect it later.
    invalid_code_point(uint32_t cp) : rejected_(cp) {}

    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    // Returns the value that was handed to the constructor.
    uint32_t code_point() const
    {
        return rejected_;
    }

private:
    uint32_t rejected_;
};
00044 
// Exception that carries a single octet which was rejected as UTF-8 input.
class invalid_utf8 : public std::exception {
public:
    // Remembers the offending octet so a handler can inspect it later.
    invalid_utf8 (uint8_t u) : octet_(u) {}

    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    // Returns the octet that was handed to the constructor.
    uint8_t utf8_octet() const
    {
        return octet_;
    }

private:
    uint8_t octet_;
};
00052 
// Exception that carries a single 16-bit unit which was rejected as UTF-16 input.
class invalid_utf16 : public std::exception {
public:
    // Remembers the offending 16-bit unit so a handler can inspect it later.
    invalid_utf16 (uint16_t u) : unit_(u) {}

    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    // Returns the 16-bit unit that was handed to the constructor.
    uint16_t utf16_word() const
    {
        return unit_;
    }

private:
    uint16_t unit_;
};
00060 
// Exception without payload; its message is the fixed text "Not enough space".
class not_enough_room : public std::exception {
public:
    // Fixed description of the error category.
    virtual const char* what() const throw()
    {
        static const char* const message = "Not enough space";
        return message;
    }
};
00065 
00067  
00068     template <typename octet_iterator, typename output_iterator>
00069     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00070     {
00071         while (start != end) {
00072             octet_iterator sequence_start = start;
00073             internal::utf_error err_code = internal::validate_next(start, end);
00074             switch (err_code) {
00075                 case internal::OK :
00076                     for (octet_iterator it = sequence_start; it != start; ++it)
00077                         *out++ = *it;
00078                     break;
00079                 case internal::NOT_ENOUGH_ROOM:
00080                     throw not_enough_room();
00081                 case internal::INVALID_LEAD:
00082                     append (replacement, out);
00083                     ++start;
00084                     break;
00085                 case internal::INCOMPLETE_SEQUENCE:
00086                 case internal::OVERLONG_SEQUENCE:
00087                 case internal::INVALID_CODE_POINT:
00088                     append (replacement, out);
00089                     ++start;
00090                     // just one replacement mark for the sequence
00091                     while (internal::is_trail(*start) && start != end)
00092                         ++start;
00093                     break;
00094             }
00095         }   
00096         return out;
00097     }
00098 
00099     template <typename octet_iterator, typename output_iterator>
00100     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00101     {
00102         static const uint32_t replacement_marker = internal::mask16(0xfffd);
00103         return replace_invalid(start, end, out, replacement_marker);
00104     }
00105 
00106     template <typename octet_iterator>
00107     octet_iterator append(uint32_t cp, octet_iterator result)
00108     {
00109         if (!internal::is_code_point_valid(cp)) 
00110             throw invalid_code_point(cp);
00111 
00112         if (cp < 0x80)                        // one octet
00113             *(result++) = static_cast<uint8_t>(cp);  
00114         else if (cp < 0x800) {                // two octets
00115             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
00116             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00117         }
00118         else if (cp < 0x10000) {              // three octets
00119             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
00120             *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f     | 0x80);
00121             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00122         }
00123         else if (cp <= internal::CODE_POINT_MAX) {      // four octets
00124             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
00125             *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f     | 0x80);
00126             *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f     | 0x80);
00127             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
00128         }
00129         else
00130             throw invalid_code_point(cp);
00131 
00132         return result;
00133     }
00134 
00135     template <typename octet_iterator>
00136     uint32_t next(octet_iterator& it, octet_iterator end)
00137     {
00138         uint32_t cp = 0;
00139         internal::utf_error err_code = internal::validate_next(it, end, &cp);
00140         switch (err_code) {
00141             case internal::OK :
00142                 break;
00143             case internal::NOT_ENOUGH_ROOM :
00144                 throw not_enough_room();
00145             case internal::INVALID_LEAD :
00146             case internal::INCOMPLETE_SEQUENCE :
00147             case internal::OVERLONG_SEQUENCE :
00148                 throw invalid_utf8(*it);
00149             case internal::INVALID_CODE_POINT :
00150                 throw invalid_code_point(cp);
00151         }
00152         return cp;        
00153     }
00154 
00155     template <typename octet_iterator>
00156     uint32_t prior(octet_iterator& it, octet_iterator start)
00157     {
00158         octet_iterator end = it;
00159         while (internal::is_trail(*(--it))) 
00160             if (it < start)
00161                 throw invalid_utf8(*it); // error - no lead byte in the sequence
00162         octet_iterator temp = it;
00163         return next(temp, end);
00164     }
00165 
00167     template <typename octet_iterator>
00168     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00169     {
00170         octet_iterator end = it;
00171         while (internal::is_trail(*(--it))) 
00172             if (it == pass_start)
00173                 throw invalid_utf8(*it); // error - no lead byte in the sequence
00174         octet_iterator temp = it;
00175         return next(temp, end);
00176     }
00177 
// Calls next(it, end) `n` times, discarding the decoded code points; does
// nothing when `n` is not greater than zero. Exceptions thrown by next()
// propagate to the caller.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type consumed = 0;
    while (consumed < n) {
        next(it, end);
        ++consumed;
    }
}
00184 
// Counts how many times next(first, last) has to be called until `first` is no
// longer less than `last`, and returns that count. Exceptions thrown by next()
// propagate to the caller. The loop condition uses operator<, so the iterator
// type has to support it.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type code_points = 0;
    while (first < last) {
        next(first, last);
        ++code_points;
    }
    return code_points;
}
00194 
00195     template <typename u16bit_iterator, typename octet_iterator>
00196     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00197     {       
00198         while (start != end) {
00199             uint32_t cp = internal::mask16(*start++);
00200             // Take care of surrogate pairs first
00201             if (internal::is_surrogate(cp)) {
00202                 if (start != end) {
00203                     uint32_t trail_surrogate = internal::mask16(*start++);
00204                     if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
00205                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;                    
00206                     else 
00207                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00208                 }
00209                 else 
00210                     throw invalid_utf16(static_cast<uint16_t>(*start));
00211             
00212             }
00213             result = append(cp, result);
00214         }
00215         return result;        
00216     }
00217 
00218     template <typename u16bit_iterator, typename octet_iterator>
00219     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00220     {
00221         while (start != end) {
00222             uint32_t cp = next(start, end);
00223             if (cp > 0xffff) { //make a surrogate pair
00224                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
00225                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00226             }
00227             else
00228                 *result++ = static_cast<uint16_t>(cp);
00229         }
00230         return result;
00231     }
00232 
// Encodes every element of [start, end) with append() and returns the output
// iterator positioned after the last octet written. Exceptions thrown by
// append() propagate to the caller.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start) {
        result = append(*start, result);
    }
    return result;
}
00241 
// Decodes the octets in [start, end) with next() and writes each resulting
// code point through `result`. Returns the iterator positioned after the last
// value written; exceptions thrown by next() propagate to the caller.
// The loop condition uses operator<, so octet_iterator has to support it.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t decoded = next(start, end);
        *result = decoded;
        ++result;
    }
    return result;
}
00250 
00251     // The iterator class
// Bidirectional iterator adaptor over a range of octets. Dereferencing yields
// the code point decoded by next() at the current position; increment and
// decrement move by whole sequences using next() and prior(). The adaptor
// remembers the bounds of the range it was created for and passes them to
// those functions.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;          // current position
    octet_iterator range_start; // lower bound handed to prior()
    octet_iterator range_end;   // upper bound handed to next()
public:
    // Value-initializes all three positions so that default-constructed
    // iterators hold determinate values.
    iterator () : it(), range_start(), range_end() {}

    // Creates an iterator at octet_it within [range_start, range_end].
    // Throws std::out_of_range when octet_it lies outside that range.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
             it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // the default "big three" are OK

    // Returns the underlying octet iterator.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    // Two iterators are comparable only when they were created for the same
    // range: BOTH bounds must match. A mismatch in either bound throws
    // std::logic_error; otherwise the current positions are compared.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    // Pre-increment: step over one sequence via next().
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: returns a copy taken before stepping.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    // Pre-decrement: step back one sequence via prior().
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: returns a copy taken before stepping.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
00307 
00308 } // namespace utf8
00309 
00310 #endif //header guard
00311 
00312 
Generated by  doxygen 1.6.2-20100208