00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030
00031 #include "core.h"
00032 #include <stdexcept>
00033
00034 namespace utf8
00035 {
00036
// Exception reporting a 32-bit value that was rejected as a code point.
// The rejected value is stored so that a handler can retrieve it.
class invalid_code_point : public std::exception {
public:
    // Stores the rejected value.
    invalid_code_point(uint32_t code_point_value) : rejected(code_point_value) {}

    // Fixed description of the error; declared non-throwing.
    virtual const char* what() const throw()
    {
        return "Invalid code point";
    }

    // Returns the value passed to the constructor.
    uint32_t code_point() const
    {
        return rejected;
    }

private:
    uint32_t rejected;
};
00044
// Exception reporting an octet that could not be accepted as part of a
// UTF-8 sequence. The octet is stored so that a handler can retrieve it.
class invalid_utf8 : public std::exception {
public:
    // Stores the offending octet.
    invalid_utf8 (uint8_t octet) : bad_octet(octet) {}

    // Fixed description of the error; declared non-throwing.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-8";
    }

    // Returns the octet passed to the constructor.
    uint8_t utf8_octet() const
    {
        return bad_octet;
    }

private:
    uint8_t bad_octet;
};
00052
// Exception reporting a 16-bit unit that could not be accepted as part of a
// UTF-16 sequence. The unit is stored so that a handler can retrieve it.
class invalid_utf16 : public std::exception {
public:
    // Stores the offending 16-bit unit.
    invalid_utf16 (uint16_t word) : bad_word(word) {}

    // Fixed description of the error; declared non-throwing.
    virtual const char* what() const throw()
    {
        return "Invalid UTF-16";
    }

    // Returns the 16-bit unit passed to the constructor.
    uint16_t utf16_word() const
    {
        return bad_word;
    }

private:
    uint16_t bad_word;
};
00060
// Exception reporting that a sequence ran out of input before it could be
// completely processed. It carries no additional data.
class not_enough_room : public std::exception {
public:
    // Fixed description of the error; declared non-throwing.
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
00065
00067
00068 template <typename octet_iterator, typename output_iterator>
00069 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
00070 {
00071 while (start != end) {
00072 octet_iterator sequence_start = start;
00073 internal::utf_error err_code = internal::validate_next(start, end);
00074 switch (err_code) {
00075 case internal::OK :
00076 for (octet_iterator it = sequence_start; it != start; ++it)
00077 *out++ = *it;
00078 break;
00079 case internal::NOT_ENOUGH_ROOM:
00080 throw not_enough_room();
00081 case internal::INVALID_LEAD:
00082 append (replacement, out);
00083 ++start;
00084 break;
00085 case internal::INCOMPLETE_SEQUENCE:
00086 case internal::OVERLONG_SEQUENCE:
00087 case internal::INVALID_CODE_POINT:
00088 append (replacement, out);
00089 ++start;
00090
00091 while (internal::is_trail(*start) && start != end)
00092 ++start;
00093 break;
00094 }
00095 }
00096 return out;
00097 }
00098
00099 template <typename octet_iterator, typename output_iterator>
00100 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
00101 {
00102 static const uint32_t replacement_marker = internal::mask16(0xfffd);
00103 return replace_invalid(start, end, out, replacement_marker);
00104 }
00105
00106 template <typename octet_iterator>
00107 octet_iterator append(uint32_t cp, octet_iterator result)
00108 {
00109 if (!internal::is_code_point_valid(cp))
00110 throw invalid_code_point(cp);
00111
00112 if (cp < 0x80)
00113 *(result++) = static_cast<uint8_t>(cp);
00114 else if (cp < 0x800) {
00115 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
00116 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00117 }
00118 else if (cp < 0x10000) {
00119 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
00120 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
00121 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00122 }
00123 else if (cp <= internal::CODE_POINT_MAX) {
00124 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
00125 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
00126 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
00127 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
00128 }
00129 else
00130 throw invalid_code_point(cp);
00131
00132 return result;
00133 }
00134
00135 template <typename octet_iterator>
00136 uint32_t next(octet_iterator& it, octet_iterator end)
00137 {
00138 uint32_t cp = 0;
00139 internal::utf_error err_code = internal::validate_next(it, end, &cp);
00140 switch (err_code) {
00141 case internal::OK :
00142 break;
00143 case internal::NOT_ENOUGH_ROOM :
00144 throw not_enough_room();
00145 case internal::INVALID_LEAD :
00146 case internal::INCOMPLETE_SEQUENCE :
00147 case internal::OVERLONG_SEQUENCE :
00148 throw invalid_utf8(*it);
00149 case internal::INVALID_CODE_POINT :
00150 throw invalid_code_point(cp);
00151 }
00152 return cp;
00153 }
00154
00155 template <typename octet_iterator>
00156 uint32_t prior(octet_iterator& it, octet_iterator start)
00157 {
00158 octet_iterator end = it;
00159 while (internal::is_trail(*(--it)))
00160 if (it < start)
00161 throw invalid_utf8(*it);
00162 octet_iterator temp = it;
00163 return next(temp, end);
00164 }
00165
00167 template <typename octet_iterator>
00168 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
00169 {
00170 octet_iterator end = it;
00171 while (internal::is_trail(*(--it)))
00172 if (it == pass_start)
00173 throw invalid_utf8(*it);
00174 octet_iterator temp = it;
00175 return next(temp, end);
00176 }
00177
// Moves `it` forward by `n` code points by calling next() `n` times.
//
// it  - iterator to advance, updated in place.
// n   - number of code points to skip; nothing happens unless 0 < n.
// end - end of the octet range, forwarded to next().
//
// Any exception thrown by next() propagates to the caller.
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n, octet_iterator end)
{
    distance_type skipped = 0;
    while (skipped < n) {
        next(it, end);
        ++skipped;
    }
}
00184
// Counts the code points encoded in [first, last) by decoding them one after
// another with next().
//
// first, last - octet range; the loop runs while first < last.
//
// Returns the number of code points decoded.
// Any exception thrown by next() propagates to the caller.
template <typename octet_iterator>
typename std::iterator_traits<octet_iterator>::difference_type
distance (octet_iterator first, octet_iterator last)
{
    typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;

    count_type code_points = 0;
    while (first < last) {
        next(first, last);
        ++code_points;
    }
    return code_points;
}
00194
00195 template <typename u16bit_iterator, typename octet_iterator>
00196 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
00197 {
00198 while (start != end) {
00199 uint32_t cp = internal::mask16(*start++);
00200
00201 if (internal::is_surrogate(cp)) {
00202 if (start != end) {
00203 uint32_t trail_surrogate = internal::mask16(*start++);
00204 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
00205 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
00206 else
00207 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
00208 }
00209 else
00210 throw invalid_utf16(static_cast<uint16_t>(*start));
00211
00212 }
00213 result = append(cp, result);
00214 }
00215 return result;
00216 }
00217
00218 template <typename u16bit_iterator, typename octet_iterator>
00219 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
00220 {
00221 while (start != end) {
00222 uint32_t cp = next(start, end);
00223 if (cp > 0xffff) {
00224 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
00225 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
00226 }
00227 else
00228 *result++ = static_cast<uint16_t>(cp);
00229 }
00230 return result;
00231 }
00232
// Converts the 32-bit code points in [start, end) to UTF-8 by passing each
// one to append().
//
// start, end - input range of code points.
// result     - output iterator receiving the UTF-8 octets.
//
// Returns the output iterator positioned after the last octet written.
// Any exception thrown by append() propagates to the caller.
template <typename octet_iterator, typename u32bit_iterator>
octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
{
    for (; start != end; ++start)
        result = append(*start, result);

    return result;
}
00241
// Converts the UTF-8 octets in [start, end) to 32-bit code points and writes
// them through `result`.
//
// start, end - input range of octets, decoded with next(); the loop runs
//              while start < end.
// result     - output iterator receiving the code points.
//
// Returns the output iterator positioned after the last code point written.
// Any exception thrown by next() propagates to the caller.
template <typename octet_iterator, typename u32bit_iterator>
u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
{
    while (start < end) {
        const uint32_t code_point = next(start, end);
        *result = code_point;
        ++result;
    }

    return result;
}
00250
00251
// Bidirectional iterator adaptor that walks a range of octets one code point
// at a time. It keeps the current position together with the bounds of the
// range so that decoding never has to be asked to go beyond them.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;          // current position
    octet_iterator range_start; // lower bound handed to prior()
    octet_iterator range_end;   // upper bound handed to next()
    public:
    // Value-initializes all three positions so that a default-constructed
    // iterator holds determinate values.
    iterator () : it(), range_start(), range_end() {}

    // octet_it     - initial position.
    // first_octet  - beginning of the octet range.
    // past_last    - end of the octet range.
    // Throws std::out_of_range unless first_octet <= octet_it <= past_last.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& first_octet,
                       const octet_iterator& past_last) :
        it(octet_it), range_start(first_octet), range_end(past_last)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // Returns the underlying octet position.
    octet_iterator base () const { return it; }

    // Decodes the code point at the current position without moving.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    // Two iterators are comparable only when they were created over the same
    // range, i.e. when both bounds agree. A mismatch in either bound is
    // reported with std::logic_error.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }
    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    // Moves forward by one code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    // Moves back by one code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
};
00307
00308 }
00309
00310 #endif //header guard
00311
00312