28 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
// Exception reporting a value that is not acceptable as a Unicode code point.
// The offending value is stored and can be retrieved with code_point().
class invalid_code_point : public std::exception {
    uint32_t cp;   // the rejected code point value
public:
    // explicit: an integer must never silently convert into an exception object.
    // The parameter is named differently from the member to avoid shadowing.
    explicit invalid_code_point(uint32_t code_point_value) : cp(code_point_value) {}

    // Fixed, human-readable description of the error category.
    virtual const char* what() const throw() { return "Invalid code point"; }

    // Returns the code point value passed to the constructor.
    uint32_t code_point() const { return cp; }
};
// Exception reporting an octet that does not fit a well-formed UTF-8 sequence.
// The offending octet is stored and can be retrieved with utf8_octet().
class invalid_utf8 : public std::exception {
    uint8_t u8;   // the rejected octet
public:
    // explicit: an integer must never silently convert into an exception object.
    explicit invalid_utf8(uint8_t u) : u8(u) {}

    // Fixed, human-readable description of the error category.
    virtual const char* what() const throw() { return "Invalid UTF-8"; }

    // Returns the octet passed to the constructor.
    uint8_t utf8_octet() const { return u8; }
};
// Exception reporting a 16-bit unit that does not fit a well-formed UTF-16 sequence.
// The offending unit is stored and can be retrieved with utf16_word().
class invalid_utf16 : public std::exception {
    uint16_t u16;   // the rejected 16-bit unit
public:
    // explicit: an integer must never silently convert into an exception object.
    explicit invalid_utf16(uint16_t u) : u16(u) {}

    // Fixed, human-readable description of the error category.
    virtual const char* what() const throw() { return "Invalid UTF-16"; }

    // Returns the 16-bit unit passed to the constructor.
    uint16_t utf16_word() const { return u16; }
};
// Exception signalling that the input range ended before a complete
// sequence could be read. Carries no payload beyond its message.
class not_enough_room : public std::exception {
public:
    virtual const char* what() const throw()
    {
        return "Not enough space";
    }
};
68 template <
typename octet_iterator,
typename output_iterator>
69 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
71 while (start != end) {
72 octet_iterator sequence_start = start;
73 internal::utf_error err_code = internal::validate_next(start, end);
76 for (octet_iterator it = sequence_start; it != start; ++it)
79 case internal::NOT_ENOUGH_ROOM:
80 throw not_enough_room();
81 case internal::INVALID_LEAD:
82 append (replacement, out);
85 case internal::INCOMPLETE_SEQUENCE:
86 case internal::OVERLONG_SEQUENCE:
87 case internal::INVALID_CODE_POINT:
88 append (replacement, out);
91 while (internal::is_trail(*start) && start != end)
99 template <
typename octet_iterator,
typename output_iterator>
100 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
102 static const uint32_t replacement_marker = internal::mask16(0xfffd);
103 return replace_invalid(start, end, out, replacement_marker);
106 template <
typename octet_iterator>
107 octet_iterator append(uint32_t cp, octet_iterator result)
109 if (!internal::is_code_point_valid(cp))
110 throw invalid_code_point(cp);
113 *(result++) = static_cast<uint8_t>(cp);
114 else if (cp < 0x800) {
115 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
116 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
118 else if (cp < 0x10000) {
119 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
120 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
121 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
123 else if (cp <= internal::CODE_POINT_MAX) {
124 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
125 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80);
126 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80);
127 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
130 throw invalid_code_point(cp);
135 template <
typename octet_iterator>
136 uint32_t next(octet_iterator& it, octet_iterator end)
139 internal::utf_error err_code = internal::validate_next(it, end, &cp);
143 case internal::NOT_ENOUGH_ROOM :
144 throw not_enough_room();
145 case internal::INVALID_LEAD :
146 case internal::INCOMPLETE_SEQUENCE :
147 case internal::OVERLONG_SEQUENCE :
148 throw invalid_utf8(*it);
149 case internal::INVALID_CODE_POINT :
150 throw invalid_code_point(cp);
155 template <
typename octet_iterator>
156 uint32_t prior(octet_iterator& it, octet_iterator start)
158 octet_iterator end = it;
159 while (internal::is_trail(*(--it)))
161 throw invalid_utf8(*it);
162 octet_iterator temp = it;
163 return next(temp, end);
167 template <
typename octet_iterator>
168 uint32_t previous(octet_iterator& it, octet_iterator pass_start)
170 octet_iterator end = it;
171 while (internal::is_trail(*(--it)))
172 if (it == pass_start)
173 throw invalid_utf8(*it);
174 octet_iterator temp = it;
175 return next(temp, end);
// Steps `it` forward through the octet sequence bounded by `end`, running the
// loop below `n` times.
//   it   - in/out iterator, taken by reference
//   n    - number of loop iterations to perform
//   end  - end of the octet range
// NOTE(review): the statement executed on each iteration is not shown here;
// presumably it calls next(it, end) so that each step consumes one code
// point - confirm.
178 template <
typename octet_iterator,
typename distance_type>
179 void advance (octet_iterator& it, distance_type n, octet_iterator end)
181 for (distance_type i = 0; i < n; ++i)
// Counts loop steps taken while walking from `first` towards `last`; the
// counter has type std::iterator_traits<octet_iterator>::difference_type.
// The loop condition is `first < last`, so octet_iterator must support
// operator< (for example pointers or random-access iterators).
// NOTE(review): the loop body and the return statement are not shown here;
// presumably each step calls next(first, last) and the final count is
// returned - confirm.
185 template <
typename octet_iterator>
186 typename std::iterator_traits<octet_iterator>::difference_type
187 distance (octet_iterator first, octet_iterator last)
189 typename std::iterator_traits<octet_iterator>::difference_type dist;
// dist is assigned 0 by the loop initialiser below before it is first read.
190 for (dist = 0; first < last; ++dist)
195 template <
typename u16bit_iterator,
typename octet_iterator>
196 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
198 while (start != end) {
199 uint32_t cp = internal::mask16(*start++);
201 if (internal::is_surrogate(cp)) {
203 uint32_t trail_surrogate = internal::mask16(*start++);
204 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX)
205 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
207 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
210 throw invalid_utf16(static_cast<uint16_t>(*start));
213 result = append(cp, result);
// Converts the UTF-8 text in [start, end) to 16-bit units written through
// `result`. Each code point is decoded with next(), so the exceptions thrown
// by next() propagate to the caller.
218 template <
typename u16bit_iterator,
typename octet_iterator>
219 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
221 while (start != end) {
// next() advances `start` past the sequence it decodes.
222 uint32_t cp = next(start, end);
// Two-unit form: the first unit is built from the bits above the low ten
// plus internal::LEAD_OFFSET, the second from the low ten bits plus
// internal::TRAIL_SURROGATE_MIN.
// NOTE(review): the condition selecting this branch is not shown here;
// presumably it is taken when cp does not fit in 16 bits - confirm.
224 *result++ =
static_cast<uint16_t
>((cp >> 10) + internal::LEAD_OFFSET);
225 *result++ =
static_cast<uint16_t
>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
// Single-unit form: the code point is written as one 16-bit value.
228 *result++ =
static_cast<uint16_t
>(cp);
// Converts 32-bit code points read from `start` to UTF-8 written through
// `result`. Encoding is delegated to append(), so the exceptions thrown by
// append() propagate to the caller.
// NOTE(review): the enclosing loop and the return statement are not shown
// here; presumably this repeats until start == end and returns `result` -
// confirm.
233 template <
typename octet_iterator,
typename u32bit_iterator>
234 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
// Encode one code point, then continue from the iterator append() returns.
237 result = append(*(start++), result);
// Converts UTF-8 text read from `start` to code points written through
// `result`. Decoding is delegated to next(), so the exceptions thrown by
// next() propagate to the caller.
// NOTE(review): the enclosing loop and the return statement are not shown
// here; presumably this repeats until start == end and returns `result` -
// confirm.
242 template <
typename octet_iterator,
typename u32bit_iterator>
243 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
// next() takes `start` by reference and advances it past the decoded sequence.
246 (*result++) = next(start, end);
// Bidirectional iterator adaptor that walks a range of UTF-8 octets one code
// point at a time. Dereferencing yields the code point at the current
// position as a uint32_t. Increment and decrement move by whole encoded
// sequences using next() and prior(), so their exceptions propagate.
template <typename octet_iterator>
class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
    octet_iterator it;            // current position
    octet_iterator range_start;   // first octet of the underlying range
    octet_iterator range_end;     // one past the last octet of the range
public:
    // Default constructor. Members are value-initialised so that pointer-like
    // octet iterators do not hold indeterminate values.
    iterator () : it(), range_start(), range_end() {}

    // Creates an iterator positioned at `octet_it` within
    // [`range_start`, `range_end`].
    // Throws std::out_of_range if `octet_it` lies outside that range.
    explicit iterator (const octet_iterator& octet_it,
                       const octet_iterator& range_start,
                       const octet_iterator& range_end) :
        it(octet_it), range_start(range_start), range_end(range_end)
    {
        if (it < range_start || it > range_end)
            throw std::out_of_range("Invalid utf-8 iterator position");
    }

    // Returns the underlying octet iterator at the current position.
    octet_iterator base () const { return it; }

    // Decodes and returns the code point at the current position without
    // moving the iterator: a copy of the position is advanced instead.
    uint32_t operator * () const
    {
        octet_iterator temp = it;
        return next(temp, range_end);
    }

    // Compares positions. Both iterators must be defined over the same range.
    // Throws std::logic_error if EITHER bound differs; a difference in only
    // one bound already means the ranges are not the same.
    bool operator == (const iterator& rhs) const
    {
        if (range_start != rhs.range_start || range_end != rhs.range_end)
            throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
        return (it == rhs.it);
    }

    // Negation of operator==, including its range check.
    bool operator != (const iterator& rhs) const
    {
        return !(operator == (rhs));
    }

    // Pre-increment: advance past one code point.
    iterator& operator ++ ()
    {
        next(it, range_end);
        return *this;
    }

    // Post-increment: advance past one code point, return the old position.
    iterator operator ++ (int)
    {
        iterator temp = *this;
        next(it, range_end);
        return temp;
    }

    // Pre-decrement: step back to the start of the preceding code point.
    iterator& operator -- ()
    {
        prior(it, range_start);
        return *this;
    }

    // Post-decrement: step back one code point, return the old position.
    iterator operator -- (int)
    {
        iterator temp = *this;
        prior(it, range_start);
        return temp;
    }
}; // class iterator
310 #endif //header guard