Libparserutils
codec_ext8.c
Go to the documentation of this file.
1 /*
2  * This file is part of LibParserUtils.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdlib.h>
10 #include <string.h>
11 
13 
15 #include "utils/endian.h"
16 #include "utils/utils.h"
17 
19 
20 static struct {
21  uint16_t mib;
22  const char *name;
23  size_t len;
24  uint32_t *table;
25 } known_charsets[] = {
26  { 0, "Windows-1250", SLEN("Windows-1250"), w1250 },
27  { 0, "Windows-1251", SLEN("Windows-1251"), w1251 },
28  { 0, "Windows-1252", SLEN("Windows-1252"), w1252 },
29  { 0, "Windows-1253", SLEN("Windows-1253"), w1253 },
30  { 0, "Windows-1254", SLEN("Windows-1254"), w1254 },
31  { 0, "Windows-1255", SLEN("Windows-1255"), w1255 },
32  { 0, "Windows-1256", SLEN("Windows-1256"), w1256 },
33  { 0, "Windows-1257", SLEN("Windows-1257"), w1257 },
34  { 0, "Windows-1258", SLEN("Windows-1258"), w1258 },
35 };
36 
40 typedef struct charset_ext8_codec {
43  uint32_t *table;
45 #define READ_BUFSIZE (8)
46  uint32_t read_buf[READ_BUFSIZE];
49  size_t read_len;
51 #define WRITE_BUFSIZE (8)
55  size_t write_len;
58 
59 static bool charset_ext8_codec_handles_charset(const char *charset);
60 static parserutils_error charset_ext8_codec_create(const char *charset,
66  const uint8_t **source, size_t *sourcelen,
67  uint8_t **dest, size_t *destlen);
70  const uint8_t **source, size_t *sourcelen,
71  uint8_t **dest, size_t *destlen);
76  const uint8_t **source, size_t *sourcelen,
77  uint8_t **dest, size_t *destlen);
80  uint32_t ucs4, uint8_t **dest, size_t *destlen);
82  uint32_t ucs4, uint8_t **s, size_t *len);
84  const uint8_t *s, size_t len, uint32_t *ucs4);
85 
92 bool charset_ext8_codec_handles_charset(const char *charset)
93 {
94  uint32_t i;
95  uint16_t match = parserutils_charset_mibenum_from_name(charset,
96  strlen(charset));
97 
98  if (known_charsets[0].mib == 0) {
99  for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
100  known_charsets[i].mib =
102  known_charsets[i].name,
103  known_charsets[i].len);
104  }
105  }
106 
107  for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
108  if (known_charsets[i].mib == match)
109  return true;
110  }
111 
112  return false;
113 }
114 
126 {
127  uint32_t i;
129  uint16_t match = parserutils_charset_mibenum_from_name(
130  charset, strlen(charset));
131  uint32_t *table = NULL;
132 
133  for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
134  if (known_charsets[i].mib == match) {
135  table = known_charsets[i].table;
136  break;
137  }
138  }
139 
140  assert(table != NULL);
141 
142  c = malloc(sizeof(charset_ext8_codec));
143  if (c == NULL)
144  return PARSERUTILS_NOMEM;
145 
146  c->table = table;
147 
148  c->read_buf[0] = 0;
149  c->read_len = 0;
150 
151  c->write_buf[0] = 0;
152  c->write_len = 0;
153 
154  /* Finally, populate vtable */
159 
160  *codec = (parserutils_charset_codec *) c;
161 
162  return PARSERUTILS_OK;
163 }
164 
172 {
173  UNUSED(codec);
174 
175  return PARSERUTILS_OK;
176 }
177 
206  const uint8_t **source, size_t *sourcelen,
207  uint8_t **dest, size_t *destlen)
208 {
209  charset_ext8_codec *c = (charset_ext8_codec *) codec;
210  uint32_t ucs4;
211  uint32_t *towrite;
212  size_t towritelen;
213  parserutils_error error;
214 
215  /* Process any outstanding characters from the previous call */
216  if (c->write_len > 0) {
217  uint32_t *pwrite = c->write_buf;
218 
219  while (c->write_len > 0) {
220  error = charset_ext8_from_ucs4(c, pwrite[0],
221  dest, destlen);
222  if (error != PARSERUTILS_OK) {
223  uint32_t len;
224  assert(error == PARSERUTILS_NOMEM);
225 
226  for (len = 0; len < c->write_len; len++) {
227  c->write_buf[len] = pwrite[len];
228  }
229 
230  return error;
231  }
232 
233  pwrite++;
234  c->write_len--;
235  }
236  }
237 
238  /* Now process the characters for this call */
239  while (*sourcelen > 0) {
240  ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
241  towrite = &ucs4;
242  towritelen = 1;
243 
244  /* Output current characters */
245  while (towritelen > 0) {
246  error = charset_ext8_from_ucs4(c, towrite[0], dest,
247  destlen);
248  if (error != PARSERUTILS_OK) {
249  uint32_t len;
250  if (error != PARSERUTILS_NOMEM) {
251  return error;
252  }
253 
254  /* Insufficient output space */
255  assert(towritelen < WRITE_BUFSIZE);
256 
257  c->write_len = towritelen;
258 
259  /* Copy pending chars to save area, for
260  * processing next call. */
261  for (len = 0; len < towritelen; len++)
262  c->write_buf[len] = towrite[len];
263 
264  /* Claim character we've just buffered,
265  * so it's not reprocessed */
266  *source += 4;
267  *sourcelen -= 4;
268 
269  return PARSERUTILS_NOMEM;
270  }
271 
272  towrite++;
273  towritelen--;
274  }
275 
276  *source += 4;
277  *sourcelen -= 4;
278  }
279 
280  return PARSERUTILS_OK;
281 }
282 
325  const uint8_t **source, size_t *sourcelen,
326  uint8_t **dest, size_t *destlen)
327 {
328  charset_ext8_codec *c = (charset_ext8_codec *) codec;
329  parserutils_error error;
330 
331  if (c->read_len > 0) {
332  /* Output left over from last decode */
333  uint32_t *pread = c->read_buf;
334 
335  while (c->read_len > 0 && *destlen >= c->read_len * 4) {
336  *((uint32_t *) (void *) *dest) =
337  endian_host_to_big(pread[0]);
338 
339  *dest += 4;
340  *destlen -= 4;
341 
342  pread++;
343  c->read_len--;
344  }
345 
346  if (*destlen < c->read_len * 4) {
347  /* Ran out of output buffer */
348  size_t i;
349 
350  /* Shuffle remaining output down */
351  for (i = 0; i < c->read_len; i++)
352  c->read_buf[i] = pread[i];
353 
354  return PARSERUTILS_NOMEM;
355  }
356  }
357 
358  /* Finally, the "normal" case; process all outstanding characters */
359  while (*sourcelen > 0) {
361  source, sourcelen, dest, destlen);
362  if (error != PARSERUTILS_OK) {
363  return error;
364  }
365  }
366 
367  return PARSERUTILS_OK;
368 }
369 
377 {
378  charset_ext8_codec *c = (charset_ext8_codec *) codec;
379 
380  c->read_buf[0] = 0;
381  c->read_len = 0;
382 
383  c->write_buf[0] = 0;
384  c->write_len = 0;
385 
386  return PARSERUTILS_OK;
387 }
388 
389 
419  const uint8_t **source, size_t *sourcelen,
420  uint8_t **dest, size_t *destlen)
421 {
422  uint32_t ucs4;
423  parserutils_error error;
424 
425  /* Convert a single character */
426  error = charset_ext8_to_ucs4(c, *source, *sourcelen, &ucs4);
427  if (error == PARSERUTILS_OK) {
428  /* Read a character */
430  ucs4, dest, destlen);
431  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
432  /* output succeeded; update source pointers */
433  *source += 1;
434  *sourcelen -= 1;
435  }
436 
437  return error;
438  } else if (error == PARSERUTILS_NEEDDATA) {
439  /* Can only happen if sourcelen == 0 */
440  return error;
441  } else if (error == PARSERUTILS_INVALID) {
442  /* Illegal input sequence */
443 
444  /* Strict errormode; simply flag invalid character */
445  if (c->base.errormode ==
447  return PARSERUTILS_INVALID;
448  }
449 
450  /* output U+FFFD and continue processing. */
452  0xFFFD, dest, destlen);
453  if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
454  /* output succeeded; update source pointers */
455  *source += 1;
456  *sourcelen -= 1;
457  }
458 
459  return error;
460  }
461 
462  return PARSERUTILS_OK;
463 }
464 
476  uint32_t ucs4, uint8_t **dest, size_t *destlen)
477 {
478  if (*destlen < 4) {
479  /* Run out of output buffer */
480  c->read_len = 1;
481  c->read_buf[0] = ucs4;
482 
483  return PARSERUTILS_NOMEM;
484  }
485 
486  *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
487  *dest += 4;
488  *destlen -= 4;
489 
490  return PARSERUTILS_OK;
491 }
492 
510  uint32_t ucs4, uint8_t **s, size_t *len)
511 {
512  uint8_t out = 0;
513 
514  if (*len < 1)
515  return PARSERUTILS_NOMEM;
516 
517  if (ucs4 < 0x80) {
518  /* ASCII */
519  out = ucs4;
520  } else {
521  uint32_t i;
522 
523  for (i = 0; i < 128; i++) {
524  if (ucs4 == c->table[i])
525  break;
526  }
527 
528  if (i == 128) {
529  if (c->base.errormode ==
531  return PARSERUTILS_INVALID;
532  else
533  out = '?';
534  } else {
535  out = 0x80 + i;
536  }
537  }
538 
539  *(*s) = out;
540  (*s)++;
541  (*len)--;
542 
543  return PARSERUTILS_OK;
544 }
545 
558  const uint8_t *s, size_t len, uint32_t *ucs4)
559 {
560  uint32_t out;
561 
562  if (len < 1)
563  return PARSERUTILS_NEEDDATA;
564 
565  if (*s < 0x80) {
566  out = *s;
567  } else {
568  if (c->table[*s - 0x80] == 0xFFFF)
569  return PARSERUTILS_INVALID;
570 
571  out = c->table[*s - 0x80];
572  }
573 
574  *ucs4 = out;
575 
576  return PARSERUTILS_OK;
577 }
578 
582 };
583 
charset_ext8_codec_handles_charset
static bool charset_ext8_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition: codec_ext8.c:92
w1254
static uint32_t w1254[128]
Definition: ext8_tables.h:92
charset_ext8_codec_decode
static parserutils_error charset_ext8_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of extended 8bit data into UCS-4 (big endian)
Definition: codec_ext8.c:324
parserutils_charset_codec::encode
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:26
w1257
static uint32_t w1257[128]
Definition: ext8_tables.h:149
known_charsets
static struct @2 known_charsets[]
name
const char * name
Definition: codec_ext8.c:22
charset_ext8_codec::table
uint32_t * table
Mapping table for 0x80-0xFF.
Definition: codec_ext8.c:43
parserutils_charset_codec::handler
struct parserutils_charset_codec::@3 handler
Vtable for handler code.
charset_ext8_codec::write_buf
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition: codec_ext8.c:52
parserutils_charset_codec::destroy
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition: codec_impl.h:25
PARSERUTILS_OK
@ PARSERUTILS_OK
Definition: errors.h:19
charset_ext8_from_ucs4
static parserutils_error charset_ext8_from_ucs4(charset_ext8_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
Convert a UCS4 (host endian) character to extended 8bit.
Definition: codec_ext8.c:509
w1252
static uint32_t w1252[128]
Definition: ext8_tables.h:54
charset_ext8_codec
Windows charset codec.
Definition: codec_ext8.c:40
PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition: codec.h:64
READ_BUFSIZE
#define READ_BUFSIZE
Definition: codec_ext8.c:45
parserutils_charset_codec
Core charset codec definition; implementations extend this.
Definition: codec_impl.h:19
utils.h
charset_ext8_codec_output_decoded_char
static parserutils_error charset_ext8_codec_output_decoded_char(charset_ext8_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition: codec_ext8.c:475
w1256
static uint32_t w1256[128]
Definition: ext8_tables.h:130
charset_ext8_codec::read_len
size_t read_len
Character length of read_buf.
Definition: codec_ext8.c:49
w1258
static uint32_t w1258[128]
Definition: ext8_tables.h:168
charset_ext8_codec::read_buf
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition: codec_ext8.c:46
w1255
static uint32_t w1255[128]
Definition: ext8_tables.h:111
charset_ext8_codec_create
static parserutils_error charset_ext8_codec_create(const char *charset, parserutils_charset_codec **codec)
Create an extended 8bit codec.
Definition: codec_ext8.c:124
w1250
static uint32_t w1250[128]
Definition: ext8_tables.h:16
charset_ext8_codec_reset
static parserutils_error charset_ext8_codec_reset(parserutils_charset_codec *codec)
Clear an extended 8bit codec's encoding state.
Definition: codec_ext8.c:376
charset_ext8_codec_read_char
static parserutils_error charset_ext8_codec_read_char(charset_ext8_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from the extended 8bit to UCS-4 (big endian)
Definition: codec_ext8.c:418
charset_ext8_to_ucs4
static parserutils_error charset_ext8_to_ucs4(charset_ext8_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
Convert an extended 8bit character to UCS4 (host endian)
Definition: codec_ext8.c:557
parserutils_charset_codec::reset
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition: codec_impl.h:32
table
uint32_t * table
Definition: codec_ext8.c:24
len
size_t len
Definition: codec_ext8.c:23
PARSERUTILS_INVALID
@ PARSERUTILS_INVALID
Definition: errors.h:23
parserutils_charset_codec::errormode
parserutils_charset_codec_errormode errormode
error mode
Definition: codec_impl.h:22
charset_ext8_codec_encode
static parserutils_error charset_ext8_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into extended 8bit.
Definition: codec_ext8.c:205
codec_impl.h
N_ELEMENTS
#define N_ELEMENTS(s)
Definition: utils.h:29
endian_big_to_host
static uint32_t endian_big_to_host(uint32_t big)
Definition: endian.h:32
parserutils_error
parserutils_error
Definition: errors.h:18
PARSERUTILS_NEEDDATA
@ PARSERUTILS_NEEDDATA
Definition: errors.h:25
SLEN
#define SLEN(s)
Definition: utils.h:21
parserutils_charset_handler
Codec factory component definition.
Definition: codec_impl.h:39
charset_ext8_codec::base
parserutils_charset_codec base
Base class.
Definition: codec_ext8.c:41
PARSERUTILS_NOMEM
@ PARSERUTILS_NOMEM
Definition: errors.h:21
mib
uint16_t mib
Definition: codec_ext8.c:21
ext8_tables.h
charset_ext8_codec::write_len
size_t write_len
Character length of write_buf.
Definition: codec_ext8.c:55
parserutils_charset_codec::decode
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition: codec_impl.h:29
charset_ext8_codec
struct charset_ext8_codec charset_ext8_codec
Windows charset codec.
mibenum.h
parserutils_charset_mibenum_from_name
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition: aliases.c:107
w1253
static uint32_t w1253[128]
Definition: ext8_tables.h:73
WRITE_BUFSIZE
#define WRITE_BUFSIZE
Definition: codec_ext8.c:51
charset_ext8_codec_handler
const parserutils_charset_handler charset_ext8_codec_handler
Definition: codec_ext8.c:579
endian_host_to_big
static uint32_t endian_host_to_big(uint32_t host)
Definition: endian.h:24
charset_ext8_codec_destroy
static parserutils_error charset_ext8_codec_destroy(parserutils_charset_codec *codec)
Destroy an extended 8bit codec.
Definition: codec_ext8.c:171
w1251
static uint32_t w1251[128]
Definition: ext8_tables.h:35
UNUSED
#define UNUSED(x)
Definition: utils.h:25
endian.h