SphinxBase 0.6
|
00001 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> */ 00002 /* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */ 00003 00004 /* Slightly modified to use Sphinx types and remove explicit inline. */ 00005 00006 #include "sphinxbase/prim_type.h" 00007 00008 #define UTF8_ACCEPT 0 00009 #define UTF8_REJECT 1 00010 00011 static const uint8 utf8d[] = { 00012 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 00013 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 00014 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f 00015 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f 00016 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f 00017 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf 00018 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df 00019 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef 00020 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff 00021 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 00022 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 00023 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 00024 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 00025 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 00026 }; 00027 00028 uint32 00029 utf8_decode(uint32 *state, uint32 *codep, uint32 byte) { 00030 uint32 type = utf8d[byte]; 00031 00032 *codep = (*state != UTF8_ACCEPT) ? 00033 (byte & 0x3fu) | (*codep << 6) : 00034 (0xff >> type) & (byte); 00035 00036 *state = utf8d[256 + *state*16 + type]; 00037 return *state; 00038 } 00039 00040 /* CMU code starts here. */ 00041 /* ==================================================================== 00042 * Copyright (c) 2009 Carnegie Mellon University. All rights 00043 * reserved. 00044 * 00045 * Redistribution and use in source and binary forms, with or without 00046 * modification, are permitted provided that the following conditions 00047 * are met: 00048 * 00049 * 1. Redistributions of source code must retain the above copyright 00050 * notice, this list of conditions and the following disclaimer. 00051 * 00052 * 2. Redistributions in binary form must reproduce the above copyright 00053 * notice, this list of conditions and the following disclaimer in 00054 * the documentation and/or other materials provided with the 00055 * distribution. 00056 * 00057 * This work was supported in part by funding from the Defense Advanced 00058 * Research Projects Agency and the National Science Foundation of the 00059 * United States of America, and the CMU Sphinx Speech Consortium. 00060 * 00061 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00062 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00063 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00064 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00065 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00066 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00067 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00068 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00069 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00070 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00071 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00072 * 00073 * ==================================================================== 00074 * 00075 */