Audacious  $Id: Doxyfile 4280 2007-03-21 04:39:00Z nenolod $
chardet.c
Go to the documentation of this file.
1 /* Audacious
2  * Copyright (C) 2005-2007 Audacious development team.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; under version 3 of the License.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program. If not, see <http://www.gnu.org/licenses>.
15  *
16  * The Audacious team does not consider modular code linking to
17  * Audacious or using our public API to be a derived work.
18  */
19 
20 #include <glib.h>
21 #include <string.h>
22 #include <libaudcore/audstrings.h>
23 
24 #include "config.h"
25 #include "debug.h"
26 #include "i18n.h"
27 #include "main.h"
28 #include "misc.h"
29 
30 #ifdef USE_CHARDET
31 # include <libguess.h>
32 #endif
33 
/* Forward declaration: cd_str_to_utf8 () calls this function before its
 * definition appears further down in the file.  The parameter names are
 * kept identical to those used by the definition. */
static char * cd_chardet_to_utf8 (const char * str, int len,
 int * arg_bytes_read, int * arg_bytes_write);
36 
37 static char * str_to_utf8_fallback (const char * str)
38 {
39  char * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL);
40 
41  for (char * c = out; * c; c ++)
42  {
43  if (* c & 0x80)
44  * c = '?';
45  }
46 
47  return out;
48 }
49 
/*
 * Produce a newly allocated UTF-8 version of str.
 *
 * Returns NULL when str is NULL.  Otherwise the result is, in order of
 * preference: a copy of str if it already validates as UTF-8, the output of
 * cd_chardet_to_utf8 (), or the '?'-masked string built by
 * str_to_utf8_fallback ().
 *
 * Historical notes on the "already UTF-8" guard:
 *
 *  1. The playlist calls this function repeatedly, even on strings that have
 *     already been converted.  With fallback encodings configured,
 *     chardet_to_utf8 () could reinterpret a valid UTF-8 string as one of
 *     those encodings and return a different string, so g_utf8_validate ()
 *     was originally placed in front of it.
 *
 *  2. g_utf8_validate () suffers from the so-called encapsulated UTF-8
 *     problem, so chardet_to_utf8 () later took its place.
 *
 *  3. With madplug, ISO-8859-1 to UTF-8 conversion became an issue.  The
 *     g_convert () call near the end of chardet_to_utf8 () can deal with it,
 *     but only behind a UTF-8 validation guard.  libguess' DFA-based
 *     validator can tell encapsulated UTF-8 apart and is considered safe to
 *     use as that guard.
 */
static char * cd_str_to_utf8 (const char * str)
{
    if (str == NULL)
        return NULL;

    const size_t length = strlen (str);

    /* Only strings that are not yet valid UTF-8 need any conversion. */
#ifdef USE_CHARDET
    if (! libguess_validate_utf8 (str, length))
#else
    if (! g_utf8_validate (str, length, NULL))
#endif
    {
        /* Try the encoding detector and its fallbacks first. */
        char * converted = cd_chardet_to_utf8 (str, length, NULL, NULL);

        if (converted != NULL)
            return converted;

        /* Nothing worked: mask character codes >= 128 with '?'. */
        return str_to_utf8_fallback (str);
    }

    return g_strdup (str);
}
95 
96 static char * cd_chardet_to_utf8 (const char * str, int len,
97  int * arg_bytes_read, int * arg_bytes_write)
98 {
99  char *ret = NULL;
100  int * bytes_read, * bytes_write;
101  int my_bytes_read, my_bytes_write;
102 
103  bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
104  bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
105 
106  g_return_val_if_fail(str != NULL, NULL);
107 
108 #ifdef USE_CHARDET
109  if (libguess_validate_utf8(str, len))
110 #else
111  if (g_utf8_validate(str, len, NULL))
112 #endif
113  {
114  if (len < 0)
115  len = strlen (str);
116 
117  ret = g_malloc (len + 1);
118  memcpy (ret, str, len);
119  ret[len] = 0;
120 
121  if (arg_bytes_read != NULL)
122  * arg_bytes_read = len;
123  if (arg_bytes_write != NULL)
124  * arg_bytes_write = len;
125 
126  return ret;
127  }
128 
129 #ifdef USE_CHARDET
130  char * det = get_string (NULL, "chardet_detector");
131 
132  if (det[0])
133  {
134  AUDDBG("guess encoding (%s) %s\n", det, str);
135  const char * encoding = libguess_determine_encoding (str, len, det);
136  AUDDBG("encoding = %s\n", encoding);
137  if (encoding)
138  {
139  gsize read_gsize = 0, written_gsize = 0;
140  ret = g_convert (str, len, "UTF-8", encoding, & read_gsize, & written_gsize, NULL);
141  * bytes_read = read_gsize;
142  * bytes_write = written_gsize;
143  }
144  }
145 
146  g_free (det);
147 #endif
148 
149  /* If detection failed or was not enabled, try fallbacks (if there are any) */
150  if (! ret)
151  {
152  char * fallbacks = get_string (NULL, "chardet_fallback");
153  char * * split = g_strsplit_set (fallbacks, " ,:;|/", -1);
154 
155  for (char * * enc = split; * enc; enc ++)
156  {
157  gsize read_gsize = 0, written_gsize = 0;
158  ret = g_convert (str, len, "UTF-8", * enc, & read_gsize, & written_gsize, NULL);
159  * bytes_read = read_gsize;
160  * bytes_write = written_gsize;
161 
162  if (len == *bytes_read)
163  break;
164  else {
165  g_free(ret);
166  ret = NULL;
167  }
168  }
169 
170  g_strfreev (split);
171  g_free (fallbacks);
172  }
173 
174  /* First fallback: locale (duh!) */
175  if (ret == NULL)
176  {
177  gsize read_gsize = 0, written_gsize = 0;
178  ret = g_locale_to_utf8 (str, len, & read_gsize, & written_gsize, NULL);
179  * bytes_read = read_gsize;
180  * bytes_write = written_gsize;
181  }
182 
183  /* The final fallback is ISO-8859-1, if no other is specified or conversions fail */
184  if (ret == NULL)
185  {
186  gsize read_gsize = 0, written_gsize = 0;
187  ret = g_convert (str, len, "UTF-8", "ISO-8859-1", & read_gsize, & written_gsize, NULL);
188  * bytes_read = read_gsize;
189  * bytes_write = written_gsize;
190  }
191 
192  if (ret != NULL)
193  {
194  if (g_utf8_validate(ret, -1, NULL))
195  return ret;
196  else
197  {
198  g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
199  g_free(ret);
200  return NULL;
201  }
202  }
203 
204  return NULL; /* If we have no idea, return NULL. */
205 }
206 
/*
 * Initialise character-set detection support.
 *
 * When USE_CHARDET is defined, libguess_determine_encoding () is called once
 * with a NULL buffer, a length of -1 and an empty detector name; the return
 * value is discarded.  NOTE(review): presumably this call only primes
 * libguess' internal state before real lookups -- confirm against libguess.
 *
 * NOTE(review): the line numbers embedded in this listing jump from 211 to
 * 213, so one source line of this function may be missing from this view --
 * verify against the original chardet.c before changing the body.
 */
207 void chardet_init (void)
208 {
209 #ifdef USE_CHARDET
210  libguess_determine_encoding(NULL, -1, "");
211 #endif
213 }