liblcf
reader_util.cpp
Go to the documentation of this file.
1 /*
2  * This file is part of liblcf. Copyright (c) liblcf authors.
3  * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4  *
5  * liblcf is Free/Libre Open Source Software, released under the MIT License.
6  * For the full copyright and license information, please view the COPYING
7  * file that was distributed with this source code.
8  */
9 
10 #include "lcf/config.h"
11 #include "lcf/scope_guard.h"
12 
13 #if LCF_SUPPORT_ICU
14 # include <unicode/ucsdet.h>
15 # include <unicode/ucnv.h>
16 # include <unicode/normalizer2.h>
17 # include <unicode/unistr.h>
18 # include <unicode/locid.h>
19 #else
20 # ifdef _MSC_VER
21 # error MSVC builds require ICU
22 # endif
23 #endif
24 
25 #ifdef _WIN32
26 # include <windows.h>
27 #else
28 # if !LCF_SUPPORT_ICU
29 # include <iconv.h>
30 # endif
31 # include <locale>
32 #endif
33 
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
39 
40 #include "lcf/inireader.h"
41 #include "lcf/ldb/reader.h"
42 #include "lcf/reader_util.h"
43 
44 namespace lcf {
45 
46 namespace ReaderUtil {
47 }
48 
49 std::string ReaderUtil::CodepageToEncoding(int codepage) {
50  if (codepage == 0)
51  return std::string();
52 
53  if (codepage == 932) {
54 #if LCF_SUPPORT_ICU
55  return "ibm-943_P15A-2003";
56 #else
57  return "SHIFT_JIS";
58 #endif
59  }
60  if (codepage == 949) {
61 #if LCF_SUPPORT_ICU
62  return "windows-949-2000";
63 #else
64  return "cp949";
65 #endif
66  }
67  std::ostringstream out;
68 #if LCF_SUPPORT_ICU
69  out << "windows-" << codepage;
70 #else
71  out << "CP" << codepage;
72 #endif
73 
74  // Looks like a valid codepage
75  std::string outs = out.str();
76  return outs;
77 }
78 
79 std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
80  std::vector<std::string> encodings = DetectEncodings(db);
81 
82  if (encodings.empty()) {
83  return "";
84  }
85 
86  return encodings.front();
87 }
88 
89 std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
90 #if LCF_SUPPORT_ICU
91  std::ostringstream text;
92 
93  auto append = [](const auto& s) {
94  return ToString(s) + " ";
95  };
96 
97  lcf::rpg::ForEachString(db.system, [&](const auto& val, const auto& ctx) {
98  text << append(val);
99  });
100 
101  // Cannot use ForEachString here for Terms:
102  // Too much untranslated garbage data in there, even in default database
103  for (const auto& s: {
104  db.terms.menu_save,
105  db.terms.menu_quit,
106  db.terms.new_game,
107  db.terms.load_game,
108  db.terms.exit_game,
109  db.terms.status,
110  db.terms.row,
111  db.terms.order,
112  db.terms.wait_on,
113  db.terms.wait_off,
114  db.terms.level,
115  db.terms.health_points,
116  db.terms.spirit_points,
117  db.terms.normal_status,
118  db.terms.sp_cost,
119  db.terms.attack,
120  db.terms.defense,
121  db.terms.spirit,
122  db.terms.agility,
123  db.terms.weapon,
124  db.terms.shield,
125  db.terms.armor,
126  db.terms.helmet,
127  db.terms.accessory,
128  db.terms.save_game_message,
129  db.terms.load_game_message,
130  db.terms.exit_game_message,
131  db.terms.file,
132  db.terms.yes,
133  db.terms.no
134  }) {
135  text << append(s);
136  }
137 
138  return ReaderUtil::DetectEncodings(text.str());
139 #else
140  return std::vector<std::string>();
141 #endif
142 }
143 
144 std::string ReaderUtil::DetectEncoding(StringView string) {
145  std::vector<std::string> encodings = DetectEncodings(string);
146 
147  if (encodings.empty()) {
148  return "";
149  }
150 
151  return encodings.front();
152 }
153 
154 std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
155 std::vector<std::string> encodings;
156 #if LCF_SUPPORT_ICU
157  if (!string.empty()) {
158  UErrorCode status = U_ZERO_ERROR;
159  UCharsetDetector* detector = ucsdet_open(&status);
160 
161  auto s = std::string(string);
162  ucsdet_setText(detector, s.c_str(), s.length(), &status);
163 
164  int32_t matches_count;
165  const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
166 
167  if (matches != nullptr) {
168  // Collect all candidates, most confident comes first
169  for (int i = 0; i < matches_count; ++i) {
170  std::string encoding = ucsdet_getName(matches[i], &status);
171 
172  // Fixes to ensure proper Windows encodings
173  if (encoding == "Shift_JIS") {
174  encodings.emplace_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
175  } else if (encoding == "EUC-KR") {
176  encodings.emplace_back("windows-949-2000"); // Korean with \ as backlash
177  } else if (encoding == "GB18030") {
178  encodings.emplace_back("windows-936-2000"); // Simplified Chinese
179  } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
180  encodings.emplace_back("ibm-5348_P100-1997"); // Occidental with Euro
181  } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
182  encodings.emplace_back("ibm-5346_P100-1998"); // Central Europe with Euro
183  } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
184  encodings.emplace_back("ibm-5347_P100-1998"); // Cyrillic with Euro
185  } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
186  encodings.emplace_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
187  } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
188  encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
189  } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
190  encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
191  } else {
192  encodings.push_back(encoding);
193  }
194  }
195  }
196  ucsdet_close(detector);
197  }
198 #endif
199 
200  return encodings;
201 }
202 
203 std::string ReaderUtil::GetEncoding(StringView ini_file) {
204  INIReader ini(ToString(ini_file));
205  if (ini.ParseError() != -1) {
206  std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
207  if (!encoding.empty()) {
208  return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
209  }
210  }
211  return std::string();
212 }
213 
214 std::string ReaderUtil::GetEncoding(std::istream& filestream) {
215  INIReader ini(filestream);
216  if (ini.ParseError() != -1) {
217  std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
218  if (!encoding.empty()) {
219  return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
220  }
221  }
222  return std::string();
223 }
224 
225 std::string ReaderUtil::GetLocaleEncoding() {
226 #ifdef _WIN32
227  int codepage = GetACP();
228 #elif __ANDROID__
229  // No std::locale support in NDK
230  // Doesn't really matter because the Android version auto-detects via ICU
231  int codepage = 1252;
232 #else
233  int codepage = 1252;
234 
235  std::locale loc = std::locale("");
236  // Gets the language and culture part only
237  std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
238  // Gets the language part only
239  std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
240 
241  if (loc_lang == "th") codepage = 874;
242  else if (loc_lang == "ja") codepage = 932;
243  else if (loc_full == "zh_CN" ||
244  loc_full == "zh_SG") codepage = 936;
245  else if (loc_lang == "ko") codepage = 949;
246  else if (loc_full == "zh_TW" ||
247  loc_full == "zh_HK") codepage = 950;
248  else if (loc_lang == "cs" ||
249  loc_lang == "hu" ||
250  loc_lang == "pl" ||
251  loc_lang == "ro" ||
252  loc_lang == "hr" ||
253  loc_lang == "sk" ||
254  loc_lang == "sl") codepage = 1250;
255  else if (loc_lang == "ru") codepage = 1251;
256  else if (loc_lang == "ca" ||
257  loc_lang == "da" ||
258  loc_lang == "de" ||
259  loc_lang == "en" ||
260  loc_lang == "es" ||
261  loc_lang == "fi" ||
262  loc_lang == "fr" ||
263  loc_lang == "it" ||
264  loc_lang == "nl" ||
265  loc_lang == "nb" ||
266  loc_lang == "pt" ||
267  loc_lang == "sv" ||
268  loc_lang == "eu") codepage = 1252;
269  else if (loc_lang == "el") codepage = 1253;
270  else if (loc_lang == "tr") codepage = 1254;
271  else if (loc_lang == "he") codepage = 1255;
272  else if (loc_lang == "ar") codepage = 1256;
273  else if (loc_lang == "et" ||
274  loc_lang == "lt" ||
275  loc_lang == "lv") codepage = 1257;
276  else if (loc_lang == "vi") codepage = 1258;
277 #endif
278 
279  return CodepageToEncoding(codepage);
280 }
281 
282 std::string ReaderUtil::Recode(StringView str_to_encode, StringView source_encoding) {
283  return ReaderUtil::Recode(str_to_encode, source_encoding, "UTF-8");
284 }
285 
286 std::string ReaderUtil::Recode(StringView str_to_encode,
287  StringView src_enc,
288  StringView dst_enc) {
289 
290  if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
291  return ToString(str_to_encode);
292  }
293 
294  auto src_cp = SvAtoi(src_enc);
295  const auto& src_enc_str = src_cp > 0
296  ? ReaderUtil::CodepageToEncoding(src_cp)
297  : ToString(src_enc);
298 
299  auto dst_cp = SvAtoi(dst_enc);
300  const auto& dst_enc_str = dst_cp > 0
301  ? ReaderUtil::CodepageToEncoding(dst_cp)
302  : ToString(dst_enc);
303 
304 #if LCF_SUPPORT_ICU
305  auto status = U_ZERO_ERROR;
306  auto conv_from = ucnv_open(src_enc_str.c_str(), &status);
307 
308  if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
309  fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
310  return std::string();
311  }
312  status = U_ZERO_ERROR;
313  auto conv_from_sg = makeScopeGuard([&]() { ucnv_close(conv_from); });
314 
315  auto conv_to = ucnv_open(dst_enc_str.c_str(), &status);
316 
317  if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
318  fprintf(stderr, "liblcf: ucnv_open() error for dest encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
319  return std::string();
320  }
321  auto conv_to_sg = makeScopeGuard([&]() { ucnv_close(conv_to); });
322  status = U_ZERO_ERROR;
323 
324  std::string result(str_to_encode.size() * 4, '\0');
325  auto* src = str_to_encode.data();
326  auto* dst = &result.front();
327 
328  ucnv_convertEx(conv_to, conv_from,
329  &dst, dst + result.size(),
330  &src, src + str_to_encode.size(),
331  nullptr, nullptr, nullptr, nullptr,
332  true, true,
333  &status);
334 
335  if (U_FAILURE(status)) {
336  fprintf(stderr, "liblcf: ucnv_convertEx() error when encoding \"%.*s\": %s\n", (int)str_to_encode.length(), str_to_encode.data(), u_errorName(status));
337  return std::string();
338  }
339 
340  result.resize(dst - result.c_str());
341  result.shrink_to_fit();
342 
343  return result;
344 #else
345  iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
346  if (cd == (iconv_t)-1)
347  return ToString(str_to_encode);
348  char *src = const_cast<char *>(str_to_encode.data());
349  size_t src_left = str_to_encode.size();
350  size_t dst_size = str_to_encode.size() * 5 + 10;
351  char *dst = new char[dst_size];
352  size_t dst_left = dst_size;
353 # ifdef ICONV_CONST
354  char ICONV_CONST *p = src;
355 # else
356  char *p = src;
357 # endif
358  char *q = dst;
359  size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
360  iconv_close(cd);
361  if (status == (size_t) -1 || src_left > 0) {
362  delete[] dst;
363  return std::string();
364  }
365  *q++ = '\0';
366  std::string result(dst);
367  delete[] dst;
368  return result;
369 #endif
370 }
371 
372 std::string ReaderUtil::Normalize(StringView str) {
373 #if LCF_SUPPORT_ICU
374  icu::UnicodeString uni = icu::UnicodeString(str.data(), str.length(), "utf-8").toLower(icu::Locale::getRoot());
375  UErrorCode err = U_ZERO_ERROR;
376  std::string res;
377  const icu::Normalizer2* norm = icu::Normalizer2::getNFKCInstance(err);
378  if (U_FAILURE(err)) {
379  static bool err_reported = false;
380  if (!err_reported) {
381  fprintf(stderr, "Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!\n", u_errorName(err));
382  err_reported = true;
383  }
384  uni.toUTF8String(res);
385  return res;
386  }
387  icu::UnicodeString f = norm->normalize(uni, err);
388  if (U_FAILURE(err)) {
389  uni.toUTF8String(res);
390  } else {
391  f.toUTF8String(res);
392  }
393  return res;
394 #else
395  auto result = std::string(str);
396  std::transform(result.begin(), result.end(), result.begin(), tolower);
397  return result;
398 #endif
399 }
400 
401 } //namespace lcf
Definition: dbarray.cpp:13