Package translate :: Package lang :: Module data
[hide private]
[frames] | no frames]

Source Code for Module translate.lang.data

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007-2011 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module stores information and functionality that relates to plurals.""" 
 23   
 24  import unicodedata 
 25   
 26  from translate.storage.placeables import StringElem 
 27   
 28   
 29  languages = { 
 30  'af': (u'Afrikaans', 2, '(n != 1)'), 
 31  'ak': (u'Akan', 2, 'n > 1'), 
 32  'am': (u'Amharic', 2, 'n > 1'), 
 33  'an': (u'Aragonese', 2, '(n != 1)'), 
 34  'ar': (u'Arabic', 6, 'n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 ? 4 : 5'), 
 35  'arn': (u'Mapudungun; Mapuche', 2, 'n > 1'), 
 36  'ast': (u'Asturian; Bable; Leonese; Asturleonese', 2, '(n != 1)'), 
 37  'az': (u'Azerbaijani', 2, '(n != 1)'), 
 38  'be': (u'Belarusian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 39  'bg': (u'Bulgarian', 2, '(n != 1)'), 
 40  'bn': (u'Bengali', 2, '(n != 1)'), 
 41  'bn_IN': (u'Bengali (India)', 2, '(n != 1)'), 
 42  'bo': (u'Tibetan', 1, '0'), 
 43  'br': (u'Breton', 2, 'n > 1'), 
 44  'bs': (u'Bosnian', 3, 'n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 45  'ca': (u'Catalan; Valencian', 2, '(n != 1)'), 
 46  'ca@valencia': (u'Catalan; Valencian (Valencia)', 2, '(n != 1)'), 
 47  'cs': (u'Czech', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
 48  'csb': (u'Kashubian', 3, 'n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2'), 
 49  'cy': (u'Welsh', 2, '(n==2) ? 1 : 0'), 
 50  'da': (u'Danish', 2, '(n != 1)'), 
 51  'de': (u'German', 2, '(n != 1)'), 
 52  'dz': (u'Dzongkha', 1, '0'), 
 53  'el': (u'Greek, Modern (1453-)', 2, '(n != 1)'), 
 54  'en': (u'English', 2, '(n != 1)'), 
 55  'en_GB': (u'English (United Kingdom)', 2, '(n != 1)'), 
 56  'en_ZA': (u'English (South Africa)', 2, '(n != 1)'), 
 57  'eo': (u'Esperanto', 2, '(n != 1)'), 
 58  'es': (u'Spanish; Castilian', 2, '(n != 1)'), 
 59  'et': (u'Estonian', 2, '(n != 1)'), 
 60  'eu': (u'Basque', 2, '(n != 1)'), 
 61  'fa': (u'Persian', 1, '0'), 
 62  'fi': (u'Finnish', 2, '(n != 1)'), 
 63  'fil': (u'Filipino; Pilipino', 2, '(n > 1)'), 
 64  'fo': (u'Faroese', 2, '(n != 1)'), 
 65  'fr': (u'French', 2, '(n > 1)'), 
 66  'fur': (u'Friulian', 2, '(n != 1)'), 
 67  'fy': (u'Frisian', 2, '(n != 1)'), 
 68  'ga': (u'Irish', 3, 'n==1 ? 0 : n==2 ? 1 : 2'), 
 69  'gd': (u'Gaelic; Scottish Gaelic', 2, 'nplurals=4; plural=(n==1 || n==11) ? 0 : (n==2 || n==12) ? 1 : (n > 2 && n < 20) ? 2 : 3'), 
 70  'gl': (u'Galician', 2, '(n != 1)'), 
 71  'gu': (u'Gujarati', 2, '(n != 1)'), 
 72  'gun': (u'Gun', 2, '(n > 1)'), 
 73  'ha': (u'Hausa', 2, '(n != 1)'), 
 74  'he': (u'Hebrew', 2, '(n != 1)'), 
 75  'hi': (u'Hindi', 2, '(n != 1)'), 
 76  'hy': (u'Armenian', 1, '0'), 
 77  'hr': (u'Croatian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 78  'hu': (u'Hungarian', 2, '(n != 1)'), 
 79  'ia': (u"Interlingua (International Auxiliary Language Association)", 2, '(n != 1)'), 
 80  'id': (u'Indonesian', 1, '0'), 
 81  'is': (u'Icelandic', 2, '(n != 1)'), 
 82  'it': (u'Italian', 2, '(n != 1)'), 
 83  'ja': (u'Japanese', 1, '0'), 
 84  'jv': (u'Javanese', 2, '(n != 1)'), 
 85  'ka': (u'Georgian', 1, '0'), 
 86  'kk': (u'Kazakh', 1, '0'), 
 87  'km': (u'Central Khmer', 1, '0'), 
 88  'kn': (u'Kannada', 2, '(n != 1)'), 
 89  'ko': (u'Korean', 1, '0'), 
 90  'ku': (u'Kurdish', 2, '(n != 1)'), 
 91  'kw': (u'Cornish', 4, '(n==1) ? 0 : (n==2) ? 1 : (n == 3) ? 2 : 3'), 
 92  'ky': (u'Kirghiz; Kyrgyz', 1, '0'), 
 93  'lb': (u'Luxembourgish; Letzeburgesch', 2, '(n != 1)'), 
 94  'ln': (u'Lingala', 2, '(n > 1)'), 
 95  'lo': (u'Lao', 1, '0'), 
 96  'lt': (u'Lithuanian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
 97  'lv': (u'Latvian', 3, '(n%10==1 && n%100!=11 ? 0 : n != 0 ? 1 : 2)'), 
 98  'mai': (u'Maithili', 2, '(n != 1)'), 
 99  'mfe': (u'Morisyen', 2, '(n > 1)'), 
100  'mg': (u'Malagasy', 2, '(n > 1)'), 
101  'mi': (u'Maori', 2, '(n > 1)'), 
102  'mk': (u'Macedonian', 2, 'n==1 || n%10==1 ? 0 : 1'), 
103  'ml': (u'Malayalam', 2, '(n != 1)'), 
104  'mn': (u'Mongolian', 2, '(n != 1)'), 
105  'mr': (u'Marathi', 2, '(n != 1)'), 
106  'ms': (u'Malay', 1, '0'), 
107  'mt': (u'Maltese', 4, '(n==1 ? 0 : n==0 || ( n%100>1 && n%100<11) ? 1 : (n%100>10 && n%100<20 ) ? 2 : 3)'), 
108  'nah': (u'Nahuatl languages', 2, '(n != 1)'), 
109  'nap': (u'Neapolitan', 2, '(n != 1)'), 
110  'nb': (u'Bokmål, Norwegian; Norwegian Bokmål', 2, '(n != 1)'), 
111  'ne': (u'Nepali', 2, '(n != 1)'), 
112  'nl': (u'Dutch; Flemish', 2, '(n != 1)'), 
113  'nn': (u'Norwegian Nynorsk; Nynorsk, Norwegian', 2, '(n != 1)'), 
114  'nso': (u'Pedi; Sepedi; Northern Sotho', 2, '(n != 1)'), 
115  'oc': (u'Occitan (post 1500)', 2, '(n > 1)'), 
116  'or': (u'Oriya', 2, '(n != 1)'), 
117  'pa': (u'Panjabi; Punjabi', 2, '(n != 1)'), 
118  'pap': (u'Papiamento', 2, '(n != 1)'), 
119  'pl': (u'Polish', 3, '(n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
120  'pms': (u'Piemontese', 2, '(n != 1)'), 
121  'ps': (u'Pushto; Pashto', 2, '(n != 1)'), 
122  'pt': (u'Portuguese', 2, '(n != 1)'), 
123  'pt_BR': (u'Portuguese (Brazil)', 2, '(n > 1)'), 
124  'rm': (u'Romansh', 2, '(n != 1)'), 
125  'ro': (u'Romanian', 3, '(n==1 ? 0 : (n==0 || (n%100 > 0 && n%100 < 20)) ? 1 : 2);'), 
126  'ru': (u'Russian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
127  'sco': (u'Scots', 2, '(n != 1)'), 
128  'si': (u'Sinhala; Sinhalese', 2, '(n != 1)'), 
129  'sk': (u'Slovak', 3, '(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2'), 
130  'sl': (u'Slovenian', 4, '(n%100==1 ? 0 : n%100==2 ? 1 : n%100==3 || n%100==4 ? 2 : 3)'), 
131  'so': (u'Somali', 2, '(n != 1)'), 
132  'son': (u'Songhai languages', 2, '(n != 1)'), 
133  'sq': (u'Albanian', 2, '(n != 1)'), 
134  'sr': (u'Serbian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
135  'st': (u'Sotho, Southern', 2, '(n != 1)'), 
136  'su': (u'Sundanese', 1, '0'), 
137  'sv': (u'Swedish', 2, '(n != 1)'), 
138  'sw': (u'Swahili', 2, '(n != 1)'), 
139  'ta': (u'Tamil', 2, '(n != 1)'), 
140  'te': (u'Telugu', 2, '(n != 1)'), 
141  'tg': (u'Tajik', 2, '(n != 1)'), 
142  'ti': (u'Tigrinya', 2, '(n > 1)'), 
143  'th': (u'Thai', 1, '0'), 
144  'tk': (u'Turkmen', 2, '(n != 1)'), 
145  'tr': (u'Turkish', 1, '0'), 
146  'tt': (u'Tatar', 1, '0'), 
147  'ug': (u'Uighur; Uyghur', 1, '0'), 
148  'uk': (u'Ukrainian', 3, '(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2)'), 
149  'vi': (u'Vietnamese', 1, '0'), 
150  'wa': (u'Walloon', 2, '(n > 1)'), 
151  'yo': (u'Yoruba', 2, '(n != 1)'), 
152  # Chinese is difficult because the main divide is on script, not really 
153  # country. Simplified Chinese is used mostly in China, Singapore and Malaysia. 
154  # Traditional Chinese is used mostly in Hong Kong, Taiwan and Macau. 
155  'zh_CN': (u'Chinese (China)', 1, '0'), 
156  'zh_HK': (u'Chinese (Hong Kong)', 1, '0'), 
157  'zh_TW': (u'Chinese (Taiwan)', 1, '0'), 
158  'zu': (u'Zulu', 2, '(n != 1)'), 
159  } 
160  """Dictionary of language data. 
161  The language code is the dictionary key (which may contain country codes and modifiers). 
162  The value is a tuple: (Full name in English from iso-codes, nplurals, plural equation). 
163   
164  Note that the English names should not be used in user facing places - it 
165  should always be passed through the function returned from tr_lang(), or at 
166  least passed through _fix_language_name().""" 
167   
# Maps the verbose English names found in iso-codes to the shorter rendering
# that _fix_language_name() substitutes for them.
_fixed_names = {
    u"Asturian; Bable; Leonese; Asturleonese": u"Asturian",
    u"Bokmål, Norwegian; Norwegian Bokmål": u"Norwegian Bokmål",
    u"Catalan; Valencian": u"Catalan",
    u"Central Khmer": u"Khmer",
    u"Chichewa; Chewa; Nyanja": u"Chewa; Nyanja",
    u"Divehi; Dhivehi; Maldivian": u"Divehi",
    u"Dutch; Flemish": u"Dutch",
    u"Filipino; Pilipino": u"Filipino",
    u"Gaelic; Scottish Gaelic": u"Scottish Gaelic",
    u"Greek, Modern (1453-)": u"Greek",
    u"Interlingua (International Auxiliary Language Association)": u"Interlingua",
    u"Kirghiz; Kyrgyz": u"Kirghiz",
    u"Klingon; tlhIngan-Hol": u"Klingon",
    u"Limburgan; Limburger; Limburgish": u"Limburgish",
    u"Low German; Low Saxon; German, Low; Saxon, Low": u"Low German",
    u"Luxembourgish; Letzeburgesch": u"Luxembourgish",
    u"Ndebele, South; South Ndebele": u"Southern Ndebele",
    u"Norwegian Nynorsk; Nynorsk, Norwegian": u"Norwegian Nynorsk",
    u"Occitan (post 1500)": u"Occitan",
    u"Panjabi; Punjabi": u"Punjabi",
    u"Pedi; Sepedi; Northern Sotho": u"Northern Sotho",
    u"Pushto; Pashto": u"Pashto",
    u"Sinhala; Sinhalese": u"Sinhala",
    u"Sotho, Southern": u"Sotho",
    u"Spanish; Castilian": u"Spanish",
    u"Uighur; Uyghur": u"Uighur",
}
196   
197   
def simplercode(code):
    """Drop the last component (such as a country code) from a language code.

    The separator is located on the normalised form of the code (where "_"
    and "@" are treated like "-"), and everything from the last separator
    onwards is removed from the original code. A code without any separator
    simplifies to the empty string; an empty or None code is returned as is.

    @see:
        - U{http://www.rfc-editor.org/rfc/bcp/bcp47.txt}
        - U{http://www.rfc-editor.org/rfc/rfc4646.txt}
        - U{http://www.rfc-editor.org/rfc/rfc4647.txt}
        - U{http://www.w3.org/International/articles/language-tags/}
    """
    if code:
        # Normalising only substitutes and lowercases characters, so the
        # position found there is valid for the original string as well.
        cut = normalize_code(code).rfind('-')
        if cut < 0:
            return ""
        return code[:cut]
    return code
expansion_factors = {
    'af': 0.1,
    'ar': -0.09,
    'es': 0.21,
    'fr': 0.28,
    'it': 0.2,
}
"""Source to target string length expansion factors."""

import gettext
import locale
import re
import os

# Both dictionaries are filled lazily by gettext_lang() / gettext_country(),
# which store one translation function per language code in them.
iso639 = {}
"""ISO 639 language codes"""
iso3166 = {}
"""ISO 3166 country codes"""

# language[_COUNTRY][@modifier], case sensitive: lowercase language,
# uppercase country.
langcode_re = re.compile("^[a-z]{2,3}([_-][A-Z]{2,3}|)(@[a-zA-Z0-9]+|)$")
# The same shape as above, but matched without regard to case.
langcode_ire = re.compile("^[a-z]{2,3}([_-][a-z]{2,3})?(@[a-z0-9]+)?$", re.IGNORECASE)
# Only the part that may follow the language: _COUNTRY[@modifier].
variant_re = re.compile("^[_-][A-Z]{2,3}(@[a-zA-Z0-9]+|)$")
def languagematch(languagecode, otherlanguagecode):
    """Match a language code against another one, ignoring any region on
    the second.

    With languagecode None, the result is whether otherlanguagecode looks
    like a language code at all (the result of langcode_re.match()).
    Otherwise the codes match when they are equal, or when otherlanguagecode
    is languagecode followed by a country/modifier suffix accepted by
    variant_re.

    @return: A true value (True or a regex match object) on a match, a
        false value (False or None) otherwise.
    """
    if languagecode is None:
        return langcode_re.match(otherlanguagecode)
    if languagecode == otherlanguagecode:
        return True
    if not otherlanguagecode.startswith(languagecode):
        return False
    # Whatever follows the shared prefix must be a well-formed variant.
    suffix = otherlanguagecode[len(languagecode):]
    return variant_re.match(suffix)
# Recognises names of the form "language (country)". The parenthesised part
# is limited to 25 characters so that
# "Interlingua (International Auxiliary Language Association)" is not
# mistaken for a language/country pair.
dialect_name_re = re.compile(r"(.+)\s\(([^)\d]{,25})\)$")
def tr_lang(langcode=None):
    """Build a function that translates language names.

    The returned function accepts a language name, optionally in the form
    C{"language (country)"}, and renders it in the language with iso code
    langcode, or in the system language if no language is specified.
    """
    translate_language = gettext_lang(langcode)
    translate_country = gettext_country(langcode)

    def handlelanguage(name):
        dialect = dialect_name_re.match(name)
        if not dialect:
            return _fix_language_name(translate_language(name))
        # Translate the language and the country part separately, then
        # put them back together in the same "language (country)" shape.
        language_part, country_part = dialect.groups()
        tidy_language = _fix_language_name(translate_language(language_part))
        return u"%s (%s)" % (tidy_language, translate_country(country_part))

    return handlelanguage
def _fix_language_name(name):
    """Replace or shorten some unsightly names present in iso-codes.

    A name listed in _fixed_names is assumed to be untranslated and is
    swapped for the tidier rendering stored there. Any other name longer
    than 11 characters is cut at the first semi-colon found from index 5
    onwards; all remaining names are returned unchanged.
    """
    if name in _fixed_names:
        return _fixed_names[name]
    if len(name) <= 11:
        return name
    # These constants are somewhat arbitrary, but testing with the Japanese
    # translation of ISO codes suggests these as the upper bounds.
    semicolon = name.find(u';', 5)
    if semicolon < 0:
        return name
    return name[:semicolon]
288 289
def gettext_lang(langcode=None):
    """Returns a gettext function to translate language names into the given
    language, or the system language if no language is specified.

    The function is created once per language code and cached in iso639;
    the system language is cached under the key "".

    @param langcode: Language code to translate into, or None / "" for the
        system language.
    @return: A callable that maps a language name to its translation (the
        name itself comes back when no catalogue is available, because the
        translation is created with fallback=True).
    """
    # Settle on the cache key before the lookup, so that repeated calls
    # without a language code reuse the cached function instead of building
    # a new translation object every time.
    langcode = langcode or ""
    if langcode not in iso639:
        if langcode:
            t = gettext.translation('iso_639', languages=[langcode], fallback=True)
        else:
            system_locale = None
            if os.name == "nt":
                # On Windows the default locale is not used for some reason
                system_locale = locale.getdefaultlocale()[0]
            if system_locale:
                t = gettext.translation('iso_639', languages=[system_locale], fallback=True)
            else:
                # Either not on Windows, or the default locale could not be
                # determined: let gettext consult the environment itself.
                t = gettext.translation('iso_639', fallback=True)
        # ugettext only exists on Python 2; on Python 3 gettext already
        # returns text.
        iso639[langcode] = getattr(t, 'ugettext', t.gettext)
    return iso639[langcode]
305 306
def gettext_country(langcode=None):
    """Returns a gettext function to translate country names into the given
    language, or the system language if no language is specified.

    The function is created once per language code and cached in iso3166;
    the system language is cached under the key "".

    @param langcode: Language code to translate into, or None / "" for the
        system language.
    @return: A callable that maps a country name to its translation (the
        name itself comes back when no catalogue is available, because the
        translation is created with fallback=True).
    """
    # Settle on the cache key before the lookup, so that repeated calls
    # without a language code reuse the cached function instead of building
    # a new translation object every time.
    langcode = langcode or ""
    if langcode not in iso3166:
        if langcode:
            t = gettext.translation('iso_3166', languages=[langcode], fallback=True)
        else:
            system_locale = None
            if os.name == "nt":
                # On Windows the default locale is not used for some reason
                system_locale = locale.getdefaultlocale()[0]
            if system_locale:
                t = gettext.translation('iso_3166', languages=[system_locale], fallback=True)
            else:
                # Either not on Windows, or the default locale could not be
                # determined: let gettext consult the environment itself.
                t = gettext.translation('iso_3166', fallback=True)
        # ugettext only exists on Python 2; on Python 3 gettext already
        # returns text.
        iso3166[langcode] = getattr(t, 'ugettext', t.gettext)
    return iso3166[langcode]
322 323
def normalize(string, normal_form="NFC"):
    """Bring a unicode string into the requested Unicode normal form.

    @param string: The string to be normalized (None is passed through)
    @param normal_form: NFC (default), NFD, NFKC, NFKD
    @return: Normalized string, or None if string was None
    """
    if string is not None:
        return unicodedata.normalize(normal_form, string)
    return None
335 336
def forceunicode(string):
    """Ensures that the string is in unicode.

    Byte strings are decoded using their own "encoding" attribute if they
    have one, and UTF-8 otherwise. StringElem objects are converted to
    their unicode rendering. Unicode strings, None and objects of any other
    type are returned unchanged.

    @param string: A text string
    @type string: Unicode, String
    @return: String converted to Unicode and normalized as needed.
    @rtype: Unicode
    """
    if string is None:
        return None
    # The text type is "unicode" on Python 2 and "str" on Python 3.
    text_type = type(u"")
    if isinstance(string, text_type):
        return string
    # "bytes" is the very same type as "str" on Python 2.6+, so this is the
    # original check there, while on Python 3 it no longer tries to decode
    # text that is already unicode.
    if isinstance(string, bytes):
        encoding = getattr(string, "encoding", "utf-8")
        return string.decode(encoding)
    if isinstance(string, StringElem):
        return text_type(string)
    return string
353 354
def normalized_unicode(string):
    """Convert the string with forceunicode() and pass the result through
    normalize() using its default normal form."""
    as_unicode = forceunicode(string)
    return normalize(as_unicode)
358 359
def normalize_code(code):
    """Return the language code in lowercase with "_" and "@" replaced by
    "-", so that differently written codes can be compared.

    An empty or None code is returned unchanged.
    """
    if code:
        hyphenated = code.replace("_", "-").replace("@", "-")
        return hyphenated.lower()
    return code
364 365
def simplify_to_common(language_code, languages=languages):
    """Simplify language code to the most commonly used form for the
    language, stripping country information for languages that tend
    not to be localized differently for different countries.

    The code is shortened with simplercode() until it matches a key of
    languages (compared in normalised form) or cannot be shortened any
    further; the code reached at that point is returned.

    @param language_code: The language code to simplify. An empty or None
        code is returned unchanged.
    @param languages: Mapping whose keys are the known language codes.
        Defaults to the module level languages dictionary.
    @return: The simplified language code.
    """
    # Normalise the known codes once instead of once per simplification step.
    known_codes = set(normalize_code(key) for key in languages)
    # Looping keeps the caller's languages mapping in use at every step;
    # the recursive form silently fell back to the module default.
    while True:
        if normalize_code(language_code) in known_codes:
            return language_code
        simpler = simplercode(language_code)
        # simplercode() gives "" when nothing is left to strip and hands
        # None straight back, so any false value ends the search here.
        if not simpler:
            return language_code
        language_code = simpler
375