""" Locale support.

    The module provides low-level access to the C lib's locale APIs
    and adds high level number formatting APIs as well as a locale
    aliasing engine to complement these.

    The aliasing engine includes support for many commonly used locale
    names and maps them to values suitable for passing to the C lib's
    setlocale() function. It also includes default encodings for all
    supported locale names.

"""

import string

### C lib locale APIs

from _locale import *

### Number formatting APIs

# Author: Martin von Loewis

#perform the grouping from right to left
def _group(s):
    """ Inserts the locale's thousands separator into the string s.

        s is the integer part of an already formatted number. The
        group sizes are taken from localeconv()['grouping'] and are
        applied from right to left:

        - an entry equal to CHAR_MAX stops any further grouping,
        - an entry of 0 repeats the previous group size for the
          rest of the digits,
        - any other entry is the size of the next group.

        Only the digits are grouped; a sign or padding blanks in
        front of them are left untouched and put back in front of the
        result. If the locale defines no grouping, s is returned
        unchanged.

    """
    conv = localeconv()
    grouping = conv['grouping']
    if not grouping:
        return s
    sep = conv['thousands_sep']

    # Split off everything in front of the first digit (sign, padding);
    # otherwise e.g. "-123" would be turned into "-,123".
    start = 0
    while start < len(s) and s[start] not in '0123456789':
        start = start + 1
    prefix, s = s[:start], s[start:]

    result = ""
    group = 0
    while s and grouping:
        if grouping[0] == CHAR_MAX:
            # no further grouping is to be performed
            break
        elif grouping[0] != 0:
            # take the next group size from the list
            group = grouping[0]
            grouping = grouping[1:]
        # else: 0 means re-use the last group size ad infinitum
        if group <= 0:
            # a leading 0 has no previous group size to repeat
            break
        if result:
            result = s[-group:] + sep + result
        else:
            result = s[-group:]
        s = s[:-group]
    if not result:
        return prefix + s
    if s:
        # digits left over after the grouping list was used up
        result = s + sep + result
    return prefix + result

def format(f, val, grouping=0):
    """ Formats a value in the same way that the % formatting would
        use, but takes the current locale into account.

        f is a % format string and val the value (or tuple of values)
        applied to it. The formatted text is split at '.', the
        locale's decimal point is used to join the two parts again
        and, if grouping is true, the part in front of the decimal
        point is passed through _group().

        Raises Error if the formatted text contains more than one
        '.'.

    """
    result = f % val
    fields = result.split(".")
    if grouping:
        fields[0] = _group(fields[0])
    if len(fields) == 2:
        return fields[0] + localeconv()['decimal_point'] + fields[1]
    elif len(fields) == 1:
        return fields[0]
    else:
        raise Error("Too many decimal points in result string")

def str(val):
    """ Convert float to string, taking the locale into account.

        The value is formatted with "%.12g" through format(), i.e.
        without grouping.

    """
    return format("%.12g", val)

def atof(str, func=float):
    """ Parses a string as a float according to the locale settings.

        All occurrences of the locale's thousands separator are
        removed and the locale's decimal point is replaced by '.';
        the resulting text is then handed to func, whose result is
        returned. func defaults to the built-in float().

    """
    conv = localeconv()
    text = str
    #First, get rid of the grouping
    ts = conv['thousands_sep']
    if ts:
        text = text.replace(ts, "")
    #next, replace the decimal point with a dot
    dd = conv['decimal_point']
    if dd:
        text = text.replace(dd, '.')
    #finally, parse the string
    return func(text)

def atoi(str):
    """ Converts a string to an integer according to the locale
        settings.

        The work is done by atof() with the built-in int() as the
        parsing function.

    """
    return atof(str, int)

def _test():
    """ Test function for the number formatting APIs.

        Switches all categories to the locale given by the
        environment and prints one grouped integer and one float
        together with the values parsed back from them.

    """
    setlocale(LC_ALL, "")
    #do grouping
    s1 = format("%d", 123456789, 1)
    print("%s is %s" % (s1, atoi(s1)))
    #standard formatting
    s1 = str(3.14)
    print("%s is %s" % (s1, atof(s1)))

### Locale name aliasing engine

# Author: Marc-Andre Lemburg, mal@lemburg.com

def normalize(localename):

    """ Returns a normalized locale code for the given locale
        name.

        The returned locale code is formatted for use with
        setlocale().

        The name is lowercased, ':' is accepted as an alternative to
        '.' as encoding delimiter and only the first encoding part is
        used. The name is first looked up in locale_alias including
        its encoding, then without it. In the second case a given
        encoding is mapped through encoding_alias (unknown encodings
        are passed on unchanged); if no encoding was given, the one
        from the locale_alias entry is used.

        If normalization fails, the original name is returned
        unchanged.

    """
    # Normalize the locale name and extract the encoding
    fullname = localename.lower()
    if ':' in fullname:
        # ':' is sometimes used as encoding delimiter.
        fullname = fullname.replace(':', '.')
    if '.' in fullname:
        langname, encoding = fullname.split('.')[:2]
        fullname = langname + '.' + encoding
    else:
        langname = fullname
        encoding = ''

    # First lookup: fullname (possibly with encoding)
    code = locale_alias.get(fullname, None)
    if code is not None:
        return code

    # Second try: langname (without encoding)
    code = locale_alias.get(langname, None)
    if code is None:
        # Unknown name: hand it back untouched
        return localename

    # Split the alias entry into language code and default encoding
    if '.' in code:
        langname, defenc = code.split('.')
    else:
        langname = code
        defenc = ''

    # An explicitly given encoding overrides the default one
    if encoding:
        encoding = encoding_alias.get(encoding, encoding)
    else:
        encoding = defenc

    if encoding:
        return langname + '.' + encoding
    return langname

def _parse_localename(localename):

    """ Parses the locale code for localename and returns the
        result as tuple (language code, encoding).

        The localename is normalized and passed through the locale
        alias engine. A ValueError is raised in case the locale name
        cannot be parsed, i.e. if the normalized code neither
        contains an encoding nor is 'C'.

        For the code 'C' the tuple (None, None) is returned.

    """
    code = normalize(localename)
    if '.' in code:
        # Always hand back a tuple, as documented
        return tuple(code.split('.')[:2])
    elif code == 'C':
        return None, None
    raise ValueError('unknown locale: %s' % localename)

def _build_localename(localetuple):

    """ Builds a locale code from the given tuple (language code,
        encoding).

        A language code of None is replaced by 'C'; if the encoding
        is None, the language code alone is returned, otherwise
        language code and encoding joined by a dot.

        No aliasing or normalizing takes place.

    """
    language, encoding = localetuple
    if language is None:
        language = 'C'
    if encoding is None:
        return language
    return language + '.' + encoding

def get_default(envvars=('LANGUAGE', 'LC_ALL', 'LC_CTYPE', 'LANG')):

    """ Tries to determine the default locale settings and returns
        them as tuple (language code, encoding).

        According to POSIX, a program which has not called
        setlocale(LC_ALL,"") runs using the portable 'C' locale.
        Calling setlocale(LC_ALL,"") lets it use the default locale as
        defined by the LANG variable. Since we don't want to interfere
        with the current locale setting we thus emulate the behaviour
        in the way described above.

        To maintain compatibility with other platforms, not only the
        LANG variable is tested, but a list of variables given as
        envvars parameter. The first one found to be set to a
        non-empty value is used; if there is none, the locale 'C' is
        assumed. envvars defaults to the search path used in GNU
        gettext; it must always contain the variable name 'LANG'.

        Except for the code 'C', the language code corresponds to RFC
        1766. code and encoding can be None in case the values cannot
        be determined. A ValueError is raised by the parsing step if
        the value found is not a known locale name.

    """
    import os
    lookup = os.environ.get
    for variable in envvars:
        localename = lookup(variable, None)
        # A variable which is set but empty counts as not set
        if localename:
            break
    else:
        localename = 'C'
    return _parse_localename(localename)

def get_locale(category=LC_CTYPE):

    """ Returns the current setting for the given locale category as
        tuple (language code, encoding).

        category may be one of the LC_* value except LC_ALL. It
        defaults to LC_CTYPE. For LC_ALL a TypeError is raised if the
        reported setting contains a ';'.

        Except for the code 'C', the language code corresponds to RFC
        1766. code and encoding can be None in case the values cannot
        be determined.

    """
    # setlocale() is called with the category only, no new value is given
    localename = setlocale(category)
    if category == LC_ALL and ';' in localename:
        # presumably a composite of differing per-category settings
        raise TypeError('category LC_ALL is not supported')
    return _parse_localename(localename)

def set_locale(localetuple, category=LC_ALL):

    """ Set the locale according to the localetuple (language code,
        encoding) as returned by get_locale() and get_default().

        The tuple is turned into a locale code which is passed
        through the locale aliasing engine before being given to
        setlocale() for processing.

        category may be given as one of the LC_* values. It defaults
        to LC_ALL.

    """
    localename = _build_localename(localetuple)
    setlocale(category, normalize(localename))

def set_to_default(category=LC_ALL):

    """ Sets the locale for category to the default setting.

        The default setting is determined by calling
        get_default(); its result is turned back into a locale code
        and handed to setlocale(). category defaults to LC_ALL.

    """
    default = get_default()
    setlocale(category, _build_localename(default))

### Database
#
# The following data was extracted from the locale.alias file which
# comes with X11 and then hand edited removing the explicit encoding
# definitions and adding some more aliases. The file is usually
# available as /usr/lib/X11/locale/locale.alias.
#

#
# The encoding_alias table maps lowercase encoding alias names to C
# locale encoding names (case-sensitive).
#
encoding_alias = {
    '437': 'C',
    'c': 'C',
    'iso8859': 'ISO8859-1',
    '8859': 'ISO8859-1',
    '88591': 'ISO8859-1',
    'ascii': 'ISO8859-1',
    'en': 'ISO8859-1',
    'iso88591': 'ISO8859-1',
    'iso_8859-1': 'ISO8859-1',
    '885915': 'ISO8859-15',
    'iso885915': 'ISO8859-15',
    'iso_8859-15': 'ISO8859-15',
    'iso8859-2': 'ISO8859-2',
    'iso88592': 'ISO8859-2',
    'iso_8859-2': 'ISO8859-2',
    'iso88595': 'ISO8859-5',
    'iso88596': 'ISO8859-6',
    'iso88597': 'ISO8859-7',
    'iso88598': 'ISO8859-8',
    'iso88599': 'ISO8859-9',
    'iso-2022-jp': 'JIS7',
    'jis': 'JIS7',
    'jis7': 'JIS7',
    'sjis': 'SJIS',
    'tis620': 'TACTIS',
    'ajec': 'eucJP',
    'eucjp': 'eucJP',
    'ujis': 'eucJP',
    'utf-8': 'utf',
    'utf8': 'utf',
    'utf8@ucs4': 'utf',
}

#
# The locale_alias table maps lowercase alias names to C locale names
# (case-sensitive). Encodings are always separated from the locale
# name using a dot ('.'); they should only be given in case the
# language name is needed to interpret the given encoding alias
# correctly (CJK codes often have this need).
#
locale_alias = {
    'american': 'en_US.ISO8859-1',
    'ar': 'ar_AA.ISO8859-6',
    'ar_aa': 'ar_AA.ISO8859-6',
    'ar_sa': 'ar_SA.ISO8859-6',
    'arabic': 'ar_AA.ISO8859-6',
    'bg': 'bg_BG.ISO8859-5',
    'bg_bg': 'bg_BG.ISO8859-5',
    'bulgarian': 'bg_BG.ISO8859-5',
    'c-french': 'fr_CA.ISO8859-1',
    'c': 'C',
    'c_c': 'C',
    'cextend': 'en_US.ISO8859-1',
    'chinese-s': 'zh_CN.eucCN',
    'chinese-t': 'zh_TW.eucTW',
    'croatian': 'hr_HR.ISO8859-2',
    'cs': 'cs_CZ.ISO8859-2',
    'cs_cs': 'cs_CZ.ISO8859-2',
    'cs_cz': 'cs_CZ.ISO8859-2',
    'cz': 'cz_CZ.ISO8859-2',
    'cz_cz': 'cz_CZ.ISO8859-2',
    'czech': 'cs_CS.ISO8859-2',
    'da': 'da_DK.ISO8859-1',
    'da_dk': 'da_DK.ISO8859-1',
    'danish': 'da_DK.ISO8859-1',
    'de': 'de_DE.ISO8859-1',
    'de_at': 'de_AT.ISO8859-1',
    'de_ch': 'de_CH.ISO8859-1',
    'de_de': 'de_DE.ISO8859-1',
    'dutch': 'nl_BE.ISO8859-1',
    'ee': 'ee_EE.ISO8859-4',
    'el': 'el_GR.ISO8859-7',
    'el_gr': 'el_GR.ISO8859-7',
    'en': 'en_US.ISO8859-1',
    'en_au': 'en_AU.ISO8859-1',
    'en_ca': 'en_CA.ISO8859-1',
    'en_gb': 'en_GB.ISO8859-1',
    'en_ie': 'en_IE.ISO8859-1',
    'en_nz': 'en_NZ.ISO8859-1',
    'en_uk': 'en_GB.ISO8859-1',
    'en_us': 'en_US.ISO8859-1',
    'eng_gb': 'en_GB.ISO8859-1',
    'english': 'en_EN.ISO8859-1',
    'english_uk': 'en_GB.ISO8859-1',
    'english_united-states': 'en_US.ISO8859-1',
    'english_us': 'en_US.ISO8859-1',
    'es': 'es_ES.ISO8859-1',
    'es_ar': 'es_AR.ISO8859-1',
    'es_bo': 'es_BO.ISO8859-1',
    'es_cl': 'es_CL.ISO8859-1',
    'es_co': 'es_CO.ISO8859-1',
    'es_cr': 'es_CR.ISO8859-1',
    'es_ec': 'es_EC.ISO8859-1',
    'es_es': 'es_ES.ISO8859-1',
    'es_gt': 'es_GT.ISO8859-1',
    'es_mx': 'es_MX.ISO8859-1',
    'es_ni': 'es_NI.ISO8859-1',
    'es_pa': 'es_PA.ISO8859-1',
    'es_pe': 'es_PE.ISO8859-1',
    'es_py': 'es_PY.ISO8859-1',
    'es_sv': 'es_SV.ISO8859-1',
    'es_uy': 'es_UY.ISO8859-1',
    'es_ve': 'es_VE.ISO8859-1',
    'et': 'et_EE.ISO8859-4',
    'et_ee': 'et_EE.ISO8859-4',
    'fi': 'fi_FI.ISO8859-1',
    'fi_fi': 'fi_FI.ISO8859-1',
    'finnish': 'fi_FI.ISO8859-1',
    'fr': 'fr_FR.ISO8859-1',
    'fr_be': 'fr_BE.ISO8859-1',
    'fr_ca': 'fr_CA.ISO8859-1',
    'fr_ch': 'fr_CH.ISO8859-1',
    'fr_fr': 'fr_FR.ISO8859-1',
    'fre_fr': 'fr_FR.ISO8859-1',
    'french': 'fr_FR.ISO8859-1',
    'french_france': 'fr_FR.ISO8859-1',
    'ger_de': 'de_DE.ISO8859-1',
    'german': 'de_DE.ISO8859-1',
    'german_germany': 'de_DE.ISO8859-1',
    'greek': 'el_GR.ISO8859-7',
    'hebrew': 'iw_IL.ISO8859-8',
    'hr': 'hr_HR.ISO8859-2',
    'hr_hr': 'hr_HR.ISO8859-2',
    'hu': 'hu_HU.ISO8859-2',
    'hu_hu': 'hu_HU.ISO8859-2',
    'hungarian': 'hu_HU.ISO8859-2',
    'icelandic': 'is_IS.ISO8859-1',
    'id': 'id_ID.ISO8859-1',
    'id_id': 'id_ID.ISO8859-1',
    'is': 'is_IS.ISO8859-1',
    'is_is': 'is_IS.ISO8859-1',
    'iso-8859-1': 'en_US.ISO8859-1',
    'iso-8859-15': 'en_US.ISO8859-15',
    'iso8859-1': 'en_US.ISO8859-1',
    'iso8859-15': 'en_US.ISO8859-15',
    'iso_8859_1': 'en_US.ISO8859-1',
    'iso_8859_15': 'en_US.ISO8859-15',
    'it': 'it_IT.ISO8859-1',
    'it_ch': 'it_CH.ISO8859-1',
    'it_it': 'it_IT.ISO8859-1',
    'italian': 'it_IT.ISO8859-1',
    'iw': 'iw_IL.ISO8859-8',
    'iw_il': 'iw_IL.ISO8859-8',
    'ja': 'ja_JP.eucJP',
    'ja.jis': 'ja_JP.JIS7',
    'ja.sjis': 'ja_JP.SJIS',
    'ja_jp': 'ja_JP.eucJP',
    'ja_jp.ajec': 'ja_JP.eucJP',
    'ja_jp.euc': 'ja_JP.eucJP',
    'ja_jp.eucjp': 'ja_JP.eucJP',
    'ja_jp.iso-2022-jp': 'ja_JP.JIS7',
    'ja_jp.jis': 'ja_JP.JIS7',
    'ja_jp.jis7': 'ja_JP.JIS7',
    'ja_jp.mscode': 'ja_JP.SJIS',
    'ja_jp.sjis': 'ja_JP.SJIS',
    'ja_jp.ujis': 'ja_JP.eucJP',
    'japan': 'ja_JP.eucJP',
    'japanese': 'ja_JP.SJIS',
    'japanese-euc': 'ja_JP.eucJP',
    'japanese.euc': 'ja_JP.eucJP',
    'jp_jp': 'ja_JP.eucJP',
    'ko': 'ko_KR.eucKR',
    'ko_kr': 'ko_KR.eucKR',
    'ko_kr.euc': 'ko_KR.eucKR',
    'korean': 'ko_KR.eucKR',
    'lt': 'lt_LT.ISO8859-4',
    'lv': 'lv_LV.ISO8859-4',
    'mk': 'mk_MK.ISO8859-5',
    'mk_mk': 'mk_MK.ISO8859-5',
    'nl': 'nl_NL.ISO8859-1',
    'nl_be': 'nl_BE.ISO8859-1',
    'nl_nl': 'nl_NL.ISO8859-1',
    'no': 'no_NO.ISO8859-1',
    'no_no': 'no_NO.ISO8859-1',
    'norwegian': 'no_NO.ISO8859-1',
    'pl': 'pl_PL.ISO8859-2',
    'pl_pl': 'pl_PL.ISO8859-2',
    'polish': 'pl_PL.ISO8859-2',
    'portuguese': 'pt_PT.ISO8859-1',
    'portuguese_brazil': 'pt_BR.ISO8859-1',
    'posix': 'C',
    'posix-utf2': 'C',
    'pt': 'pt_PT.ISO8859-1',
    'pt_br': 'pt_BR.ISO8859-1',
    'pt_pt': 'pt_PT.ISO8859-1',
    'ro': 'ro_RO.ISO8859-2',
    'ro_ro': 'ro_RO.ISO8859-2',
    'ru': 'ru_RU.ISO8859-5',
    'ru_ru': 'ru_RU.ISO8859-5',
    'rumanian': 'ro_RO.ISO8859-2',
    'russian': 'ru_RU.ISO8859-5',
    'serbocroatian': 'sh_YU.ISO8859-2',
    'sh': 'sh_YU.ISO8859-2',
    'sh_hr': 'sh_HR.ISO8859-2',
    'sh_sp': 'sh_YU.ISO8859-2',
    'sh_yu': 'sh_YU.ISO8859-2',
    'sk': 'sk_SK.ISO8859-2',
    'sk_sk': 'sk_SK.ISO8859-2',
    'sl': 'sl_CS.ISO8859-2',
    'sl_cs': 'sl_CS.ISO8859-2',
    'sl_si': 'sl_SI.ISO8859-2',
    'slovak': 'sk_SK.ISO8859-2',
    'slovene': 'sl_CS.ISO8859-2',
    'sp': 'sp_YU.ISO8859-5',
    'sp_yu': 'sp_YU.ISO8859-5',
    'spanish': 'es_ES.ISO8859-1',
    'spanish_spain': 'es_ES.ISO8859-1',
    'sr_sp': 'sr_SP.ISO8859-2',
    'sv': 'sv_SE.ISO8859-1',
    'sv_se': 'sv_SE.ISO8859-1',
    'swedish': 'sv_SE.ISO8859-1',
    'th_th': 'th_TH.TACTIS',
    'tr': 'tr_TR.ISO8859-9',
    'tr_tr': 'tr_TR.ISO8859-9',
    'turkish': 'tr_TR.ISO8859-9',
    'univ': 'en_US.utf',
    'universal': 'en_US.utf',
    'zh': 'zh_CN.eucCN',
    'zh_cn': 'zh_CN.eucCN',
    'zh_cn.big5': 'zh_TW.eucTW',
    'zh_cn.euc': 'zh_CN.eucCN',
    'zh_tw': 'zh_TW.eucTW',
    'zh_tw.euc': 'zh_TW.eucTW',
}

def _print_categories(categories):

    """ Prints language code and encoding as reported by get_locale()
        for every entry of categories, a dictionary mapping category
        names to LC_* values.
    """
    for name, category in categories.items():
        print('%s ...' % name)
        lang, enc = get_locale(category)
        print(' Language:  %s' % (lang or '(undefined)'))
        print(' Encoding:  %s' % (enc or '(undefined)'))
        print('')

def _print_locale():

    """ Test function.

        Prints the defaults found by get_default() followed by the
        settings of all LC_* categories except LC_ALL: as found on
        entry, after set_to_default() and after setlocale(LC_ALL,"").
    """
    # Collect the LC_* constants defined at module level
    categories = {}
    for k, v in globals().items():
        if k[:3] == 'LC_':
            categories[k] = v
    # get_locale() does not support LC_ALL
    del categories['LC_ALL']

    print('Locale defaults as determined by get_default():')
    print('-' * 72)
    lang, enc = get_default()
    print('Language:  %s' % (lang or '(undefined)'))
    print('Encoding:  %s' % (enc or '(undefined)'))
    print('')

    print('Locale settings on startup:')
    print('-' * 72)
    _print_categories(categories)

    set_to_default()
    print('')
    print('Locale settings after calling set_to_default():')
    print('-' * 72)
    _print_categories(categories)

    try:
        setlocale(LC_ALL, "")
    except Error:
        # Only a locale failure is expected here; anything else
        # (e.g. a keyboard interrupt) is allowed to propagate.
        print('NOTE:')
        print('setlocale(LC_ALL,"") does not support the default locale')
        print('given in the OS environment variables.')
    else:
        print('')
        print('Locale settings after calling setlocale(LC_ALL,""):')
        print('-' * 72)
        _print_categories(categories)

###

# When run as a script, exercise both test functions of the module.
if __name__ == '__main__':
    print('Locale aliasing:')
    print('')
    _print_locale()
    print('')
    print('Number formatting:')
    print('')
    _test()