blob: 546874c5990755f60ec17f8ba08632b624f127ce [file] [log] [blame]
Barry Warsawaf572511999-08-11 21:40:38 +00001#! /usr/bin/env python
Barry Warsawa507c321999-11-03 16:46:05 +00002# Originally written by Barry Warsaw <bwarsaw@python.org>
Barry Warsawc8f08922000-02-26 20:56:47 +00003#
4# minimally patched to make it even more xgettext compatible
5# by Peter Funk <pf@artcom-gmbh.de>
Barry Warsawe27db5a1999-08-13 20:59:48 +00006
Barry Warsawa17e0f12000-03-08 15:18:35 +00007# for selftesting
8try:
9 import fintl
10 _ = fintl.gettext
11except ImportError:
12 def _(s): return s
13
14
15__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
Barry Warsawe27db5a1999-08-13 20:59:48 +000016
17Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
18internationalization of C programs. Most of these tools are independent of
19the programming language and can be used from within Python programs. Martin
20von Loewis' work[1] helps considerably in this regard.
21
Barry Warsaw5dbf5261999-11-03 18:47:52 +000022There's one problem though; xgettext is the program that scans source code
Barry Warsawe27db5a1999-08-13 20:59:48 +000023looking for message strings, but it groks only C (or C++). Python introduces
24a few wrinkles, such as dual quoting characters, triple quoted strings, and
25raw strings. xgettext understands none of this.
26
27Enter pygettext, which uses Python's standard tokenize module to scan Python
28source code, generating .pot files identical to what GNU xgettext[2] generates
Barry Warsaw5dbf5261999-11-03 18:47:52 +000029for C and C++ code. From there, the standard GNU tools can be used.
Barry Warsawe27db5a1999-08-13 20:59:48 +000030
31A word about marking Python strings as candidates for translation. GNU
32xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
33gettext_noop. But those can be a lot of text to include all over your code.
Barry Warsaw5dbf5261999-11-03 18:47:52 +000034C and C++ have a trick: they use the C preprocessor. Most internationalized C
Barry Warsawe27db5a1999-08-13 20:59:48 +000035source includes a #define for gettext() to _() so that what has to be written
36in the source is much less. Thus these are both translatable strings:
37
38 gettext("Translatable String")
39 _("Translatable String")
40
41Python of course has no preprocessor so this doesn't work so well. Thus,
42pygettext searches only for _() by default, but see the -k/--keyword flag
43below for how to augment this.
44
45 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
46 [2] http://www.gnu.org/software/gettext/gettext.html
47
Barry Warsawe27db5a1999-08-13 20:59:48 +000048NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
Barry Warsawc8f08922000-02-26 20:56:47 +000049where ever possible. However some options are still missing or are not fully
Barry Warsawa17e0f12000-03-08 15:18:35 +000050implemented. Also, xgettext's use of command line switches with option
51arguments is broken, and in these cases, pygettext just defines additional
52switches.
Barry Warsawe27db5a1999-08-13 20:59:48 +000053
Barry Warsawa17e0f12000-03-08 15:18:35 +000054Usage: pygettext [options] inputfile ...
Barry Warsawe27db5a1999-08-13 20:59:48 +000055
56Options:
57
58 -a
59 --extract-all
60 Extract all strings
61
Barry Warsawc8f08922000-02-26 20:56:47 +000062 -d name
63 --default-domain=name
64 Rename the default output file from messages.pot to name.pot
65
66 -E
67 --escape
68 replace non-ASCII characters with octal escape sequences.
69
70 -h
71 --help
72 print this help message and exit
Barry Warsawe27db5a1999-08-13 20:59:48 +000073
Barry Warsawa17e0f12000-03-08 15:18:35 +000074 -k word
75 --keyword=word
76 Keywords to look for in addition to the default set, which are:
77 %(DEFAULTKEYWORDS)s
Barry Warsawe27db5a1999-08-13 20:59:48 +000078
Barry Warsawa17e0f12000-03-08 15:18:35 +000079 You can have multiple -k flags on the command line.
80
81 -K
82 --no-default-keywords
83 Disable the default set of keywords (see above). Any keywords
84 explicitly added with the -k/--keyword option are still recognized.
Barry Warsawe27db5a1999-08-13 20:59:48 +000085
86 --no-location
Barry Warsawa17e0f12000-03-08 15:18:35 +000087 Do not write filename/lineno location comments.
Barry Warsawe27db5a1999-08-13 20:59:48 +000088
Barry Warsawa17e0f12000-03-08 15:18:35 +000089 -n
90 --add-location
Barry Warsawe27db5a1999-08-13 20:59:48 +000091 Write filename/lineno location comments indicating where each
92 extracted string is found in the source. These lines appear before
Barry Warsawa17e0f12000-03-08 15:18:35 +000093 each msgid. The style of comments is controlled by the -S/--style
94 option. This is the default.
95
96 -S stylename
97 --style stylename
98 Specify which style to use for location comments. Two styles are
99 supported:
Barry Warsawe27db5a1999-08-13 20:59:48 +0000100
101 Solaris # File: filename, line: line-number
Barry Warsawa17e0f12000-03-08 15:18:35 +0000102 GNU #: filename:line
Barry Warsawe27db5a1999-08-13 20:59:48 +0000103
Barry Warsawa17e0f12000-03-08 15:18:35 +0000104 The style name is case insensitive. GNU style is the default.
Barry Warsawe27db5a1999-08-13 20:59:48 +0000105
Barry Warsawc8f08922000-02-26 20:56:47 +0000106 -o filename
107 --output=filename
Barry Warsawa17e0f12000-03-08 15:18:35 +0000108 Rename the default output file from messages.pot to filename. If
109 filename is `-' then the output is sent to standard out.
Barry Warsawc8f08922000-02-26 20:56:47 +0000110
111 -p dir
112 --output-dir=dir
113 Output files will be placed in directory dir.
114
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000115 -v
116 --verbose
117 Print the names of the files being processed.
118
Barry Warsawc8f08922000-02-26 20:56:47 +0000119 -V
120 --version
121 Print the version of pygettext and exit.
122
123 -w columns
124 --width=columns
125 Set width of output to columns.
126
127 -x filename
128 --exclude-file=filename
129 Specify a file that contains a list of strings that are not to be
130 extracted from the input files. Each string to be excluded must
131 appear on a line by itself in the file.
Barry Warsawe27db5a1999-08-13 20:59:48 +0000132
Barry Warsawa17e0f12000-03-08 15:18:35 +0000133If `inputfile' is -, standard input is read.
134
135""")
Barry Warsawe27db5a1999-08-13 20:59:48 +0000136
137import os
138import sys
Barry Warsawe27db5a1999-08-13 20:59:48 +0000139import time
140import getopt
141import tokenize
142
# Version string printed by -V/--version and written into the .pot header.
__version__ = '1.1'

# Keywords searched for by default.  DEFAULTKEYWORDS is the printable form
# that usage() interpolates into the help text via %(DEFAULTKEYWORDS)s.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used with join() to concatenate string fragments.
EMPTYSTRING = ''
Barry Warsawe27db5a1999-08-13 20:59:48 +0000149
150
151
# The normal pot-file header. msgmerge and EMACS' po-mode work better if
# it's there.
# TokenEater.write() fills in %(time)s and %(version)s before emitting it.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
171
172
Barry Warsawe27db5a1999-08-13 20:59:48 +0000173def usage(code, msg=''):
174 print __doc__ % globals()
175 if msg:
176 print msg
177 sys.exit(code)
178
Barry Warsawc8f08922000-02-26 20:56:47 +0000179
Barry Warsawe27db5a1999-08-13 20:59:48 +0000180
# Escape table indexed by character code (0..255); filled by make_escapes()
# and consulted by escape().
escapes = []


def make_escapes(pass_iso8859):
    """(Re)build the module-level ``escapes`` table.

    After the call, ``escapes[i]`` is the text to emit for the character
    with code ``i``: either the character itself or a backslash escape.

    pass_iso8859 -- when true, characters whose code modulo 128 lies in
                    32..126 are passed through unchanged, so the upper
                    (iso-8859) half of the range is kept as-is.  When
                    false, only codes 32..126 are passed through and
                    everything else becomes a three-digit octal escape.

    Backslash, tab, carriage return, newline and double quote always get
    their C-style escapes.

    The table is rebuilt from scratch and stored in place, so calling this
    function more than once never leaves stale or duplicated entries and
    any existing reference to ``escapes`` stays valid.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. an
        # o-umlaut in a msgid is written as-is rather than as \366.
        # Otherwise we escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Slice assignment replaces the old contents instead of appending to
    # them (the previous code grew the list by 256 entries on every call).
    escapes[:] = table
202
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000203
def escape(s):
    """Return s with every character replaced by its ``escapes`` entry.

    Each character is looked up by its ord() value in the module-level
    ``escapes`` table, so that table must have been populated by
    make_escapes() before this is called.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000210
211
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    s is evaluated as a Python expression with an empty ``__builtins__``
    mapping as globals and an empty dict as locals.
    """
    # NOTE(review): eval() with emptied builtins only removes the builtin
    # names; it is not a real sandbox.  The caller in this file passes only
    # STRING tokens produced by tokenize -- keep it that way.
    restricted_globals = {'__builtins__': {}}
    no_locals = {}
    return eval(s, restricted_globals, no_locals)
215
216
def normalize(s):
    """Convert a Python string value into the quoted form used in .po files.

    A value containing no newline becomes one double-quoted, escaped
    string.  A value containing newlines becomes an empty "" followed by
    one double-quoted, escaped string per line of the value, the lines
    being joined by an escaped newline plus a real line break.
    """
    if '\n' not in s:
        return '"' + escape(s) + '"'
    parts = s.split('\n')
    if not parts[-1]:
        # The value ended with a newline: drop the empty tail produced by
        # split() and put the newline back on the real last line so that
        # it is escaped together with that line.
        parts.pop()
        parts[-1] = parts[-1] + '\n'
    quoted = [escape(part) for part in parts]
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
Barry Warsawe27db5a1999-08-13 20:59:48 +0000232
233
234
class TokenEater:
    """Token callback for tokenize.tokenize() that collects marked strings.

    An instance is called once per token.  It runs a three-state machine:

      waiting     -- looking for a NAME token listed in options.keywords
      keywordseen -- a keyword was seen; the next token must be '('
      openseen    -- inside keyword( ... ); STRING tokens are collected
                     until the closing ')'

    Collected messages are kept in a dict mapping the message text to a
    list of (filename, lineno) entries, and are written out by write().

    The options object must provide: keywords, toexclude, writelocations,
    locationstyle, SOLARIS, GNU and width.
    """

    def __init__(self, options):
        self.__options = options
        # message text -> list of (filename, lineno) where it was found
        self.__messages = {}
        self.__state = self.__waiting
        # string fragments collected for the call currently being scanned
        self.__data = []
        self.__lineno = -1
        # Name recorded with each message; None until set_filename() is
        # called, so scanning without a filename no longer raises
        # AttributeError.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state handler; stup[0] is the row on
        # which the token starts.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = EMPTYSTRING.join(self.__data)
                if msg not in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    linenos = self.__messages.get(msg)
                    if linenos is None:
                        self.__messages[msg] = [entry]
                    else:
                        linenos.append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def set_filename(self, filename):
        """Set the filename recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the .pot header and all collected messages to fp.

        fp is any object with a write() method.  Output goes straight to
        fp; sys.stdout is not touched, so a redirection installed by the
        caller survives this call.  Messages are emitted in the dict's
        iteration order (no sorting is done).
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        emit(pot_header % {'time': timestamp, 'version': __version__} + '\n')
        for k, v in self.__messages.items():
            if options.writelocations:
                # location comments are different b/w Solaris and GNU:
                if options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        emit(_('# File: %(filename)s, line: %(lineno)d') % d
                             + '\n')
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            emit(locline + '\n')
                            locline = "#:" + s
                    if len(locline) > 2:
                        emit(locline + '\n')
            # TBD: sorting, normalizing
            emit('msgid ' + normalize(k) + '\n')
            # blank line after each entry
            emit('msgstr ""\n\n')
318
319
320def main():
Barry Warsawa17e0f12000-03-08 15:18:35 +0000321 global default_keywords
Barry Warsawe27db5a1999-08-13 20:59:48 +0000322 try:
323 opts, args = getopt.getopt(
324 sys.argv[1:],
Barry Warsawa17e0f12000-03-08 15:18:35 +0000325 'ad:Ehk:Kno:p:S:Vvw:x:',
326 ['extract-all', 'default-domain', 'escape', 'help',
327 'keyword=', 'no-default-keywords',
Barry Warsawc8f08922000-02-26 20:56:47 +0000328 'add-location', 'no-location', 'output=', 'output-dir=',
Barry Warsawa17e0f12000-03-08 15:18:35 +0000329 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
Barry Warsawc8f08922000-02-26 20:56:47 +0000330 ])
Barry Warsawe27db5a1999-08-13 20:59:48 +0000331 except getopt.error, msg:
332 usage(1, msg)
333
334 # for holding option values
335 class Options:
336 # constants
337 GNU = 1
338 SOLARIS = 2
339 # defaults
Barry Warsawc8f08922000-02-26 20:56:47 +0000340 extractall = 0 # FIXME: currently this option has no effect at all.
341 escape = 0
Barry Warsawe27db5a1999-08-13 20:59:48 +0000342 keywords = []
Barry Warsawc8f08922000-02-26 20:56:47 +0000343 outpath = ''
Barry Warsawe27db5a1999-08-13 20:59:48 +0000344 outfile = 'messages.pot'
Barry Warsawa17e0f12000-03-08 15:18:35 +0000345 writelocations = 1
346 locationstyle = GNU
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000347 verbose = 0
Barry Warsawc8f08922000-02-26 20:56:47 +0000348 width = 78
349 excludefilename = ''
Barry Warsawe27db5a1999-08-13 20:59:48 +0000350
351 options = Options()
352 locations = {'gnu' : options.GNU,
353 'solaris' : options.SOLARIS,
354 }
355
356 # parse options
357 for opt, arg in opts:
358 if opt in ('-h', '--help'):
359 usage(0)
Barry Warsawc8f08922000-02-26 20:56:47 +0000360 elif opt in ('-a', '--extract-all'):
361 options.extractall = 1
362 elif opt in ('-d', '--default-domain'):
363 options.outfile = arg + '.pot'
364 elif opt in ('-E', '--escape'):
365 options.escape = 1
Barry Warsawe27db5a1999-08-13 20:59:48 +0000366 elif opt in ('-k', '--keyword'):
Barry Warsawe27db5a1999-08-13 20:59:48 +0000367 options.keywords.append(arg)
Barry Warsawa17e0f12000-03-08 15:18:35 +0000368 elif opt in ('-K', '--no-default-keywords'):
369 default_keywords = []
Barry Warsawe27db5a1999-08-13 20:59:48 +0000370 elif opt in ('-n', '--add-location'):
Barry Warsawa17e0f12000-03-08 15:18:35 +0000371 options.writelocations = 1
Barry Warsawe27db5a1999-08-13 20:59:48 +0000372 elif opt in ('--no-location',):
Barry Warsawa17e0f12000-03-08 15:18:35 +0000373 options.writelocations = 0
374 elif opt in ('-S', '--style'):
375 options.locationstyle = locations.get(arg.lower())
376 if options.locationstyle is None:
377 usage(1, _('Invalid value for --style: %s') % arg)
Barry Warsawc8f08922000-02-26 20:56:47 +0000378 elif opt in ('-o', '--output'):
379 options.outfile = arg
380 elif opt in ('-p', '--output-dir'):
381 options.outpath = arg
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000382 elif opt in ('-v', '--verbose'):
383 options.verbose = 1
Barry Warsawc8f08922000-02-26 20:56:47 +0000384 elif opt in ('-V', '--version'):
385 print _('pygettext.py (xgettext for Python) %s') % __version__
386 sys.exit(0)
387 elif opt in ('-w', '--width'):
388 try:
389 options.width = int(arg)
390 except ValueError:
Barry Warsawa17e0f12000-03-08 15:18:35 +0000391 usage(1, _('--width argument must be an integer: %s') % arg)
Barry Warsawc8f08922000-02-26 20:56:47 +0000392 elif opt in ('-x', '--exclude-file'):
393 options.excludefilename = arg
394
395 # calculate escapes
Barry Warsaw7733e122000-02-27 14:30:48 +0000396 make_escapes(options.escape)
Barry Warsawe27db5a1999-08-13 20:59:48 +0000397
398 # calculate all keywords
399 options.keywords.extend(default_keywords)
400
Barry Warsawc8f08922000-02-26 20:56:47 +0000401 # initialize list of strings to exclude
402 if options.excludefilename:
403 try:
404 fp = open(options.excludefilename)
405 options.toexclude = fp.readlines()
406 fp.close()
407 except IOError:
408 sys.stderr.write(_("Can't read --exclude-file: %s") %
409 options.excludefilename)
410 sys.exit(1)
411 else:
412 options.toexclude = []
413
Barry Warsawe27db5a1999-08-13 20:59:48 +0000414 # slurp through all the files
415 eater = TokenEater(options)
416 for filename in args:
Barry Warsawa17e0f12000-03-08 15:18:35 +0000417 if filename == '-':
418 if options.verbose:
419 print _('Reading standard input')
420 fp = sys.stdin
421 closep = 0
422 else:
423 if options.verbose:
424 print _('Working on %s') % filename
425 fp = open(filename)
426 closep = 1
427 try:
428 eater.set_filename(filename)
429 tokenize.tokenize(fp.readline, eater)
430 finally:
431 if closep:
432 fp.close()
Barry Warsawe27db5a1999-08-13 20:59:48 +0000433
Barry Warsawa17e0f12000-03-08 15:18:35 +0000434 # write the output
435 if options.outfile == '-':
436 fp = sys.stdout
437 closep = 0
438 else:
439 if options.outpath:
440 options.outfile = os.path.join(options.outpath, options.outfile)
441 fp = open(options.outfile, 'w')
442 closep = 1
443 try:
444 eater.write(fp)
445 finally:
446 if closep:
447 fp.close()
Barry Warsawe27db5a1999-08-13 20:59:48 +0000448
449
# When run as a script, hand control to main().
if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')