blob: c39ce6e3595383d2dbd6963799763b8880367ca1 [file] [log] [blame]
Barry Warsawaf572511999-08-11 21:40:38 +00001#! /usr/bin/env python
Barry Warsawa507c321999-11-03 16:46:05 +00002# Originally written by Barry Warsaw <bwarsaw@python.org>
Barry Warsawc8f08922000-02-26 20:56:47 +00003#
4# minimally patched to make it even more xgettext compatible
5# by Peter Funk <pf@artcom-gmbh.de>
Barry Warsawe27db5a1999-08-13 20:59:48 +00006
7"""pygettext -- Python equivalent of xgettext(1)
8
9Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
10internationalization of C programs. Most of these tools are independent of
11the programming language and can be used from within Python programs. Martin
12von Loewis' work[1] helps considerably in this regard.
13
Barry Warsaw5dbf5261999-11-03 18:47:52 +000014There's one problem though; xgettext is the program that scans source code
Barry Warsawe27db5a1999-08-13 20:59:48 +000015looking for message strings, but it groks only C (or C++). Python introduces
16a few wrinkles, such as dual quoting characters, triple quoted strings, and
17raw strings. xgettext understands none of this.
18
19Enter pygettext, which uses Python's standard tokenize module to scan Python
20source code, generating .pot files identical to what GNU xgettext[2] generates
Barry Warsaw5dbf5261999-11-03 18:47:52 +000021for C and C++ code. From there, the standard GNU tools can be used.
Barry Warsawe27db5a1999-08-13 20:59:48 +000022
23A word about marking Python strings as candidates for translation. GNU
24xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
25gettext_noop. But those can be a lot of text to include all over your code.
Barry Warsaw5dbf5261999-11-03 18:47:52 +000026C and C++ have a trick: they use the C preprocessor. Most internationalized C
Barry Warsawe27db5a1999-08-13 20:59:48 +000027source includes a #define for gettext() to _() so that what has to be written
28in the source is much less. Thus these are both translatable strings:
29
30 gettext("Translatable String")
31 _("Translatable String")
32
33Python of course has no preprocessor so this doesn't work so well. Thus,
34pygettext searches only for _() by default, but see the -k/--keyword flag
35below for how to augment this.
36
37 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
38 [2] http://www.gnu.org/software/gettext/gettext.html
39
Barry Warsawe27db5a1999-08-13 20:59:48 +000040NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
Barry Warsawc8f08922000-02-26 20:56:47 +000041wherever possible. However some options are still missing or are not fully
42implemented.
Barry Warsawe27db5a1999-08-13 20:59:48 +000043
44Usage: pygettext [options] filename ...
45
46Options:
47
48 -a
49 --extract-all
50 Extract all strings
51
Barry Warsawc8f08922000-02-26 20:56:47 +000052 -d name
53 --default-domain=name
54 Rename the default output file from messages.pot to name.pot
55
56 -E
57 --escape
58 replace non-ASCII characters with octal escape sequences.
59
60 -h
61 --help
62 print this help message and exit
Barry Warsawe27db5a1999-08-13 20:59:48 +000063
64 -k [word]
65 --keyword[=word]
66 Additional keywords to look for. Without `word' means not to use the
67 default keywords. The default keywords, which are always looked for
68 if not explicitly disabled: _
69
70 The default keyword list is different than GNU xgettext. You can have
71 multiple -k flags on the command line.
72
73 --no-location
74 Do not write filename/lineno location comments
75
76 -n [style]
77 --add-location[=style]
78 Write filename/lineno location comments indicating where each
79 extracted string is found in the source. These lines appear before
80 each msgid. Two styles are supported:
81
82 Solaris # File: filename, line: line-number
83 Gnu #: filename:line
84
85 If style is omitted, Gnu is used. The style name is case
86 insensitive. By default, locations are included.
87
Barry Warsawc8f08922000-02-26 20:56:47 +000088 -o filename
89 --output=filename
90 Rename the default output file from messages.pot to filename.
91
92 -p dir
93 --output-dir=dir
94 Output files will be placed in directory dir.
95
Barry Warsaw5dbf5261999-11-03 18:47:52 +000096 -v
97 --verbose
98 Print the names of the files being processed.
99
Barry Warsawc8f08922000-02-26 20:56:47 +0000100 -V
101 --version
102 Print the version of pygettext and exit.
103
104 -w columns
105 --width=columns
106 Set width of output to columns.
107
108 -x filename
109 --exclude-file=filename
110 Specify a file that contains a list of strings that are not to be
111 extracted from the input files. Each string to be excluded must
112 appear on a line by itself in the file.
Barry Warsawe27db5a1999-08-13 20:59:48 +0000113
114"""
115
116import os
117import sys
118import string
119import time
120import getopt
121import tokenize
122
# Version of this script.  It is reported by the -V/--version option and is
# substituted into the "Generated-By" line of the .pot header below.
__version__ = '1.0'



# for selftesting
# If the optional `fintl' module can be imported, its gettext() is used as
# the translation function `_'; otherwise `_' is an identity function, so
# every _('...') call in this file simply yields its argument.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s): return s


# The normal pot-file header.  msgmerge and EMACS' po-mode work better if
# it's there.
#
# The template is filled in with the % operator by TokenEater.write(),
# which supplies the keys `time' and `version'.  The doubled backslashes
# produce a literal backslash-n inside the quoted header lines of the
# output file rather than a real newline.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
154
155
def usage(code, msg=''):
    """Print the module docstring as help text, then exit with *code*.

    code -- process exit status handed to sys.exit().  A false value (0)
            means help was requested, so the text goes to stdout; any
            other value is treated as a usage error and goes to stderr.
    msg  -- optional extra message (a string or an exception instance)
            printed after the help text.

    Never returns: always raises SystemExit via sys.exit().
    """
    if code:
        out = sys.stderr
    else:
        out = sys.stdout
    # __doc__ is None when docstrings are stripped (python -OO); applying
    # the % operator to None would raise TypeError and hide the message.
    if __doc__:
        out.write(__doc__ % globals() + '\n')
    if msg:
        # Wrap in a tuple so any object is formatted via str().
        out.write('%s\n' % (msg,))
    sys.exit(code)
161
Barry Warsawc8f08922000-02-26 20:56:47 +0000162
Barry Warsawe27db5a1999-08-13 20:59:48 +0000163
# Translation table indexed by character code: escapes[ord(c)] is the text
# written to the .pot file for the character c.  It is empty until
# make_escapes() has been called.
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the module-level `escapes' table in place.

    pass_iso8859 -- if true, character codes 160..254 are emitted
                    unchanged; if false, every code outside 32..126 is
                    turned into a three-digit octal escape.

    Backslash, tab, carriage return, newline and double quote always get
    their C-style two-character escapes.

    The table is rebuilt from scratch on every call, so calling this
    function again (possibly with a different argument) always yields
    exactly 256 entries that reflect the latest argument.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through, so that e.g. an
        # o-umlaut (0xF6) is written as-is instead of as \366.  Taking
        # the code modulo 128 folds 160..254 onto the printable range
        # 32..126; 128..159 and 255 fold onto control codes and are
        # still escaped.
        mod = 128
    else:
        # Escape any character outside the 32..126 range.
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Replace the contents rather than rebinding the name, so references
    # to the list taken before this call keep seeing the current table.
    escapes[:] = table
185
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000186
def escape(s):
    """Return *s* with each character replaced by its .pot-file spelling.

    The replacement for a character is looked up by character code in
    the module-level `escapes' table, which must have been filled in by
    make_escapes() first (otherwise the lookup raises IndexError).

    Characters whose code is 256 or above have no table entry; they are
    copied through unchanged instead of raising IndexError.
    """
    pieces = []
    for ch in s:
        code = ord(ch)
        if code < 256:
            pieces.append(escapes[code])
        else:
            pieces.append(ch)
    return ''.join(pieces)
193
194
def safe_eval(s):
    """Evaluate the source text *s* of a string literal; return its value.

    The expression is evaluated with an empty builtins namespace and no
    local names, so it cannot call any builtin function.
    """
    # NOTE(review): this is eval().  Within this file it is only applied
    # to STRING tokens produced by the tokenizer; do not reuse it on
    # arbitrary external text.
    restricted_globals = {'__builtins__': {}}
    no_locals = {}
    return eval(s, restricted_globals, no_locals)
198
199
def normalize(s):
    """Render the Python string *s* as a C-style quoted string for a .po file.

    A string without newlines becomes one double-quoted, escaped line.
    A string containing newlines becomes an empty "" line followed by one
    quoted line per source line, each but the last ending in an escaped
    newline.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline: drop the empty tail and put the newline
        # back on the last real line so that escape() renders it.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
Barry Warsawe27db5a1999-08-13 20:59:48 +0000214
215
216
class TokenEater:
    """Token consumer that collects the string arguments of keyword calls.

    An instance is called once per token with the five fields produced by
    the tokenize module.  It runs a small state machine:

      waiting     -- looking for a NAME token listed in options.keywords
      keywordseen -- a keyword was just seen; the next token must be `('
      openseen    -- inside the call; STRING tokens are accumulated
                     until the closing `)'

    The options object must provide `keywords' and `toexclude' for
    scanning, and `location', `width', `GNU' and `SOLARIS' for write().
    """

    def __init__(self, options):
        self.__options = options
        # Maps message text to a list of (filename, lineno) occurrences.
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Set by set_filename().  Initialised here so that recording a
        # message before the first set_filename() call cannot raise
        # AttributeError.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state handler; only the token type, its
        # text and the starting row are needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = ''.join(self.__data)
                if msg not in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def set_filename(self, filename):
        """Set the file name recorded for messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the header and all collected messages to *fp*.

        fp -- an object with a write() method accepting strings.

        Output goes straight to fp; sys.stdout is not touched.  Entries
        are written sorted by their occurrence lists, so the output order
        is deterministic.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        emit(pot_header % {'time': timestamp, 'version': __version__} + '\n')
        entries = [(places, msg) for msg, places in self.__messages.items()]
        entries.sort()
        for places, msg in entries:
            # location comments are different b/w Solaris and GNU:
            if options.location == options.SOLARIS:
                for filename, lineno in places:
                    d = {'filename': filename, 'lineno': lineno}
                    emit(_('# File: %(filename)s, line: %(lineno)d') % d
                         + '\n')
            elif options.location == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceed 'options.width'
                locline = '#:'
                for filename, lineno in places:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        emit(locline + '\n')
                        locline = "#:" + s
                if len(locline) > 2:
                    emit(locline + '\n')
            emit('msgid ' + normalize(msg) + '\n')
            # An empty msgstr followed by a blank separator line.
            emit('msgstr ""\n\n')
298
299
def main():
    """Command-line entry point.

    Parses sys.argv, builds the escape table, loads the optional exclude
    file, feeds every input file's tokens to a TokenEater and writes the
    resulting .pot file.  Exits through usage() on bad options and with
    status 1 if the exclude file cannot be read.

    getopt has no notion of optional option arguments, so -k/--keyword
    and -n/--add-location always take a value; an empty value (for
    example `--keyword=') stands for "argument omitted".
    """
    default_keywords = ['_']
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:Ehk:n:o:p:Vvw:x:',
            # Every long option that takes a value needs the trailing `=',
            # otherwise getopt rejects `--option=value'.
            ['extract-all', 'default-domain=', 'escape', 'help', 'keyword=',
             'add-location=', 'no-location', 'output=', 'output-dir=',
             'verbose', 'version', 'width=', 'exclude-file=',
             ])
    except getopt.error:
        # sys.exc_info() instead of `except X, e' / `except X as e' keeps
        # this line valid on both old and new interpreters.
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        outpath = ''
        outfile = 'messages.pot'
        location = GNU
        verbose = 0
        width = 78
        excludefilename = ''

    options = Options()
    # Mutable state lives on the instance rather than on the class.
    options.keywords = []
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-k', '--keyword'):
            # getopt never yields None; a missing value is the empty string.
            if not arg:
                default_keywords = []
            else:
                options.keywords.append(arg)
        elif opt in ('-n', '--add-location'):
            if not arg:
                arg = 'gnu'
            try:
                options.location = locations[arg.lower()]
            except KeyError:
                d = {'arg':arg}
                usage(1, _('Invalid value for --add-location: %(arg)s') % d)
        elif opt in ('--no-location',):
            options.location = 0
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                d = {'arg':arg}
                usage(1, _('Invalid value for --width: %(arg)s, must be int')
                      % d)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg

    # calculate escapes.  make_escapes() takes a "let ISO-8859 characters
    # pass through" flag, which is the opposite of -E/--escape ("replace
    # non-ASCII characters with octal escape sequences").
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                lines = fp.readlines()
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_("Can't read --exclude-file: %s") %
                             options.excludefilename + '\n')
            sys.exit(1)
        # readlines() keeps the line terminators; strip them so that the
        # entries can compare equal to extracted message strings.
        options.toexclude = [line.rstrip('\r\n') for line in lines]
    else:
        options.toexclude = []

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if options.verbose:
            print(_('Working on %(filename)s') % {'filename':filename})
        fp = open(filename)
        try:
            eater.set_filename(filename)
            # generate_tokens() yields the same five fields that the
            # older tokenize.tokenize(readline, tokeneater) callback
            # interface passed to its callback.
            for token in tokenize.generate_tokens(fp.readline):
                eater(*token)
        finally:
            fp.close()

    if options.outpath:
        options.outfile = os.path.join(options.outpath, options.outfile)
    fp = open(options.outfile, 'w')
    try:
        eater.write(fp)
    finally:
        fp.close()



if __name__ == '__main__':
    main()