blob: 4ff4962d62b0024558bb8a7ef4146c81c2d4b9c0 [file] [log] [blame]
Barry Warsawaf572511999-08-11 21:40:38 +00001#! /usr/bin/env python
Barry Warsawa507c321999-11-03 16:46:05 +00002# Originally written by Barry Warsaw <bwarsaw@python.org>
Barry Warsawc8f08922000-02-26 20:56:47 +00003#
4# minimally patched to make it even more xgettext compatible
5# by Peter Funk <pf@artcom-gmbh.de>
Barry Warsawe27db5a1999-08-13 20:59:48 +00006
7"""pygettext -- Python equivalent of xgettext(1)
8
9Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
10internationalization of C programs. Most of these tools are independent of
11the programming language and can be used from within Python programs. Martin
12von Loewis' work[1] helps considerably in this regard.
13
Barry Warsaw5dbf5261999-11-03 18:47:52 +000014There's one problem though; xgettext is the program that scans source code
Barry Warsawe27db5a1999-08-13 20:59:48 +000015looking for message strings, but it groks only C (or C++). Python introduces
16a few wrinkles, such as dual quoting characters, triple quoted strings, and
17raw strings. xgettext understands none of this.
18
19Enter pygettext, which uses Python's standard tokenize module to scan Python
20source code, generating .pot files identical to what GNU xgettext[2] generates
Barry Warsaw5dbf5261999-11-03 18:47:52 +000021for C and C++ code. From there, the standard GNU tools can be used.
Barry Warsawe27db5a1999-08-13 20:59:48 +000022
23A word about marking Python strings as candidates for translation. GNU
24xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
25gettext_noop. But those can be a lot of text to include all over your code.
Barry Warsaw5dbf5261999-11-03 18:47:52 +000026C and C++ have a trick: they use the C preprocessor. Most internationalized C
Barry Warsawe27db5a1999-08-13 20:59:48 +000027source includes a #define for gettext() to _() so that what has to be written
28in the source is much less. Thus these are both translatable strings:
29
30 gettext("Translatable String")
31 _("Translatable String")
32
33Python of course has no preprocessor so this doesn't work so well. Thus,
34pygettext searches only for _() by default, but see the -k/--keyword flag
35below for how to augment this.
36
37 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
38 [2] http://www.gnu.org/software/gettext/gettext.html
39
Barry Warsawe27db5a1999-08-13 20:59:48 +000040NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
Barry Warsawc8f08922000-02-26 20:56:47 +000041wherever possible. However, some options are still missing or are not fully
42implemented.
Barry Warsawe27db5a1999-08-13 20:59:48 +000043
44Usage: pygettext [options] filename ...
45
46Options:
47
48 -a
49 --extract-all
50 Extract all strings
51
Barry Warsawc8f08922000-02-26 20:56:47 +000052 -d name
53 --default-domain=name
54 Rename the default output file from messages.pot to name.pot
55
56 -E
57 --escape
58 replace non-ASCII characters with octal escape sequences.
59
60 -h
61 --help
62 print this help message and exit
Barry Warsawe27db5a1999-08-13 20:59:48 +000063
64 -k [word]
65 --keyword[=word]
66 Additional keywords to look for. Without `word' means not to use the
67 default keywords. The default keywords, which are always looked for
68 if not explicitly disabled: _
69
70 The default keyword list is different than GNU xgettext. You can have
71 multiple -k flags on the command line.
72
73 --no-location
74 Do not write filename/lineno location comments
75
76 -n [style]
77 --add-location[=style]
78 Write filename/lineno location comments indicating where each
79 extracted string is found in the source. These lines appear before
80 each msgid. Two styles are supported:
81
82 Solaris # File: filename, line: line-number
83 Gnu #: filename:line
84
85 If style is omitted, Gnu is used. The style name is case
86 insensitive. By default, locations are included.
87
Barry Warsawc8f08922000-02-26 20:56:47 +000088 -o filename
89 --output=filename
90 Rename the default output file from messages.pot to filename.
91
92 -p dir
93 --output-dir=dir
94 Output files will be placed in directory dir.
95
Barry Warsaw5dbf5261999-11-03 18:47:52 +000096 -v
97 --verbose
98 Print the names of the files being processed.
99
Barry Warsawc8f08922000-02-26 20:56:47 +0000100 -V
101 --version
102 Print the version of pygettext and exit.
103
104 -w columns
105 --width=columns
106 Set width of output to columns.
107
108 -x filename
109 --exclude-file=filename
110 Specify a file that contains a list of strings that are not to be
111 extracted from the input files. Each string to be excluded must
112 appear on a line by itself in the file.
Barry Warsawe27db5a1999-08-13 20:59:48 +0000113
114"""
115
116import os
117import sys
118import string
119import time
120import getopt
121import tokenize
122
__version__ = '1.0'


# For self-testing: when the optional fintl module can be imported, mark
# strings with its gettext(); otherwise fall back to an identity function so
# that every _() call simply returns its argument.
try:
    import fintl
except ImportError:
    def _(s):
        return s
else:
    _ = fintl.gettext
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000133
134
# The normal pot-file header.  msgmerge and EMACS' po-mode work better if
# it's there.
#
# This is a %-format template with two named fields, %(time)s and
# %(version)s; TokenEater.write() fills them in before emitting the header.
# The doubled backslashes put a literal backslash-n inside the quoted header
# lines, as the .po syntax requires, instead of a real line break.  The
# upper-case words (PACKAGE, YEAR, CHARSET, ...) are left as placeholders in
# the generated file.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
154
155
def usage(code, msg=''):
    """Print the module help text and an optional message, then exit.

    code is passed to sys.exit() as the exit status.  msg, when non-empty,
    is printed on its own line after the help text.
    """
    # The module docstring doubles as the help text; it is run through
    # %-formatting against the module globals before being printed.
    help_text = __doc__ % globals()
    print(help_text)
    if msg:
        print(msg)
    sys.exit(code)
161
Barry Warsawc8f08922000-02-26 20:56:47 +0000162
Barry Warsawe27db5a1999-08-13 20:59:48 +0000163
# Translation table filled in by make_escapes(): escapes[n] is the text that
# escape() emits for the character whose ordinal is n.
escapes = []


def make_escapes(pass_iso8859):
    """(Re)build the module-level `escapes` table used by escape().

    pass_iso8859 -- when true, characters 128..255 whose low seven bits fall
        in the printable range 32..126 are written unchanged; when false,
        every character outside 32..126 becomes a three-digit octal escape.

    Backslash, tab, carriage return, newline and double quote always get
    their C-style escapes.  The table is rebuilt in place, so calling this
    more than once leaves exactly 256 entries and keeps existing references
    to `escapes` valid.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through, so that an o-umlaut in
        # a msgid is written as-is rather than as \366.  Only the low seven
        # bits decide whether a character counts as printable.
        mod = 128
    else:
        # Escape any character outside the 32..126 range.
        mod = 256
    table = []
    for i in range(256):
        # Test the reduced value but always emit the original ordinal; the
        # loop variable must not be overwritten with i % 128, or high
        # characters would turn into unrelated ASCII ones.
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    escapes[:] = table
183
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000184
def escape(s):
    """Return s with every character replaced by its `escapes` entry.

    The table is indexed by character ordinal, so make_escapes() must have
    filled it before any non-empty string is passed in.
    """
    return ''.join([escapes[ord(ch)] for ch in s])
191
192
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    This is how the quotes, prefixes and backslash escapes of a STRING token
    are unwrapped.
    """
    # NOTE(review): this is still eval().  The empty __builtins__ mapping
    # keeps ordinary function names out of reach, but the argument should
    # only ever be literal text taken from a tokenizer STRING token.
    no_builtins = {'__builtins__': {}}
    no_locals = {}
    return eval(s, no_builtins, no_locals)
196
197
def normalize(s):
    """Convert a Python string value into quoted .po file syntax.

    A string without newlines becomes one double-quoted, escaped line.  A
    string with newlines becomes an empty "" line followed by one quoted
    line per source line, each ending in an escaped newline except possibly
    the last.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The text ended with a newline: drop the empty tail produced by
        # split() and put the newline back on the real last line, so that
        # escape() renders it as a backslash-n.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    quoted = [escape(piece) for piece in pieces]
    # The separator closes one quoted line with an escaped newline and
    # opens the next one.
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
Barry Warsawe27db5a1999-08-13 20:59:48 +0000212
213
214
class TokenEater:
    """Token consumer that collects the strings passed to keyword calls.

    An instance is called once per token with the five values produced by
    the tokenize module.  It runs a small state machine:

      waiting     -- looking for a NAME that is listed in options.keywords
      keywordseen -- the next token must be '(' or the keyword is dropped
      openseen    -- STRING tokens are gathered until the first ')'

    Each completed message is stored together with the (filename, lineno)
    pairs of the calls that produced it; write() dumps them in .pot format.

    The options object must provide `keywords` and `toexclude` while
    scanning, and `location`, `SOLARIS`, `GNU` and `width` for write().
    """

    def __init__(self, options):
        self.__options = options
        # message text -> list of (filename, lineno) entries
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Name recorded with each entry; None until set_filename() is
        # called, so tokens fed early do not raise AttributeError.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state handler.  Only the token type, its
        # text and the row of its start position are needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            # A keyword that is not immediately called is not a marker.
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = ''.join(self.__data)
                if msg not in self.__options.toexclude:
                    entry = (self.__curfile, self.__lineno)
                    self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            # Adjacent literals are collected and concatenated on ')'.
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def set_filename(self, filename):
        """Set the file name recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the header and all collected messages to fp in .pot format.

        fp only needs a write() method.  Output goes straight to fp;
        sys.stdout is never touched, so any redirection the caller has in
        place survives this call.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        header = pot_header % {'time': timestamp, 'version': __version__}
        fp.write(header + '\n')
        for k, v in self.__messages.items():
            # location comments are different b/w Solaris and GNU:
            if options.location == options.SOLARIS:
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    comment = _('# File: %(filename)s, line: %(lineno)d') % d
                    fp.write(comment + '\n')
            elif options.location == options.GNU:
                # fit as many locations on one line, as long as the
                # resulting line length doesn't exceed 'options.width'
                locline = '#:'
                for filename, lineno in v:
                    d = {'filename': filename, 'lineno': lineno}
                    s = _(' %(filename)s:%(lineno)d') % d
                    if len(locline) + len(s) <= options.width:
                        locline = locline + s
                    else:
                        fp.write(locline + '\n')
                        locline = "#:" + s
                # Flush the last, partially filled location line.
                if len(locline) > 2:
                    fp.write(locline + '\n')
            # TBD: sorting, normalizing
            fp.write('msgid ' + normalize(k) + '\n')
            # An empty msgstr followed by a blank line ends the entry.
            fp.write('msgstr ""\n\n')
296
297
def main():
    """Command-line entry point: parse options, scan files, write the .pot.

    Options come from sys.argv[1:]; the remaining arguments are the source
    files to scan.  All messages found are written to a single output file
    (messages.pot unless changed with -d, -o or -p).  Exits through usage()
    on bad options and with status 1 when the exclude file cannot be read.
    """
    default_keywords = ['_']
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:Ehk:n:o:p:Vvw:x:',
            # 'default-domain=' needs the trailing '=' because the option
            # takes a name.  getopt cannot express optional arguments, so
            # --keyword and --add-location stay argument-less in their long
            # form; a word or style is given through -k / -n.
            ['extract-all', 'default-domain=', 'escape', 'help', 'keyword',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'verbose', 'version', 'width=', 'exclude-file=',
             ])
    except getopt.error:
        # sys.exc_info() fetches the exception without relying on either
        # the old or the new "except" binding syntax.
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        location = GNU
        verbose = 0
        width = 78
        excludefilename = ''

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-k', '--keyword'):
            # getopt reports a missing argument as '' (never None).  No
            # word means "do not use the default keywords".
            if arg:
                options.keywords.append(arg)
            else:
                default_keywords = []
        elif opt in ('-n', '--add-location'):
            # If the style is omitted, Gnu is used.
            style = arg or 'gnu'
            try:
                options.location = locations[style.lower()]
            except KeyError:
                d = {'arg': arg}
                usage(1, _('Invalid value for --add-location: %(arg)s') % d)
        elif opt in ('--no-location',):
            options.location = 0
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                d = {'arg': arg}
                usage(1, _('Invalid value for --width: %(arg)s, must be int')
                      % d)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg

    # calculate escapes: non-ASCII characters pass through unless -E asked
    # for them to be replaced by octal escape sequences
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    options.toexclude = []
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                lines = fp.readlines()
            finally:
                fp.close()
        except IOError:
            sys.stderr.write((_("Can't read --exclude-file: %s") %
                              options.excludefilename) + '\n')
            sys.exit(1)
        for line in lines:
            # Drop the line terminator; with it still attached an entry
            # could never equal an extracted message.
            if line[-1:] == '\n':
                line = line[:-1]
            options.toexclude.append(line)

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if options.verbose:
            print(_('Working on %(filename)s') % {'filename': filename})
        fp = open(filename)
        try:
            eater.set_filename(filename)
            # Hand every token to the eater as five separate values.
            for token in tokenize.generate_tokens(fp.readline):
                eater(token[0], token[1], token[2], token[3], token[4])
        finally:
            fp.close()

    if options.outpath:
        options.outfile = os.path.join(options.outpath, options.outfile)
    fp = open(options.outfile, 'w')
    try:
        eater.write(fp)
    finally:
        fp.close()
409
410
411
# Run the extractor only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()