blob: f8567e9c94c8d9cb98cefb38b9200d4a857e992d [file] [log] [blame]
Barry Warsawaf572511999-08-11 21:40:38 +00001#! /usr/bin/env python
Barry Warsawa507c321999-11-03 16:46:05 +00002# Originally written by Barry Warsaw <bwarsaw@python.org>
Barry Warsawe27db5a1999-08-13 20:59:48 +00003
4"""pygettext -- Python equivalent of xgettext(1)
5
6Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
7internationalization of C programs. Most of these tools are independent of
8the programming language and can be used from within Python programs. Martin
9von Loewis' work[1] helps considerably in this regard.
10
Barry Warsaw5dbf5261999-11-03 18:47:52 +000011There's one problem though; xgettext is the program that scans source code
Barry Warsawe27db5a1999-08-13 20:59:48 +000012looking for message strings, but it groks only C (or C++). Python introduces
13a few wrinkles, such as dual quoting characters, triple quoted strings, and
14raw strings. xgettext understands none of this.
15
16Enter pygettext, which uses Python's standard tokenize module to scan Python
17source code, generating .pot files identical to what GNU xgettext[2] generates
Barry Warsaw5dbf5261999-11-03 18:47:52 +000018for C and C++ code. From there, the standard GNU tools can be used.
Barry Warsawe27db5a1999-08-13 20:59:48 +000019
20A word about marking Python strings as candidates for translation. GNU
21xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
22gettext_noop. But those can be a lot of text to include all over your code.
Barry Warsaw5dbf5261999-11-03 18:47:52 +000023C and C++ have a trick: they use the C preprocessor. Most internationalized C
Barry Warsawe27db5a1999-08-13 20:59:48 +000024source includes a #define for gettext() to _() so that what has to be written
25in the source is much less. Thus these are both translatable strings:
26
27 gettext("Translatable String")
28 _("Translatable String")
29
30Python of course has no preprocessor so this doesn't work so well. Thus,
31pygettext searches only for _() by default, but see the -k/--keyword flag
32below for how to augment this.
33
34 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
35 [2] http://www.gnu.org/software/gettext/gettext.html
36
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible.
39
40Usage: pygettext [options] filename ...
41
42Options:
43
44 -a
45 --extract-all
46 Extract all strings
47
48 -d default-domain
49 --default-domain=default-domain
50 Rename the default output file from messages.pot to default-domain.pot
51
52 -k [word]
53 --keyword[=word]
54 Additional keywords to look for. Without `word' means not to use the
55 default keywords. The default keywords, which are always looked for
56 if not explicitly disabled: _
57
    The default keyword list differs from GNU xgettext's.  You can have
    multiple -k flags on the command line.
60
61 --no-location
62 Do not write filename/lineno location comments
63
64 -n [style]
65 --add-location[=style]
66 Write filename/lineno location comments indicating where each
67 extracted string is found in the source. These lines appear before
68 each msgid. Two styles are supported:
69
70 Solaris # File: filename, line: line-number
71 Gnu #: filename:line
72
73 If style is omitted, Gnu is used. The style name is case
74 insensitive. By default, locations are included.
75
Barry Warsaw5dbf5261999-11-03 18:47:52 +000076 -v
77 --verbose
78 Print the names of the files being processed.
79
Barry Warsawe27db5a1999-08-13 20:59:48 +000080 --help
81 -h
82 print this help message and exit
83
84"""
85
86import os
87import sys
88import string
89import time
90import getopt
91import tokenize
92
__version__ = '0.2'



# Identity placeholder for the translation marker.  Wrapping this script's
# own strings in _() lets the script serve as its own test input.
def _(s):
    """Return `s' unchanged."""
    return s
99
100
# The normal pot-file header, written once at the top of the generated
# catalog.  msgmerge and EMACS' po-mode work better if it's there.
#
# This is a %-format template: %(time)s and %(version)s are substituted when
# the catalog is written.  The doubled backslashes produce a literal
# backslash-n inside each quoted header line, as the .po syntax requires.
# The upper-case words (PACKAGE, YEAR, CHARSET, ...) are emitted verbatim.
# The literal is wrapped in _() so that it is itself picked up when this
# script is run over its own source.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"PO-Revision-Date: %(time)s\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
120
121
def usage(code, msg=''):
    """Print the module help text, then `msg' if given, and exit.

    code -- process exit status handed to sys.exit()
    msg  -- optional extra message printed after the help text
    """
    # The help text is the module docstring, %-interpolated against the
    # module globals.
    output = [__doc__ % globals()]
    if msg:
        output.append(msg)
    for text in output:
        print(text)
    sys.exit(code)
127
Barry Warsawe27db5a1999-08-13 20:59:48 +0000128
# Lookup table indexed by character code (0..255) giving the text to emit for
# that character inside a double-quoted .po string.  Codes below 32 and above
# 127 become three-digit octal escapes; everything else starts out as itself.
escapes = []
for i in range(256):
    if i < 32 or i > 127:
        escapes.append("\\%03o" % i)
    else:
        escapes.append(chr(i))

# Characters with a conventional C-style spelling override the defaults.
escapes[ord('\\')] = '\\\\'
escapes[ord('\t')] = '\\t'
escapes[ord('\r')] = '\\r'
escapes[ord('\n')] = '\\n'
# A bare double quote would terminate the quoted msgid early and corrupt the
# catalog, so it must be backslash-escaped as well.
escapes[ord('"')] = '\\"'


def escape(s):
    """Return `s' with every character replaced by its .po-safe spelling.

    s -- string whose characters all have codes below 256 (the size of the
         `escapes' table); a larger code raises IndexError.

    The result contains no raw control characters (codes below 32), no raw
    characters above code 127, and no unescaped backslash or double quote.
    """
    return ''.join([escapes[ord(char)] for char in s])
146
147
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    s -- source text of a Python string literal, quotes and prefix included

    The expression is evaluated with an empty __builtins__ mapping and empty
    locals, so no built-in functions are reachable by name from `s'.
    """
    # unwrap quotes, safely
    restricted_globals = {'__builtins__': {}}
    restricted_locals = {}
    return eval(s, restricted_globals, restricted_locals)
151
152
def normalize(s):
    """Render `s' as the quoted text that follows `msgid' in a .po file.

    This converts the various Python string types into a format that is
    appropriate for .po files, namely much closer to C style.

    A string without newlines becomes a single double-quoted string.  A
    string with newlines becomes an empty "" line followed by one quoted
    line per source line, each but the last ending in an escaped newline.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The text ended with a newline: drop the empty tail and hand the
        # newline back to the last real line so it is escaped with it.
        pieces.pop()
        pieces[-1] = pieces[-1] + '\n'
    quoted = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
Barry Warsawe27db5a1999-08-13 20:59:48 +0000167
168
169
class TokenEater:
    """Token callback that collects strings passed to keyword calls.

    An instance is called once per token with the five-argument token-eater
    signature (type, string, start, end, line).  It runs a three-state
    machine -- waiting for a keyword name, keyword seen, open paren seen --
    and records every run of string literals found between the parentheses
    of a keyword call, together with the file name and line number of the
    opening parenthesis.

    The `options' object must provide `keywords' (names to look for) and,
    for write(), `location', `GNU' and `SOLARIS'.
    """

    def __init__(self, options):
        self.__options = options
        # maps message text -> list of (filename, lineno) occurrences
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Set by set_filename(); initialised here so that recording a message
        # before any file name was given does not raise AttributeError.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the start row is needed
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Idle until one of the configured keyword names shows up.
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        # A keyword only counts when immediately followed by '('.
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = ''.join(self.__data)
                entry = (self.__curfile, self.__lineno)
                self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            # NOTE: safe_eval() uses eval(); it is only ever handed the text
            # of a single STRING token here.
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def set_filename(self, filename):
        """Set the file name recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the collected messages to `fp' as a .pot catalog.

        fp -- object with a write() method accepting strings

        Output goes straight to `fp'; sys.stdout is left untouched, so any
        redirection the caller has installed survives this call.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        fp.write(pot_header % {'time': timestamp, 'version': __version__})
        fp.write('\n')
        for msg, occurrences in self.__messages.items():
            for filename, lineno in occurrences:
                # location comments are different b/w Solaris and GNU
                d = {'filename': filename,
                     'lineno': lineno}
                if options.location == options.SOLARIS:
                    fp.write(_('# File: %(filename)s, line: %(lineno)d') % d
                             + '\n')
                elif options.location == options.GNU:
                    fp.write(_('#: %(filename)s:%(lineno)d') % d + '\n')
                # any other value of options.location: no location comment
            # TBD: sorting, normalizing
            fp.write('msgid ' + normalize(msg) + '\n')
            fp.write('msgstr ""\n')
            fp.write('\n')
240
241
def main():
    """Command-line entry point.

    Parses sys.argv, scans every file named on the command line for
    keyword-marked strings, and writes the resulting catalog to the output
    file (messages.pot unless -d/--default-domain is given).  Invalid
    options print the usage text and exit with status 1.
    """
    default_keywords = ['_']
    try:
        # Long options that take a value need a trailing '=' in the getopt
        # spec; without it --keyword=word and --default-domain=name are
        # rejected.
        opts, args = getopt.getopt(
            sys.argv[1:],
            'k:d:n:hv',
            ['keyword=', 'default-domain=', 'help',
             'add-location=', 'no-location', 'verbose'])
    except getopt.error:
        # sys.exc_info() fetches the error without relying on a particular
        # spelling of the except clause.
        usage(1, sys.exc_info()[1])
    # NOTE(review): the usage text documents -a/--extract-all, but no such
    # option is accepted here.

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        keywords = []
        outfile = 'messages.pot'
        location = GNU
        verbose = 0

    options = Options()
    locations = {'gnu'     : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-k', '--keyword'):
            # getopt reports a missing value as '', never None.  An empty
            # word means "do not use the default keywords".
            if not arg:
                default_keywords = []
            else:
                options.keywords.append(arg)
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-n', '--add-location'):
            if not arg:
                arg = 'gnu'
            try:
                options.location = locations[arg.lower()]
            except KeyError:
                d = {'arg': arg}
                usage(1, _('Invalid value for --add-location: %(arg)s') % d)
        elif opt in ('--no-location',):
            options.location = 0
        elif opt in ('-v', '--verbose'):
            options.verbose = 1

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if options.verbose:
            print(_('Working on %(filename)s') % {'filename': filename})
        fp = open(filename)
        try:
            eater.set_filename(filename)
            tokenize.tokenize(fp.readline, eater)
        finally:
            # close the input even if tokenizing raises
            fp.close()

    fp = open(options.outfile, 'w')
    try:
        eater.write(fp)
    finally:
        fp.close()
308
309
310
# Run the extractor only when this file is executed as a script; importing
# it as a module defines the functions above without running main().
if __name__ == '__main__':
    main()