blob: fcd6b9512d398f432a537208d97b1f09454222e7 [file] [log] [blame]
Barry Warsawaf572511999-08-11 21:40:38 +00001#! /usr/bin/env python
Barry Warsawa507c321999-11-03 16:46:05 +00002# Originally written by Barry Warsaw <bwarsaw@python.org>
Barry Warsawe27db5a1999-08-13 20:59:48 +00003
4"""pygettext -- Python equivalent of xgettext(1)
5
6Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
7internationalization of C programs. Most of these tools are independent of
8the programming language and can be used from within Python programs. Martin
9von Loewis' work[1] helps considerably in this regard.
10
Barry Warsaw5dbf5261999-11-03 18:47:52 +000011There's one problem though; xgettext is the program that scans source code
Barry Warsawe27db5a1999-08-13 20:59:48 +000012looking for message strings, but it groks only C (or C++). Python introduces
13a few wrinkles, such as dual quoting characters, triple quoted strings, and
14raw strings. xgettext understands none of this.
15
16Enter pygettext, which uses Python's standard tokenize module to scan Python
17source code, generating .pot files identical to what GNU xgettext[2] generates
Barry Warsaw5dbf5261999-11-03 18:47:52 +000018for C and C++ code. From there, the standard GNU tools can be used.
Barry Warsawe27db5a1999-08-13 20:59:48 +000019
20A word about marking Python strings as candidates for translation. GNU
21xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
22gettext_noop. But those can be a lot of text to include all over your code.
Barry Warsaw5dbf5261999-11-03 18:47:52 +000023C and C++ have a trick: they use the C preprocessor. Most internationalized C
Barry Warsawe27db5a1999-08-13 20:59:48 +000024source includes a #define for gettext() to _() so that what has to be written
25in the source is much less. Thus these are both translatable strings:
26
27 gettext("Translatable String")
28 _("Translatable String")
29
30Python of course has no preprocessor so this doesn't work so well. Thus,
31pygettext searches only for _() by default, but see the -k/--keyword flag
32below for how to augment this.
33
34 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
35 [2] http://www.gnu.org/software/gettext/gettext.html
36
Barry Warsawe27db5a1999-08-13 20:59:48 +000037NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible.
39
40Usage: pygettext [options] filename ...
41
42Options:
43
44 -a
45 --extract-all
46 Extract all strings
47
48 -d default-domain
49 --default-domain=default-domain
50 Rename the default output file from messages.pot to default-domain.pot
51
52 -k [word]
53 --keyword[=word]
        Additional keywords to look for.  Omitting `word' disables the
        default keywords.  The default keywords, which are always looked for
        unless explicitly disabled, are: _
57
        The default keyword list differs from that of GNU xgettext.  You can
        have multiple -k flags on the command line.
60
61 --no-location
62 Do not write filename/lineno location comments
63
64 -n [style]
65 --add-location[=style]
66 Write filename/lineno location comments indicating where each
67 extracted string is found in the source. These lines appear before
68 each msgid. Two styles are supported:
69
70 Solaris # File: filename, line: line-number
71 Gnu #: filename:line
72
73 If style is omitted, Gnu is used. The style name is case
74 insensitive. By default, locations are included.
75
Barry Warsaw5dbf5261999-11-03 18:47:52 +000076 -v
77 --verbose
78 Print the names of the files being processed.
79
Barry Warsawe27db5a1999-08-13 20:59:48 +000080 --help
81 -h
82 print this help message and exit
83
84"""
85
86import os
87import sys
88import string
89import time
90import getopt
91import tokenize
92
# Version of this script.  TokenEater.write() interpolates it into the
# "Generated-By" field of the .pot header.
__version__ = '0.2'
Barry Warsawe27db5a1999-08-13 20:59:48 +000094
95
96
Barry Warsaw5dbf5261999-11-03 18:47:52 +000097# for selftesting
98def _(s): return s
99
100
# The normal pot-file header.  msgmerge and EMACS' po-mode work better if
# it's there.
#
# The template is filled in with a mapping providing `time' and `version'.
# The generation time goes into POT-Creation-Date (the field that describes
# when the template was extracted); PO-Revision-Date is left as a
# placeholder because it is meant to be filled in when a translation is
# edited, not when the template is generated.
#
# The doubled backslashes are deliberate: the .pot file must contain the
# two characters backslash-n, not a real newline, inside each quoted line.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
120
121
def usage(code, msg=''):
    """Print the module usage text, an optional message, and exit.

    code -- exit status passed to sys.exit().  A zero status (plain --help)
            writes to stdout; any other status is treated as an error and
            writes to stderr so it does not pollute piped output.
    msg  -- optional extra line printed after the usage text (for example
            a getopt error); anything accepted by the %s format works.

    Always raises SystemExit.
    """
    if code:
        out = sys.stderr
    else:
        out = sys.stdout
    # __doc__ is None when docstrings are stripped (python -OO); fall back
    # to an empty usage text instead of failing with a TypeError.
    out.write('%s\n' % ((__doc__ or '') % globals()))
    if msg:
        out.write('%s\n' % (msg,))
    sys.exit(code)
127
Barry Warsawe27db5a1999-08-13 20:59:48 +0000128
# Lookup table indexed by character code (0..255) giving the text written
# to the .po file for that character.  Codes below 32 and above 127 become
# three-digit octal escapes; everything else maps to itself.
escapes = [chr(code) if 32 <= code <= 127 else "\\%03o" % code
           for code in range(256)]

# A few characters get the conventional C-style escape instead.
for char, replacement in (('\\', '\\\\'),
                          ('\t', '\\t'),
                          ('\r', '\\r'),
                          ('\n', '\\n'),
                          ('\"', '\\"')):
    escapes[ord(char)] = replacement
Barry Warsaw5dbf5261999-11-03 18:47:52 +0000141
def escape(s):
    """Return s with each character replaced by its `escapes' table entry.

    Every character's ordinal is used as an index into the module-level
    `escapes' list, and the resulting pieces are concatenated.
    """
    return ''.join([escapes[ord(char)] for char in s])
147
148
def safe_eval(s):
    """Evaluate s (the source text of a string literal) and return its value.

    This strips the quotes and resolves escape sequences by handing the
    text to eval() with an empty __builtins__ mapping and empty locals, so
    no builtin names are reachable from the evaluated expression.
    """
    # NOTE(review): emptying __builtins__ restricts eval() but is not a real
    # sandbox; callers should only pass STRING tokens from the tokenizer.
    restricted_globals = {'__builtins__': {}}
    empty_locals = {}
    return eval(s, restricted_globals, empty_locals)
152
153
def normalize(s):
    """Convert the string s into the quoted form used in .po files.

    A string without newlines becomes a single double-quoted, escaped
    string.  A string with newlines becomes an empty "" followed by one
    quoted line per source line, each line except the last ending in an
    escaped newline; the last one does too if s itself ended with a newline.
    """
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline: drop the empty tail and put the newline
        # back on the real last line so that it gets escaped with it.
        pieces.pop()
        pieces[-1] += '\n'
    quoted = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(quoted) + '"'
Barry Warsawe27db5a1999-08-13 20:59:48 +0000168
169
170
class TokenEater:
    """Token callback that collects the string arguments of keyword calls.

    An instance is called once per token with the five standard tokenizer
    fields.  It runs a three-state machine: waiting for a keyword name,
    keyword seen (expecting an opening parenthesis), and parenthesis seen
    (collecting string tokens until a closing parenthesis).  Collected
    messages are kept together with the (filename, lineno) of every
    occurrence and are emitted by write().

    options must provide `keywords' (names that mark translatable calls),
    `location' and the constants `GNU' and `SOLARIS'.
    """

    def __init__(self, options):
        self.__options = options
        # message text -> list of (filename, lineno) tuples where it occurs
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # Name of the file being scanned; updated through set_filename().
        # Initialised here so the attribute exists even if tokens arrive
        # before set_filename() has been called.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the starting row is needed
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = ''.join(self.__data)
                entry = (self.__curfile, self.__lineno)
                self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # NOTE(review): every other token is silently ignored, so the first
        # ')' ends collection even when it closes a nested call.
        # TBD: should we warn if we see anything else?

    def set_filename(self, filename):
        """Set the file name recorded with subsequently collected messages."""
        self.__curfile = filename

    def write(self, fp):
        """Write the .pot header and all collected messages to fp.

        fp is any object with a write() method.  Output goes directly to
        fp; sys.stdout is left untouched, so a caller that has redirected
        stdout keeps its redirection.
        """
        options = self.__options
        emit = fp.write
        timestamp = time.ctime(time.time())
        # common header
        # The time stamp in the header doesn't have the same format
        # as that generated by xgettext...
        emit(pot_header % {'time': timestamp, 'version': __version__})
        emit('\n')
        for msg, entries in self.__messages.items():
            for filename, lineno in entries:
                # location comments are different b/w Solaris and GNU
                d = {'filename': filename,
                     'lineno': lineno}
                if options.location == options.SOLARIS:
                    emit(_('# File: %(filename)s, line: %(lineno)d') % d)
                    emit('\n')
                elif options.location == options.GNU:
                    emit(_('#: %(filename)s:%(lineno)d') % d)
                    emit('\n')
            # TBD: sorting, normalizing
            emit('msgid ' + normalize(msg) + '\n')
            emit('msgstr ""\n')
            emit('\n')
241
242
def main():
    """Command-line entry point: parse options, scan files, write the .pot.

    Options are read from sys.argv[1:]; each remaining argument is a source
    file that is tokenized and fed to a TokenEater.  The collected messages
    are then written to the output file (messages.pot unless -d is given).
    Option errors are reported through usage(), which exits.

    NOTE(review): the usage text documents -a/--extract-all, but that option
    is not accepted here.
    """
    default_keywords = ['_']
    try:
        # Long options that carry a value need a trailing '='; without it
        # getopt rejects "--keyword=word" and "--default-domain=name".
        opts, args = getopt.getopt(
            sys.argv[1:],
            'k:d:n:hv',
            ['keyword=', 'default-domain=', 'help',
             'add-location=', 'no-location', 'verbose'])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        outfile = 'messages.pot'
        location = GNU
        verbose = 0

    options = Options()
    # kept on the instance so the list is never shared through the class
    options.keywords = []
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-k', '--keyword'):
            if arg:
                options.keywords.append(arg)
            else:
                # getopt reports an empty value as '' (never None); an
                # empty word (-k '' or --keyword=) means "do not use the
                # default keywords".
                default_keywords = []
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-n', '--add-location'):
            if not arg:
                arg = 'gnu'
            try:
                options.location = locations[arg.lower()]
            except KeyError:
                d = {'arg': arg}
                usage(1, _('Invalid value for --add-location: %(arg)s') % d)
        elif opt in ('--no-location',):
            options.location = 0
        elif opt in ('-v', '--verbose'):
            options.verbose = 1

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if options.verbose:
            print(_('Working on %(filename)s') % {'filename': filename})
        fp = open(filename)
        try:
            eater.set_filename(filename)
            # Feed every token to the eater; each token is the usual
            # (type, string, start, end, line) 5-tuple.
            for token in tokenize.generate_tokens(fp.readline):
                eater(*token)
        finally:
            # close the source file even if tokenizing fails
            fp.close()

    fp = open(options.outfile, 'w')
    try:
        eater.write(fp)
    finally:
        fp.close()
309
310
311
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()