blob: 3542f3f23ec2cd0f8f861935bb8a589c02146448 [file] [log] [blame]
Barry Warsawaf572511999-08-11 21:40:38 +00001#! /usr/bin/env python
Barry Warsawe27db5a1999-08-13 20:59:48 +00002
3"""pygettext -- Python equivalent of xgettext(1)
4
5Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
6internationalization of C programs. Most of these tools are independent of
7the programming language and can be used from within Python programs. Martin
8von Loewis' work[1] helps considerably in this regard.
9
10There's one hole though; xgettext is the program that scans source code
11looking for message strings, but it groks only C (or C++). Python introduces
12a few wrinkles, such as dual quoting characters, triple quoted strings, and
13raw strings. xgettext understands none of this.
14
15Enter pygettext, which uses Python's standard tokenize module to scan Python
16source code, generating .pot files identical to what GNU xgettext[2] generates
17for C and C++ code. From there, the standard GNU tools can be used.
18
19A word about marking Python strings as candidates for translation. GNU
20xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
21gettext_noop. But those can be a lot of text to include all over your code.
22C and C++ have a trick: they use the C preprocessor. Most internationalized C
23source includes a #define for gettext() to _() so that what has to be written
24in the source is much less. Thus these are both translatable strings:
25
26 gettext("Translatable String")
27 _("Translatable String")
28
29Python of course has no preprocessor so this doesn't work so well. Thus,
30pygettext searches only for _() by default, but see the -k/--keyword flag
31below for how to augment this.
32
33 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
34 [2] http://www.gnu.org/software/gettext/gettext.html
35
36
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible.
39
40Usage: pygettext [options] filename ...
41
42Options:
43
44 -a
45 --extract-all
46 Extract all strings
47
48 -d default-domain
49 --default-domain=default-domain
50 Rename the default output file from messages.pot to default-domain.pot
51
52 -k [word]
53 --keyword[=word]
54 Additional keywords to look for. Without `word' means not to use the
55 default keywords. The default keywords, which are always looked for
56 if not explicitly disabled: _
57
58 The default keyword list is different than GNU xgettext. You can have
59 multiple -k flags on the command line.
60
61 --no-location
62 Do not write filename/lineno location comments
63
64 -n [style]
65 --add-location[=style]
66 Write filename/lineno location comments indicating where each
67 extracted string is found in the source. These lines appear before
68 each msgid. Two styles are supported:
69
70 Solaris # File: filename, line: line-number
71 Gnu #: filename:line
72
73 If style is omitted, Gnu is used. The style name is case
74 insensitive. By default, locations are included.
75
76 --help
77 -h
78 print this help message and exit
79
80"""
81
82import os
83import sys
84import string
85import time
86import getopt
87import tokenize
88
# Version of this script; TokenEater.write() puts it in the first header line
# of every generated .pot file.
__version__ = '0.1'
90
91
92
def usage(code, msg=''):
    """Print the module help text and an optional message, then exit.

    code -- exit status handed to sys.exit().  Zero means the help was asked
            for; any other value signals a command line error.
    msg  -- optional diagnostic printed after the help text.  It may be a
            string or an exception instance; it is converted with str().

    The text is written to stdout when code is zero and to stderr otherwise,
    so that error diagnostics do not get mixed into redirected output.

    Never returns: sys.exit() raises SystemExit.
    """
    if code:
        out = sys.stderr
    else:
        out = sys.stdout
    # __doc__ here is the module docstring (the help text), not this
    # function's docstring.
    out.write(__doc__ % globals() + '\n')
    if msg:
        out.write(str(msg) + '\n')
    sys.exit(code)
98
99
100
# C-style escape sequences for the characters that must not appear literally
# inside a double-quoted .po string.
_PO_ESCAPES = {
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
    }


def normalize(s):
    """Convert the source text of a Python string literal to .po syntax.

    s -- one Python string literal exactly as written in the source (quotes
         and any prefix included), e.g. the text of a tokenize STRING token.

    Returns the value of the literal wrapped in double quotes, with embedded
    backslashes, double quotes, newlines, carriage returns and tabs written
    as C-style escape sequences.
    """
    # Unwrap the Python quoting.
    # NOTE(security): eval() is tolerable only because s is expected to be a
    # single string literal token.  The empty __builtins__ mapping restricts,
    # but does not eliminate, what a hostile expression could do -- never
    # pass arbitrary text to this function.
    s = eval(s, {'__builtins__': {}}, {})
    # The value no longer contains Python escapes, so every special character
    # is literal and must be escaped exactly once.
    escaped = [_PO_ESCAPES.get(ch, ch) for ch in s]
    return '"' + ''.join(escaped) + '"'
132
133
134
class TokenEater:
    """Collect the strings passed to translation keywords in token streams.

    Instances are callable and accept the five items the tokenize module
    produces for each token (type, string, start, end, line).  The tokens
    drive a small state machine:

        waiting     -- looking for a NAME token listed in options.keywords
        keywordseen -- a keyword was seen; the next token must be '('
        openseen    -- inside keyword( ... ); STRING tokens are collected
                       until the next ')' token

    Messages accumulate across files (see set_filename()) and are emitted
    with write().

    options -- object providing `keywords' (names to look for), `location'
               (location comment style) and the constants `GNU' and
               `SOLARIS' that `location' is compared against.
    """

    def __init__(self, options):
        self.__options = options
        # Maps each message to a list of (filename, lineno) occurrences.
        self.__messages = {}
        self.__state = self.__waiting
        # Normalized string pieces of the call currently being read.
        self.__data = []
        self.__lineno = -1
        # Defined up front so that eating tokens before set_filename() does
        # not fail with an AttributeError.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        """Feed one token to the current state handler.

        Only the token type, its text and the first item of the start
        position (the line number) are used.
        """
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Idle until one of the configured keywords shows up.
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        # A keyword only counts when it is immediately called.
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                msg = ''.join(self.__data)
                entry = (self.__curfile, self.__lineno)
                self.__messages.setdefault(msg, []).append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(normalize(tstring))
        # TBD: should we warn if we see anything else?

    def set_filename(self, filename):
        """Set the file name recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write all collected messages to fp in .pot format.

        fp -- object with a write() method accepting strings.

        Output goes straight to fp; sys.stdout is left untouched, so any
        redirection the caller has in place is preserved.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # common header
        emit('# POT file generated by pygettext.py ' + str(__version__) + '\n')
        emit('# ' + timestamp + '\n')
        emit('#\n')
        for k, v in self.__messages.items():
            for filename, lineno in v:
                # location comments are different b/w Solaris and GNU
                if options.location == options.SOLARIS:
                    emit('# File: %s, line: %d\n' % (filename, lineno))
                elif options.location == options.GNU:
                    emit('#: %s:%d\n' % (filename, lineno))
            # TBD: sorting, normalizing
            emit('msgid ' + k + '\n')
            # PO syntax requires a string after msgstr, even when empty.
            emit('msgstr ""\n')
            emit('\n')
203
204
def main():
    """Command line driver: parse options, scan the files, write the .pot.

    Reads the command line from sys.argv.  Every non-option argument is
    opened as a source file and tokenized through a single TokenEater; the
    collected messages are then written to <default-domain>.pot
    (messages.pot unless -d/--default-domain is given).

    Option errors, including an unknown --add-location style, end the
    program through usage(1, ...).  Errors opening or reading files
    propagate to the caller; files are closed in either case.

    NOTE: -a/--extract-all appears in the help text but is not part of the
    getopt specification below, so it is rejected as an unknown option.
    """
    default_keywords = ['_']
    try:
        # Long options that take a value need the trailing '=', otherwise
        # getopt rejects the documented --keyword=word form.
        opts, args = getopt.getopt(
            sys.argv[1:],
            'k:d:n:h',
            ['keyword=', 'default-domain=', 'help',
             'add-location=', 'no-location'])
    except getopt.error:
        # sys.exc_info() instead of "except X, msg" / "except X as msg" so
        # the same source parses on old and new interpreters alike.
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        keywords = []
        outfile = 'messages.pot'
        location = GNU

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-k', '--keyword'):
            # getopt reports a missing word as '' (never None).  An empty
            # word means "do not use the default keywords".
            if arg:
                options.keywords.append(arg)
            else:
                default_keywords = []
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-n', '--add-location'):
            # An empty style selects the Gnu format.
            style = arg or 'gnu'
            try:
                options.location = locations[style.lower()]
            except KeyError:
                usage(1, 'Invalid value for --add-location: ' + arg)
        elif opt == '--no-location':
            options.location = 0

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        fp = open(filename)
        try:
            eater.set_filename(filename)
            # generate_tokens() yields the same five-item tuples that the
            # older callback form of tokenize.tokenize() passed along.
            for token in tokenize.generate_tokens(fp.readline):
                eater(*token)
        finally:
            fp.close()

    fp = open(options.outfile, 'w')
    try:
        eater.write(fp)
    finally:
        fp.close()
265
266
267
# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()