#! /usr/bin/env python
# Originally written by Barry Warsaw <bwarsaw@python.org>

4"""pygettext -- Python equivalent of xgettext(1)
5
6Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
7internationalization of C programs. Most of these tools are independent of
8the programming language and can be used from within Python programs. Martin
9von Loewis' work[1] helps considerably in this regard.
10
11There's one hole though; xgettext is the program that scans source code
12looking for message strings, but it groks only C (or C++). Python introduces
13a few wrinkles, such as dual quoting characters, triple quoted strings, and
14raw strings. xgettext understands none of this.
15
16Enter pygettext, which uses Python's standard tokenize module to scan Python
17source code, generating .pot files identical to what GNU xgettext[2] generates
18for C and C++ code. From there, the standard GNU tools can be used.
19
20A word about marking Python strings as candidates for translation. GNU
21xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
22gettext_noop. But those can be a lot of text to include all over your code.
23C and C++ have a trick: they use the C preprocessor. Most internationalized C
24source includes a #define for gettext() to _() so that what has to be written
25in the source is much less. Thus these are both translatable strings:
26
27 gettext("Translatable String")
28 _("Translatable String")
29
30Python of course has no preprocessor so this doesn't work so well. Thus,
31pygettext searches only for _() by default, but see the -k/--keyword flag
32below for how to augment this.
33
34 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
35 [2] http://www.gnu.org/software/gettext/gettext.html
36
37
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
wherever possible.
40
41Usage: pygettext [options] filename ...
42
43Options:
44
45 -a
46 --extract-all
47 Extract all strings
48
49 -d default-domain
50 --default-domain=default-domain
51 Rename the default output file from messages.pot to default-domain.pot
52
53 -k [word]
54 --keyword[=word]
55 Additional keywords to look for. Without `word' means not to use the
56 default keywords. The default keywords, which are always looked for
57 if not explicitly disabled: _
58
59 The default keyword list is different than GNU xgettext. You can have
60 multiple -k flags on the command line.
61
62 --no-location
63 Do not write filename/lineno location comments
64
65 -n [style]
66 --add-location[=style]
67 Write filename/lineno location comments indicating where each
68 extracted string is found in the source. These lines appear before
69 each msgid. Two styles are supported:
70
71 Solaris # File: filename, line: line-number
72 Gnu #: filename:line
73
74 If style is omitted, Gnu is used. The style name is case
75 insensitive. By default, locations are included.
76
77 --help
78 -h
79 print this help message and exit
80
81"""
82
83import os
84import sys
85import string
86import time
87import getopt
88import tokenize
89
90__version__ = '0.1'
91
92
93
def usage(code, msg=''):
    """Show the module docstring as help text, then exit.

    code -- process exit status handed to sys.exit()
    msg  -- optional extra message written on its own line after the help
            text; skipped when it is false (e.g. the empty string)
    """
    emit = sys.stdout.write
    # The help text is the module docstring, interpolated with the module
    # globals.
    emit('%s\n' % (__doc__ % globals(),))
    if msg:
        emit('%s\n' % (msg,))
    sys.exit(code)
99
100
101
def normalize(s):
    """Convert a Python string literal into a C-style quoted string.

    s is the source text of a single string literal, quotes and any prefix
    included (for example the text of a tokenize STRING token).  The literal
    is evaluated to obtain its value; backslashes, double quotes, newlines,
    tabs and carriage returns in that value are then backslash-escaped and
    the result is wrapped in double quotes, which is the form .po files use.

    Returns the escaped, double-quoted string.  An empty literal yields '""'.
    """
    # NOTE(review): eval() runs on text taken from the scanned source files.
    # Builtins are disabled and the caller in this file only passes STRING
    # tokens, but this must never be fed anything other than a string literal.
    value = eval(s, {'__builtins__': {}}, {})
    # After evaluation every character is literal: a backslash in the value
    # is a real backslash and has to be escaped just like a double quote,
    # otherwise it would swallow the escape of a following quote.
    escapes = {
        '\\': '\\\\',
        '"': '\\"',
        '\n': '\\n',
        '\t': '\\t',
        '\r': '\\r',
    }
    parts = []
    for ch in value:
        parts.append(escapes.get(ch, ch))
    return '"' + ''.join(parts) + '"'
133
134
135
class TokenEater:
    """Token callback that collects the strings passed to marker keywords.

    An instance is called once per token with the five values
    (type, string, start, end, line).  It runs a three-state machine:

      waiting     -- looking for a NAME token listed in options.keywords
      keywordseen -- a keyword was seen; the next token must be '('
      openseen    -- inside keyword(...); STRING tokens are collected until
                     a ')' token arrives

    Collected messages are kept in a dictionary mapping the normalized
    message text to a list of (filename, lineno) tuples, and are written out
    by write().
    """

    def __init__(self, options):
        """options must provide keywords, location, GNU and SOLARIS."""
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        # No file is being scanned until set_filename() is called; having a
        # value here avoids an AttributeError when a message is recorded
        # before that.
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state handler; only the token type, its
        # text and the row of its start position are needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        # Idle state: wait for one of the configured keywords.
        if ttype == tokenize.NAME and tstring in self.__options.keywords:
            self.__state = self.__keywordseen

    def __keywordseen(self, ttype, tstring, lineno):
        # A keyword only counts when it is immediately followed by '('.
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            # Note that any ')' ends the collection, including one that
            # closes a nested call.
            if self.__data:
                msg = ''.join(self.__data)
                entry = (self.__curfile, self.__lineno)
                linenos = self.__messages.get(msg)
                if linenos is None:
                    self.__messages[msg] = [entry]
                else:
                    linenos.append(entry)
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(normalize(tstring))
        # TBD: should we warn if we seen anything else?

    def set_filename(self, filename):
        """Set the file name recorded with messages found from now on."""
        self.__curfile = filename

    def write(self, fp):
        """Write the collected messages to fp in .pot format.

        fp is any object with a write() method.  The output goes straight to
        fp; sys.stdout is left untouched, so a redirection installed by the
        caller survives this call.
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        emit = fp.write
        # common header
        emit('# POT file generated by pygettext.py %s\n' % (__version__,))
        emit('# %s\n' % (timestamp,))
        emit('#\n')
        for msgid, locations in self.__messages.items():
            for filename, lineno in locations:
                # location comments are different b/w Solaris and GNU; any
                # other value of options.location suppresses them
                if options.location == options.SOLARIS:
                    emit('# File: %s, line: %d\n' % (filename, lineno))
                elif options.location == options.GNU:
                    emit('#: %s:%d\n' % (filename, lineno))
            # TBD: sorting, normalizing
            emit('msgid %s\n' % (msgid,))
            # An untranslated entry needs an explicit empty string; a bare
            # "msgstr" is not a valid PO entry.
            emit('msgstr ""\n')
            emit('\n')
204
205
def main():
    """Command-line entry point.

    Parses sys.argv, feeds every file named on the command line through
    tokenize with a TokenEater as the token callback, and writes the
    collected messages to the output file (messages.pot unless changed with
    -d/--default-domain).  Invalid options end the program through usage().

    getopt cannot express optional option arguments, so an empty value
    stands for "no value": --keyword= (or -k '') disables the default
    keywords and --add-location= selects the Gnu style.
    """
    default_keywords = ['_']
    try:
        # A trailing '=' marks a long option that takes an argument; without
        # it getopt rejects --keyword=word and --default-domain=name.
        opts, args = getopt.getopt(
            sys.argv[1:],
            'k:d:n:h',
            ['keyword=', 'default-domain=', 'help',
             'add-location=', 'no-location'])
    except getopt.error:
        # sys.exc_info() keeps this clause valid on every Python version.
        usage(1, sys.exc_info()[1])

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        keywords = []
        outfile = 'messages.pot'
        location = GNU

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-k', '--keyword'):
            # getopt hands over '' (never None) for an empty value
            if not arg:
                default_keywords = []
            else:
                options.keywords.append(arg)
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-n', '--add-location'):
            if not arg:
                arg = 'gnu'
            try:
                # style names are case insensitive
                options.location = locations[arg.lower()]
            except KeyError:
                usage(1, 'Invalid value for --add-location: ' + arg)
        elif opt in ('--no-location',):
            options.location = 0

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        fp = open(filename)
        try:
            eater.set_filename(filename)
            tokenize.tokenize(fp.readline, eater)
        finally:
            # close the input even when tokenizing fails
            fp.close()

    fp = open(options.outfile, 'w')
    try:
        eater.write(fp)
    finally:
        fp.close()
266
267
268
269if __name__ == '__main__':
270 main()