Initial implementation

commit: e27db5a3938b64a1e72c853285aa02c8c60c45d2 [log] [tgz]
author: Barry Warsaw <barry@python.org> Fri Aug 13 20:59:48 1999 +0000
committer: Barry Warsaw <barry@python.org> Fri Aug 13 20:59:48 1999 +0000
tree: ba6c377440c21ecbd730873743c15b7268438eee
parent: 31e5e37196260c58a57f780a688b27b25ccae1bb [diff] [blame]
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py
index ee3ecd2..3542f3f 100755
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py

@@ -1 +1,269 @@
 #! /usr/bin/env python
+
+"""pygettext -- Python equivalent of xgettext(1)
+
+Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
+internationalization of C programs.  Most of these tools are independent of
+the programming language and can be used from within Python programs.  Martin
+von Loewis' work[1] helps considerably in this regard.
+
+There's one hole though; xgettext is the program that scans source code
+looking for message strings, but it groks only C (or C++).  Python introduces
+a few wrinkles, such as dual quoting characters, triple quoted strings, and
+raw strings.  xgettext understands none of this.
+
+Enter pygettext, which uses Python's standard tokenize module to scan Python
+source code, generating .pot files identical to what GNU xgettext[2] generates
+for C and C++ code.  From there, the standard GNU tools can be used.  
+
+A word about marking Python strings as candidates for translation.  GNU
+xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
+gettext_noop.  But those can be a lot of text to include all over your code.
+C and C++ have a trick: they use the C preprocessor.  Most internationalized C 
+source includes a #define for gettext() to _() so that what has to be written
+in the source is much less.  Thus these are both translatable strings:
+
+    gettext("Translatable String")
+    _("Translatable String")
+
+Python of course has no preprocessor so this doesn't work so well.  Thus,
+pygettext searches only for _() by default, but see the -k/--keyword flag
+below for how to augment this.
+
+ [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
+ [2] http://www.gnu.org/software/gettext/gettext.html
+
+
+NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
+wherever possible.
+
+Usage: pygettext [options] filename ...
+
+Options:
+
+    -a
+    --extract-all
+        Extract all strings
+
+    -d default-domain
+    --default-domain=default-domain
+        Rename the default output file from messages.pot to default-domain.pot 
+
+    -k [word]
+    --keyword[=word]
+        Additional keyword to look for.  If `word' is omitted, the default
+        keywords are not used.  The default keywords, which are always looked
+        for unless explicitly disabled, are: _
+
+        The default keyword list differs from that of GNU xgettext.  You can
+        have multiple -k flags on the command line.
+
+    --no-location
+        Do not write filename/lineno location comments
+
+    -n [style]
+    --add-location[=style]
+        Write filename/lineno location comments indicating where each
+        extracted string is found in the source.  These lines appear before
+        each msgid.  Two styles are supported:
+
+        Solaris  # File: filename, line: line-number
+        Gnu      #: filename:line
+
+        If style is omitted, Gnu is used.  The style name is case
+        insensitive.  By default, locations are included.
+
+    --help
+    -h
+        print this help message and exit
+
+"""
+
+import os
+import sys
+import string
+import time
+import getopt
+import tokenize
+
+__version__ = '0.1'
+
+
+
+def usage(code, msg=''):
+    # Print the module docstring (interpolated against the module globals)
+    # as the help text, follow it with `msg' when one is given, and then
+    # leave the interpreter through sys.exit() with status `code'.
+    help_text = __doc__ % globals()
+    print help_text
+    if msg:
+        print msg
+    sys.exit(code)
+
+
+
+def normalize(s):
+    # Convert the source text of a Python string literal (single, double or
+    # triple quoted, raw or not) into a double-quoted, C style string that
+    # is appropriate for a msgid line in a .po file.
+    #
+    # `s' is the literal exactly as it appears in the source, quotes
+    # included.  The return value is the escaped text wrapped in '"'.
+    #
+    # Unwrap the quotes by evaluating the literal with no builtins visible.
+    # NOTE(review): this is still eval() on text read from the scanned
+    # files; it is only as safe as the guarantee that `s' really is a single
+    # STRING token.
+    s = eval(s, {'__builtins__':{}}, {})
+    # eval() has already turned every source-level escape into the real
+    # character, so each backslash, double quote and control character left
+    # in `s' is literal and has to be escaped again; a bare backslash or a
+    # raw newline would corrupt the msgid.  Backslashes go first so that the
+    # backslashes added by the later replacements are not doubled.
+    for char, escaped in (('\\', '\\\\'),
+                          ('"', '\\"'),
+                          ('\n', '\\n'),
+                          ('\r', '\\r'),
+                          ('\t', '\\t')):
+        s = string.replace(s, char, escaped)
+    return '"' + s + '"'
+
+
+
+class TokenEater:
+    # Token consumer for tokenize.tokenize().  It is a small state machine
+    # that watches the token stream for KEYWORD ( STRING ... ) and records
+    # the string(s) found inside the parentheses, together with the file
+    # and the line on which the opening parenthesis was seen.  Call
+    # set_filename() before tokenizing each file and write() once at the end
+    # to dump the collected messages.
+    def __init__(self, options):
+        # `options' must provide: keywords (the names that mark translatable
+        # strings), location (which location comment style to write) and
+        # the GNU and SOLARIS style constants.
+        self.__options = options
+        # maps normalized message text -> list of (filename, lineno)
+        self.__messages = {}
+        # the current state is the bound method that handles the next token
+        self.__state = self.__waiting
+        self.__data = []
+        self.__lineno = -1
+        # Normally set through set_filename().  Defined here as well so that
+        # tokenizing before set_filename() is called records None instead of
+        # raising AttributeError.
+        self.__curfile = None
+
+    def __call__(self, ttype, tstring, stup, etup, line):
+        # Called once per token.  Only the token type, its text and the
+        # first element of its start position (stup[0]) are of interest;
+        # hand them to whichever state method is current.
+        self.__state(ttype, tstring, stup[0])
+
+    def __waiting(self, ttype, tstring, lineno):
+        # Idle state: look for one of the marker keywords.
+        if ttype == tokenize.NAME and tstring in self.__options.keywords:
+            self.__state = self.__keywordseen
+
+    def __keywordseen(self, ttype, tstring, lineno):
+        # A keyword only counts when the very next token is `('; anything
+        # else sends us back to the idle state.
+        if ttype == tokenize.OP and tstring == '(':
+            self.__data = []
+            self.__lineno = lineno
+            self.__state = self.__openseen
+        else:
+            self.__state = self.__waiting
+
+    def __openseen(self, ttype, tstring, lineno):
+        if ttype == tokenize.OP and tstring == ')':
+            # We've seen the last of the translatable strings.  Record the
+            # line number of the first line of the strings and update the
+            # list of messages seen.  Reset state for the next batch.  If
+            # there were no strings inside _(), then just ignore this entry.
+            if self.__data:
+                msg = string.join(self.__data, '')
+                entry = (self.__curfile, self.__lineno)
+                linenos = self.__messages.get(msg)
+                if linenos is None:
+                    self.__messages[msg] = [entry]
+                else:
+                    linenos.append(entry)
+            self.__state = self.__waiting
+        elif ttype == tokenize.STRING:
+            # each piece is stored in its normalized (already quoted) form
+            self.__data.append(normalize(tstring))
+        # TBD: should we warn if we seen anything else?
+
+    def set_filename(self, filename):
+        # Name to record in the location entries of the messages found from
+        # now on.
+        self.__curfile = filename
+
+    def write(self, fp):
+        # Write the header comments and then one msgid/msgstr entry per
+        # collected message to `fp', which only needs a write() method.
+        #
+        # Output goes straight to `fp'.  Temporarily rebinding sys.stdout
+        # and then resetting it to sys.__stdout__ would clobber any
+        # redirection the caller had already put in place.
+        options = self.__options
+        timestamp = time.ctime(time.time())
+        emit = fp.write
+        # common header
+        emit('# POT file generated by pygettext.py %s\n' % __version__)
+        emit('# %s\n' % timestamp)
+        emit('#\n')
+        for k, v in self.__messages.items():
+            for filename, lineno in v:
+                # location comments are different b/w Solaris and GNU
+                if options.location == options.SOLARIS:
+                    emit('# File: %s, line: %d\n' % (filename, lineno))
+                elif options.location == options.GNU:
+                    emit('#: %s:%d\n' % (filename, lineno))
+            # TBD: sorting, normalizing
+            emit('msgid %s\n' % k)
+            # an untranslated entry carries an explicit empty string; a bare
+            # `msgstr' with nothing after it is not a well-formed entry
+            emit('msgstr ""\n')
+            emit('\n')
+
+
+def main():
+    # Command line entry point: parse the options in sys.argv, scan every
+    # file named on the command line for marked strings and write the
+    # result to the output file (messages.pot unless -d/--default-domain
+    # renames it).
+    default_keywords = ['_']
+    try:
+        # getopt has no notion of an optional option argument: -k, -d and -n
+        # always take a value, and a long option accepts `--name=value' only
+        # when it is listed with a trailing `='.  An empty value is treated
+        # below as "argument omitted".
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            'k:d:n:h',
+            ['keyword=', 'default-domain=', 'help',
+             'add-location=', 'no-location'])
+    except getopt.error, msg:
+        usage(1, msg)
+    # NOTE: -a/--extract-all is described in the usage text but is not
+    # accepted here.
+
+    # for holding option values
+    class Options:
+        # constants
+        GNU = 1
+        SOLARIS = 2
+        # defaults
+        keywords = []
+        outfile = 'messages.pot'
+        location = GNU
+
+    options = Options()
+    locations = {'gnu' : options.GNU,
+                 'solaris' : options.SOLARIS,
+                 }
+
+    # parse options
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage(0)
+        elif opt in ('-k', '--keyword'):
+            # getopt hands over '' (never None) for an empty value
+            if not arg:
+                # no word given: do not look for the default keywords
+                default_keywords = []
+            else:
+                options.keywords.append(arg)
+        elif opt in ('-d', '--default-domain'):
+            options.outfile = arg + '.pot'
+        elif opt in ('-n', '--add-location'):
+            if not arg:
+                arg = 'gnu'
+            try:
+                options.location = locations[string.lower(arg)]
+            except KeyError:
+                usage(1, 'Invalid value for --add-location: ' + arg)
+        elif opt in ('--no-location',):
+            # 0 matches neither style constant, so no comments are written
+            options.location = 0
+
+    # calculate all keywords
+    options.keywords.extend(default_keywords)
+
+    # slurp through all the files
+    eater = TokenEater(options)
+    for filename in args:
+        fp = open(filename)
+        try:
+            eater.set_filename(filename)
+            tokenize.tokenize(fp.readline, eater)
+        finally:
+            # close the source file even if tokenizing blows up
+            fp.close()
+
+    fp = open(options.outfile, 'w')
+    try:
+        eater.write(fp)
+    finally:
+        fp.close()
+
+
+
+# Only run main() when executed as a script, not when imported as a module.
+if __name__ == '__main__':
+    main()
commit	e27db5a3938b64a1e72c853285aa02c8c60c45d2	[log] [tgz]
author	Barry Warsaw <barry@python.org>	Fri Aug 13 20:59:48 1999 +0000
committer	Barry Warsaw <barry@python.org>	Fri Aug 13 20:59:48 1999 +0000
tree	ba6c377440c21ecbd730873743c15b7268438eee
parent	31e5e37196260c58a57f780a688b27b25ccae1bb [diff] [blame]