diff options
Diffstat (limited to 'babel/messages/extract.py')
-rw-r--r-- | babel/messages/extract.py | 554 |
1 files changed, 0 insertions, 554 deletions
diff --git a/babel/messages/extract.py b/babel/messages/extract.py deleted file mode 100644 index 1f3a662..0000000 --- a/babel/messages/extract.py +++ /dev/null @@ -1,554 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2007 Edgewall Software -# All rights reserved. -# -# This software is licensed as described in the file COPYING, which -# you should have received as part of this distribution. The terms -# are also available at http://babel.edgewall.org/wiki/License. -# -# This software consists of voluntary contributions made by many -# individuals. For the exact contribution history, see the revision -# history and logs, available at http://babel.edgewall.org/log/. - -"""Basic infrastructure for extracting localizable messages from source files. - -This module defines an extensible system for collecting localizable message -strings from a variety of sources. A native extractor for Python source files -is builtin, extractors for other sources can be added using very simple plugins. - -The main entry points into the extraction functionality are the functions -`extract_from_dir` and `extract_from_file`. -""" - -import os -try: - set -except NameError: - from sets import Set as set -import sys -from tokenize import generate_tokens, COMMENT, NAME, OP, STRING - -from babel.util import parse_encoding, pathmatch, relpath -from textwrap import dedent - -__all__ = ['extract', 'extract_from_dir', 'extract_from_file'] -__docformat__ = 'restructuredtext en' - -GROUP_NAME = 'babel.extractors' - -DEFAULT_KEYWORDS = { - '_': None, - 'gettext': None, - 'ngettext': (1, 2), - 'ugettext': None, - 'ungettext': (1, 2), - 'dgettext': (2,), - 'dngettext': (2, 3), - 'N_': None -} - -DEFAULT_MAPPING = [('**.py', 'python')] - -empty_msgid_warning = ( -'%s: warning: Empty msgid. It is reserved by GNU gettext: gettext("") ' -'returns the header entry with meta information, not the empty string.') - - -def _strip_comment_tags(comments, tags): - """Helper function for `extract` that strips comment tags from strings - in a list of comment lines. This functions operates in-place. - """ - def _strip(line): - for tag in tags: - if line.startswith(tag): - return line[len(tag):].strip() - return line - comments[:] = map(_strip, comments) - - -def extract_from_dir(dirname=os.getcwd(), method_map=DEFAULT_MAPPING, - options_map=None, keywords=DEFAULT_KEYWORDS, - comment_tags=(), callback=None, strip_comment_tags=False): - """Extract messages from any source files found in the given directory. - - This function generates tuples of the form: - - ``(filename, lineno, message, comments)`` - - Which extraction method is used per file is determined by the `method_map` - parameter, which maps extended glob patterns to extraction method names. - For example, the following is the default mapping: - - >>> method_map = [ - ... ('**.py', 'python') - ... ] - - This basically says that files with the filename extension ".py" at any - level inside the directory should be processed by the "python" extraction - method. Files that don't match any of the mapping patterns are ignored. See - the documentation of the `pathmatch` function for details on the pattern - syntax. - - The following extended mapping would also use the "genshi" extraction - method on any file in "templates" subdirectory: - - >>> method_map = [ - ... ('**/templates/**.*', 'genshi'), - ... ('**.py', 'python') - ... ] - - The dictionary provided by the optional `options_map` parameter augments - these mappings. It uses extended glob patterns as keys, and the values are - dictionaries mapping options names to option values (both strings). - - The glob patterns of the `options_map` do not necessarily need to be the - same as those used in the method mapping. For example, while all files in - the ``templates`` folders in an application may be Genshi applications, the - options for those files may differ based on extension: - - >>> options_map = { - ... '**/templates/**.txt': { - ... 'template_class': 'genshi.template:TextTemplate', - ... 'encoding': 'latin-1' - ... }, - ... '**/templates/**.html': { - ... 'include_attrs': '' - ... } - ... } - - :param dirname: the path to the directory to extract messages from - :param method_map: a list of ``(pattern, method)`` tuples that maps of - extraction method names to extended glob patterns - :param options_map: a dictionary of additional options (optional) - :param keywords: a dictionary mapping keywords (i.e. names of functions - that should be recognized as translation functions) to - tuples that specify which of their arguments contain - localizable strings - :param comment_tags: a list of tags of translator comments to search for - and include in the results - :param callback: a function that is called for every file that message are - extracted from, just before the extraction itself is - performed; the function is passed the filename, the name - of the extraction method and and the options dictionary as - positional arguments, in that order - :param strip_comment_tags: a flag that if set to `True` causes all comment - tags to be removed from the collected comments. - :return: an iterator over ``(filename, lineno, funcname, message)`` tuples - :rtype: ``iterator`` - :see: `pathmatch` - """ - if options_map is None: - options_map = {} - - absname = os.path.abspath(dirname) - for root, dirnames, filenames in os.walk(absname): - for subdir in dirnames: - if subdir.startswith('.') or subdir.startswith('_'): - dirnames.remove(subdir) - dirnames.sort() - filenames.sort() - for filename in filenames: - filename = relpath( - os.path.join(root, filename).replace(os.sep, '/'), - dirname - ) - for pattern, method in method_map: - if pathmatch(pattern, filename): - filepath = os.path.join(absname, filename) - options = {} - for opattern, odict in options_map.items(): - if pathmatch(opattern, filename): - options = odict - if callback: - callback(filename, method, options) - for lineno, message, comments in \ - extract_from_file(method, filepath, - keywords=keywords, - comment_tags=comment_tags, - options=options, - strip_comment_tags= - strip_comment_tags): - yield filename, lineno, message, comments - break - - -def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS, - comment_tags=(), options=None, strip_comment_tags=False): - """Extract messages from a specific file. - - This function returns a list of tuples of the form: - - ``(lineno, funcname, message)`` - - :param filename: the path to the file to extract messages from - :param method: a string specifying the extraction method (.e.g. "python") - :param keywords: a dictionary mapping keywords (i.e. names of functions - that should be recognized as translation functions) to - tuples that specify which of their arguments contain - localizable strings - :param comment_tags: a list of translator tags to search for and include - in the results - :param strip_comment_tags: a flag that if set to `True` causes all comment - tags to be removed from the collected comments. - :param options: a dictionary of additional options (optional) - :return: the list of extracted messages - :rtype: `list` - """ - fileobj = open(filename, 'U') - try: - return list(extract(method, fileobj, keywords, comment_tags, options, - strip_comment_tags)) - finally: - fileobj.close() - - -def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(), - options=None, strip_comment_tags=False): - """Extract messages from the given file-like object using the specified - extraction method. - - This function returns a list of tuples of the form: - - ``(lineno, message, comments)`` - - The implementation dispatches the actual extraction to plugins, based on the - value of the ``method`` parameter. - - >>> source = '''# foo module - ... def run(argv): - ... print _('Hello, world!') - ... ''' - - >>> from StringIO import StringIO - >>> for message in extract('python', StringIO(source)): - ... print message - (3, u'Hello, world!', []) - - :param method: a string specifying the extraction method (.e.g. "python"); - if this is a simple name, the extraction function will be - looked up by entry point; if it is an explicit reference - to a function (of the form ``package.module:funcname`` or - ``package.module.funcname``), the corresponding function - will be imported and used - :param fileobj: the file-like object the messages should be extracted from - :param keywords: a dictionary mapping keywords (i.e. names of functions - that should be recognized as translation functions) to - tuples that specify which of their arguments contain - localizable strings - :param comment_tags: a list of translator tags to search for and include - in the results - :param options: a dictionary of additional options (optional) - :param strip_comment_tags: a flag that if set to `True` causes all comment - tags to be removed from the collected comments. - :return: the list of extracted messages - :rtype: `list` - :raise ValueError: if the extraction method is not registered - """ - func = None - if ':' in method or '.' in method: - if ':' not in method: - lastdot = method.rfind('.') - module, attrname = method[:lastdot], method[lastdot + 1:] - else: - module, attrname = method.split(':', 1) - func = getattr(__import__(module, {}, {}, [attrname]), attrname) - else: - try: - from pkg_resources import working_set - except ImportError: - # pkg_resources is not available, so we resort to looking up the - # builtin extractors directly - builtin = {'ignore': extract_nothing, 'python': extract_python} - func = builtin.get(method) - else: - for entry_point in working_set.iter_entry_points(GROUP_NAME, - method): - func = entry_point.load(require=True) - break - if func is None: - raise ValueError('Unknown extraction method %r' % method) - - results = func(fileobj, keywords.keys(), comment_tags, - options=options or {}) - - for lineno, funcname, messages, comments in results: - if funcname: - spec = keywords[funcname] or (1,) - else: - spec = (1,) - if not isinstance(messages, (list, tuple)): - messages = [messages] - if not messages: - continue - - # Validate the messages against the keyword's specification - msgs = [] - invalid = False - # last_index is 1 based like the keyword spec - last_index = len(messages) - for index in spec: - if last_index < index: - # Not enough arguments - invalid = True - break - message = messages[index - 1] - if message is None: - invalid = True - break - msgs.append(message) - if invalid: - continue - - first_msg_index = spec[0] - 1 - if not messages[first_msg_index]: - # An empty string msgid isn't valid, emit a warning - where = '%s:%i' % (hasattr(fileobj, 'name') and \ - fileobj.name or '(unknown)', lineno) - print >> sys.stderr, empty_msgid_warning % where - continue - - messages = tuple(msgs) - if len(messages) == 1: - messages = messages[0] - - if strip_comment_tags: - _strip_comment_tags(comments, comment_tags) - yield lineno, messages, comments - - -def extract_nothing(fileobj, keywords, comment_tags, options): - """Pseudo extractor that does not actually extract anything, but simply - returns an empty list. - """ - return [] - - -def extract_python(fileobj, keywords, comment_tags, options): - """Extract messages from Python source code. - - :param fileobj: the seekable, file-like object the messages should be - extracted from - :param keywords: a list of keywords (i.e. function names) that should be - recognized as translation functions - :param comment_tags: a list of translator tags to search for and include - in the results - :param options: a dictionary of additional options (optional) - :return: an iterator over ``(lineno, funcname, message, comments)`` tuples - :rtype: ``iterator`` - """ - funcname = lineno = message_lineno = None - call_stack = -1 - buf = [] - messages = [] - translator_comments = [] - in_def = in_translator_comments = False - comment_tag = None - - encoding = parse_encoding(fileobj) or options.get('encoding', 'iso-8859-1') - - tokens = generate_tokens(fileobj.readline) - for tok, value, (lineno, _), _, _ in tokens: - if call_stack == -1 and tok == NAME and value in ('def', 'class'): - in_def = True - elif tok == OP and value == '(': - if in_def: - # Avoid false positives for declarations such as: - # def gettext(arg='message'): - in_def = False - continue - if funcname: - message_lineno = lineno - call_stack += 1 - elif in_def and tok == OP and value == ':': - # End of a class definition without parens - in_def = False - continue - elif call_stack == -1 and tok == COMMENT: - # Strip the comment token from the line - value = value.decode(encoding)[1:].strip() - if in_translator_comments and \ - translator_comments[-1][0] == lineno - 1: - # We're already inside a translator comment, continue appending - translator_comments.append((lineno, value)) - continue - # If execution reaches this point, let's see if comment line - # starts with one of the comment tags - for comment_tag in comment_tags: - if value.startswith(comment_tag): - in_translator_comments = True - translator_comments.append((lineno, value)) - break - elif funcname and call_stack == 0: - if tok == OP and value == ')': - if buf: - messages.append(''.join(buf)) - del buf[:] - else: - messages.append(None) - - if len(messages) > 1: - messages = tuple(messages) - else: - messages = messages[0] - # Comments don't apply unless they immediately preceed the - # message - if translator_comments and \ - translator_comments[-1][0] < message_lineno - 1: - translator_comments = [] - - yield (message_lineno, funcname, messages, - [comment[1] for comment in translator_comments]) - - funcname = lineno = message_lineno = None - call_stack = -1 - messages = [] - translator_comments = [] - in_translator_comments = False - elif tok == STRING: - # Unwrap quotes in a safe manner, maintaining the string's - # encoding - # https://sourceforge.net/tracker/?func=detail&atid=355470& - # aid=617979&group_id=5470 - value = eval('# coding=%s\n%s' % (encoding, value), - {'__builtins__':{}}, {}) - if isinstance(value, str): - value = value.decode(encoding) - buf.append(value) - elif tok == OP and value == ',': - if buf: - messages.append(''.join(buf)) - del buf[:] - else: - messages.append(None) - if translator_comments: - # We have translator comments, and since we're on a - # comma(,) user is allowed to break into a new line - # Let's increase the last comment's lineno in order - # for the comment to still be a valid one - old_lineno, old_comment = translator_comments.pop() - translator_comments.append((old_lineno+1, old_comment)) - elif call_stack > 0 and tok == OP and value == ')': - call_stack -= 1 - elif funcname and call_stack == -1: - funcname = None - elif tok == NAME and value in keywords: - funcname = value - - -def extract_javascript(fileobj, keywords, comment_tags, options): - """Extract messages from JavaScript source code. - - :param fileobj: the seekable, file-like object the messages should be - extracted from - :param keywords: a list of keywords (i.e. function names) that should be - recognized as translation functions - :param comment_tags: a list of translator tags to search for and include - in the results - :param options: a dictionary of additional options (optional) - :return: an iterator over ``(lineno, funcname, message, comments)`` tuples - :rtype: ``iterator`` - """ - from babel.messages.jslexer import tokenize, unquote_string - funcname = message_lineno = None - messages = [] - last_argument = None - translator_comments = [] - concatenate_next = False - encoding = options.get('encoding', 'utf-8') - last_token = None - call_stack = -1 - - for token in tokenize(fileobj.read().decode(encoding)): - if token.type == 'operator' and token.value == '(': - if funcname: - message_lineno = token.lineno - call_stack += 1 - - elif call_stack == -1 and token.type == 'linecomment': - value = token.value[2:].strip() - if translator_comments and \ - translator_comments[-1][0] == token.lineno - 1: - translator_comments.append((token.lineno, value)) - continue - - for comment_tag in comment_tags: - if value.startswith(comment_tag): - translator_comments.append((token.lineno, value.strip())) - break - - elif token.type == 'multilinecomment': - # only one multi-line comment may preceed a translation - translator_comments = [] - value = token.value[2:-2].strip() - for comment_tag in comment_tags: - if value.startswith(comment_tag): - lines = value.splitlines() - if lines: - lines[0] = lines[0].strip() - lines[1:] = dedent('\n'.join(lines[1:])).splitlines() - for offset, line in enumerate(lines): - translator_comments.append((token.lineno + offset, - line)) - break - - elif funcname and call_stack == 0: - if token.type == 'operator' and token.value == ')': - if last_argument is not None: - messages.append(last_argument) - if len(messages) > 1: - messages = tuple(messages) - elif messages: - messages = messages[0] - else: - messages = None - - # Comments don't apply unless they immediately preceed the - # message - if translator_comments and \ - translator_comments[-1][0] < message_lineno - 1: - translator_comments = [] - - if messages is not None: - yield (message_lineno, funcname, messages, - [comment[1] for comment in translator_comments]) - - funcname = message_lineno = last_argument = None - concatenate_next = False - translator_comments = [] - messages = [] - call_stack = -1 - - elif token.type == 'string': - new_value = unquote_string(token.value) - if concatenate_next: - last_argument = (last_argument or '') + new_value - concatenate_next = False - else: - last_argument = new_value - - elif token.type == 'operator': - if token.value == ',': - if last_argument is not None: - messages.append(last_argument) - last_argument = None - else: - messages.append(None) - concatenate_next = False - elif token.value == '+': - concatenate_next = True - - elif call_stack > 0 and token.type == 'operator' \ - and token.value == ')': - call_stack -= 1 - - elif funcname and call_stack == -1: - funcname = None - - elif call_stack == -1 and token.type == 'name' and \ - token.value in keywords and \ - (last_token is None or last_token.type != 'name' or - last_token.value != 'function'): - funcname = token.value - - last_token = token |