diff options
Diffstat (limited to 'babel/messages/jslexer.py')
-rw-r--r-- | babel/messages/jslexer.py | 175 |
1 files changed, 175 insertions, 0 deletions
# -*- coding: utf-8 -*-
#
# Copyright (C) 2008 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.
#
# (Reconstructed from the diff of babel/messages/jslexer.py, new file,
# index 0000000..d063ef0.)

"""A simple JavaScript 1.5 lexer which is used for the JavaScript
extractor.
"""

import re
from operator import itemgetter

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: ``chr`` already returns a text character


# All operators the lexer knows about.  The division operators (``/`` and
# ``/=``) are intentionally missing: they are ambiguous with regular
# expression literals and are handled separately in `tokenize`.
operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':',
    '?', '===', '!=='
]
# Longest operators first, so that the alternation built from this list
# prefers e.g. ``>>>=`` over ``>``.  The sort is stable, so operators of
# equal length keep their relative order.
operators.sort(key=len, reverse=True)

# Single character escapes and the characters they stand for.
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# The ordered lexer rules as ``(token_type, compiled_regex)`` pairs.  The
# first matching rule wins; a token type of ``None`` means the matched text
# is skipped without yielding a token.
rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)', re.UNICODE)),
    # The hexadecimal alternative has to be tried first, otherwise the
    # decimal alternative consumes the leading ``0`` of ``0x1F`` on its own.
    ('number', re.compile(r'''(
        (0[xX][a-fA-F0-9]+) |
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''', re.VERBOSE)),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL))
]

division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


class Token(tuple):
    """Represents a token as returned by `tokenize`.

    A token is a ``(type, value, lineno)`` tuple whose items are also
    available as the read-only attributes `type`, `value` and `lineno`.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        return tuple.__new__(cls, (type, value, lineno))

    type = property(itemgetter(0))
    value = property(itemgetter(1))
    lineno = property(itemgetter(2))


def indicates_division(token):
    """A helper function that helps the tokenizer to decide if the current
    token may be followed by a division operator.

    :param token: the `Token` that was processed last
    :return: `True` if a following ``/`` is a division operator, `False`
             if it starts a regular expression literal
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')


def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start with
    string delimiters (``'`` or ``"``.)

    :param string: the string literal including its delimiters
    :return: a string
    :raise AssertionError: if the string is not properly delimited
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    # a backslash followed by a line break is replaced by the line break
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0
    end = len(string)

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # a dangling backslash at the very end escapes nothing.  Treat it
        # like any other bogus escape and drop it instead of failing with
        # an IndexError.
        if escape_pos + 1 >= end:
            pos = end
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four characters of
        # hexadecimal characters and try to interpret them as unicode
        # character point.  If there is no such character point, put
        # all the consumed characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(_unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < end:
        add(string[pos:])

    return u''.join(result)


def tokenize(source):
    """Tokenize a JavaScript source.

    Whitespace and ``<!--`` lines are skipped; everything else, including
    comments, is yielded.  Characters that match no rule are skipped one at
    a time.

    :param source: the JavaScript source as string
    :return: generator of `Token` objects
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed non-whitespace token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # woops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        # the token carries the line it starts on; count the line breaks
        # inside it afterwards so that following tokens get the right line.
        lineno += len(line_re.findall(token_value))
        pos = match.end()