Diffstat (limited to 'translate-toolkit-1.5.1/translate/misc/sparse.py')
-rw-r--r-- | translate-toolkit-1.5.1/translate/misc/sparse.py | 188 |
1 files changed, 188 insertions, 0 deletions
diff --git a/translate-toolkit-1.5.1/translate/misc/sparse.py b/translate-toolkit-1.5.1/translate/misc/sparse.py
new file mode 100644
index 0000000..bf2ba04
--- /dev/null
+++ b/translate-toolkit-1.5.1/translate/misc/sparse.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""simple parser / string tokenizer
+rather than returning a list of token types etc., we simply return a list of tokens...
+each tokenizing function takes a string as input and returns a list of tokens
+"""
+
+# Copyright 2002, 2003 St James Software
+#
+# This file is part of translate.
+#
+# translate is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# translate is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with translate; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+def stringeval(text):
+    """takes away repeated quotes (escapes) and returns the string represented by the text"""
+    stringchar = text[0]
+    if text[-1] != stringchar or stringchar not in ("'", '"'):
+        # text is not a validly quoted string
+        raise ValueError("error parsing escaped string: %r" % text)
+    return text[1:-1].replace(stringchar + stringchar, stringchar)
+
+def stringquote(text):
+    """escapes quotes as necessary and returns a string representing the text"""
+    if "'" in text:
+        if '"' in text:
+            return '"' + text.replace('"', '""') + '"'
+        else:
+            return '"' + text + '"'
+    else:
+        return "'" + text + "'"
+
+class ParserError(ValueError):
+    """Intelligent parser error"""
+    def __init__(self, parser, message, tokennum):
+        """takes a message and the number of the token that caused the error"""
+        tokenpos = parser.findtokenpos(tokennum)
+        line, charpos = parser.getlinepos(tokenpos)
+        ValueError.__init__(self, "%s at line %d, char %d (token %r)" %
+                            (message, line, charpos, parser.tokens[tokennum]))
+        self.parser = parser
+        self.tokennum = tokennum
+
+class SimpleParser:
+    """this is a simple parser"""
+    def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
+        if defaulttokenlist is None:
+            self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
+            self.defaulttokenlist.extend('(),[]:=+-')
+        else:
+            self.defaulttokenlist = defaulttokenlist
+        self.whitespacechars = whitespacechars
+        self.includewhitespacetokens = includewhitespacetokens
+        self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
+        self.quotechars = ('"', "'")
+        self.endquotechars = {'"': '"', "'": "'"}
+        self.stringescaping = 1
+
+    def stringtokenize(self, text):
+        """makes quoted strings in text into single tokens..."""
+        tokens = []
+        laststart = 0
+        instring = 0
+        endstringchar, escapechar = '', '\\'
+        gotclose, gotescape = 0, 0
+        for pos in range(len(text)):
+            char = text[pos]
+            if instring:
+                if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
+                    gotescape = not gotescape
+                elif char == endstringchar:
+                    # a doubled end quote is an escape, so this may toggle back
+                    gotclose = not gotclose
+                elif gotclose:
+                    # the string ended at the previous character
+                    tokens.append(text[laststart:pos])
+                    instring, laststart, endstringchar = 0, pos, ''
+            if not instring:
+                if char in self.quotechars:
+                    if pos > laststart:
+                        tokens.append(text[laststart:pos])
+                    instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
+        if laststart < len(text):
+            tokens.append(text[laststart:])
+        return tokens
+
+    def keeptogether(self, text):
+        """checks whether a token should be kept together"""
+        return self.isstringtoken(text)
+
+    def isstringtoken(self, text):
+        """checks whether a token is a string token"""
+        return text[:1] in self.quotechars
+
+    def separatetokens(self, text, tokenlist=None):
+        """separates out the tokens in tokenlist from whitespace etc."""
+        if self.keeptogether(text):
+            return [text]
+        if tokenlist is None:
+            tokenlist = self.defaulttokenlist
+        # loop through and put tokens into a list
+        tokens = []
+        pos = 0
+        laststart = 0
+        lentext = len(text)
+        while pos < lentext:
+            foundtoken = 0
+            for token in tokenlist:
+                lentoken = len(token)
+                if text[pos:pos+lentoken] == token:
+                    if laststart < pos:
+                        tokens.append(text[laststart:pos])
+                    tokens.append(token)
+                    pos += lentoken
+                    foundtoken, laststart = 1, pos
+                    break
+            if not foundtoken:
+                pos += 1
+        if laststart < lentext:
+            tokens.append(text[laststart:])
+        return tokens
+
+    def removewhitespace(self, text):
+        """removes whitespace, using it to separate the surrounding text into tokens"""
+        if self.keeptogether(text):
+            return [text]
+        # loop through and put tokens into a list
+        tokens = []
+        inwhitespace = 0
+        laststart = 0
+        for pos in range(len(text)):
+            char = text[pos]
+            if inwhitespace:
+                if char not in self.whitespacechars:
+                    if laststart < pos and self.includewhitespacetokens:
+                        tokens.append(text[laststart:pos])
+                    inwhitespace, laststart = 0, pos
+            else:
+                if char in self.whitespacechars:
+                    if laststart < pos:
+                        tokens.append(text[laststart:pos])
+                    inwhitespace, laststart = 1, pos
+        if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
+            tokens.append(text[laststart:])
+        return tokens
+
+    def applytokenizer(self, inputlist, tokenizer):
+        """apply a tokenizer to a set of text, flattening the result"""
+        joined = []
+        for text in inputlist:
+            joined.extend(tokenizer(text))
+        return joined
+
+    def applytokenizers(self, inputlist, tokenizers):
+        """apply a set of tokenizers to a set of text, flattening each time"""
+        for tokenizer in tokenizers:
+            inputlist = self.applytokenizer(inputlist, tokenizer)
+        return inputlist
+
+    def tokenize(self, source, tokenizers=None):
+        """tokenize the text string with the standard tokenizers"""
+        self.source = source
+        if tokenizers is None:
+            tokenizers = self.standardtokenizers
+        self.tokens = self.applytokenizers([self.source], tokenizers)
+        return self.tokens
+
+    def findtokenpos(self, tokennum):
+        """finds the position of the given token in the source text"""
+        currenttokenpos = 0
+        for token in self.tokens[:tokennum]:
+            # skip right past each preceding token so that a repeated
+            # token is not matched at an earlier position again
+            currenttokenpos = self.source.find(token, currenttokenpos) + len(token)
+        return self.source.find(self.tokens[tokennum], currenttokenpos)
+
+    def getlinepos(self, tokenpos):
+        """finds the line and character position of the given character"""
+        sourcecut = self.source[:tokenpos]
+        line = sourcecut.count("\n") + 1
+        charpos = tokenpos - sourcecut.rfind("\n")
+        return line, charpos
+
+    def raiseerror(self, message, tokennum):
+        """raises a ParserError"""
+        raise ParserError(self, message, tokennum)
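
Editor's note: the three sketches below are not part of the commit. They are
usage examples against the API introduced above, with invented input strings,
and assume the toolkit is importable as translate.misc.sparse. Since the
module itself targets Python 2, the sketches stick to Python 2-compatible code.

A minimal run of the standard tokenizer pipeline: stringtokenize keeps the
quoted string together (doubled quotes act as escapes), removewhitespace drops
the spaces, and separatetokens splits out the operator and punctuation tokens.

    from translate.misc import sparse

    parser = sparse.SimpleParser()
    tokens = parser.tokenize("name == 'John ''the Ripper''' , age >= 30")
    print(tokens)
    # expected: ['name', '==', "'John ''the Ripper'''", ',', 'age', '>=', '30']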
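
stringquote and stringeval are inverses under this doubled-quote escaping
scheme; a round-trip sketch with an invented value:

    raw = 'John\'s "quoted" words'
    # both quote characters occur, so '"' is doubled and used as the delimiter
    quoted = sparse.stringquote(raw)   # '"John\'s ""quoted"" words"'
    assert sparse.stringeval(quoted) == raw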
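
ParserError uses findtokenpos and getlinepos to map a token number back to a
line and character position in the original source; a sketch with an invented
error message and token number:

    parser = sparse.SimpleParser()
    parser.tokenize("alpha = (beta\n + gamma")
    try:
        # token 2 is the '(' at line 1, char 9
        parser.raiseerror("unbalanced bracket", 2)
    except sparse.ParserError as error:
        print(error)
    # expected: unbalanced bracket at line 1, char 9 (token '(')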