diff options
Diffstat (limited to 'translate-toolkit-1.5.1/translate/storage/pypo.py')
-rw-r--r-- | translate-toolkit-1.5.1/translate/storage/pypo.py | 845 |
1 files changed, 845 insertions, 0 deletions
diff --git a/translate-toolkit-1.5.1/translate/storage/pypo.py b/translate-toolkit-1.5.1/translate/storage/pypo.py new file mode 100644 index 0000000..885b1a2 --- /dev/null +++ b/translate-toolkit-1.5.1/translate/storage/pypo.py @@ -0,0 +1,845 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright 2002-2009 Zuza Software Foundation +# +# This file is part of the Translate Toolkit. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. + +"""classes that hold units of .po files (pounit) or entire files (pofile) +gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)""" + +from __future__ import generators +from translate.misc.multistring import multistring +from translate.misc import quote +from translate.misc import textwrap +from translate.lang import data +from translate.storage import pocommon, base +import re +import copy +import cStringIO +import poparser + +lsep = "\n#: " +"""Seperator for #: entries""" + +# general functions for quoting / unquoting po strings + +po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'} +po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()]) + +def escapeforpo(line): + """Escapes a line for po format. assumes no \n occurs in the line. + + @param line: unescaped text + """ + special_locations = [] + for special_key in po_escape_map: + special_locations.extend(quote.find_all(line, special_key)) + special_locations = dict.fromkeys(special_locations).keys() + special_locations.sort() + escaped_line = "" + last_location = 0 + for location in special_locations: + escaped_line += line[last_location:location] + escaped_line += po_escape_map[line[location:location+1]] + last_location = location+1 + escaped_line += line[last_location:] + return escaped_line + +def unescapehandler(escape): + + return po_unescape_map.get(escape, escape) + +def wrapline(line): + """Wrap text for po files.""" + wrappedlines = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False) + + # Lines should not start with a space... + if len(wrappedlines) > 1: + for index, line in enumerate(wrappedlines[1:]): + if line.startswith(' '): + # Remove the space at the beginning of the line: + wrappedlines[index+1] = line[1:] + + # Append a space to the previous line: + wrappedlines[index] += ' ' + return wrappedlines + +def quoteforpo(text): + """quotes the given text for a PO file, returning quoted and escaped lines""" + polines = [] + if text is None: + return polines + lines = text.split("\n") + if len(lines) > 1 or (len(lines) == 1 and len(lines[0]) > 71): + if len(lines) != 2 or lines[1]: + polines.extend(['""']) + for line in lines[:-1]: + #TODO: We should only wrap after escaping + lns = wrapline(line) + if len(lns) > 0: + for ln in lns[:-1]: + polines.extend(['"' + escapeforpo(ln) + '"']) + if lns[-1]: + polines.extend(['"' + escapeforpo(lns[-1]) + '\\n"']) + else: + polines.extend(['"\\n"']) + if lines[-1]: + polines.extend(['"' + escapeforpo(line) + '"' for line in wrapline(lines[-1])]) + return polines + +def extractpoline(line): + """Remove quote and unescape line from po file. + + @param line: a quoted line from a po file (msgid or msgstr) + """ + extracted = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)[0] + return extracted + +def unquotefrompo(postr): + return u"".join([extractpoline(line) for line in postr]) + +def encodingToUse(encoding): + """Tests whether the given encoding is known in the python runtime, or returns utf-8. + This function is used to ensure that a valid encoding is always used.""" + if encoding == "CHARSET" or encoding == None: + return 'utf-8' + return encoding +# if encoding is None: return False +# return True +# try: +# tuple = codecs.lookup(encoding) +# except LookupError: +# return False +# return True + +def is_null(lst): + return lst == [] or len(lst) == 1 and lst[0] == '""' + +def extractstr(string): + left = string.find('"') + right = string.rfind('"') + if right > -1: + return string[left:right+1] + else: + return string[left:] + '"' + +class pounit(pocommon.pounit): + # othercomments = [] # # this is another comment + # automaticcomments = [] # #. comment extracted from the source code + # sourcecomments = [] # #: sourcefile.xxx:35 + # prev_msgctxt = [] # #| The previous values that msgctxt and msgid held + # prev_msgid = [] # + # prev_msgid_plural = [] # + # typecomments = [] # #, fuzzy + # msgidcomments = [] # _: within msgid + # msgctxt + # msgid = [] + # msgstr = [] + + # Our homegrown way to indicate what must be copied in a shallow + # fashion + __shallow__ = ['_store'] + + def __init__(self, source=None, encoding="UTF-8"): + self._encoding = encodingToUse(encoding) + self.obsolete = False + self._initallcomments(blankall=True) + self.prev_msgctxt = [] + self.prev_msgid = [] + self.prev_msgid_plural = [] + self.msgctxt = [] + self.msgid = [] + self.msgid_pluralcomments = [] + self.msgid_plural = [] + self.msgstr = [] + self.obsoletemsgctxt = [] + self.obsoletemsgid = [] + self.obsoletemsgid_pluralcomments = [] + self.obsoletemsgid_plural = [] + self.obsoletemsgstr = [] + pocommon.pounit.__init__(self, source) + + def _initallcomments(self, blankall=False): + """Initialises allcomments""" + if blankall: + self.othercomments = [] + self.automaticcomments = [] + self.sourcecomments = [] + self.typecomments = [] + self.msgidcomments = [] + self.obsoletemsgidcomments = [] + + def _get_all_comments(self): + return [self.othercomments, + self.automaticcomments, + self.sourcecomments, + self.typecomments, + self.msgidcomments, + self.obsoletemsgidcomments] + + allcomments = property(_get_all_comments) + + def _get_source_vars(self, msgid, msgid_plural): + multi = multistring(unquotefrompo(msgid), self._encoding) + if self.hasplural(): + pluralform = unquotefrompo(msgid_plural) + if isinstance(pluralform, str): + pluralform = pluralform.decode(self._encoding) + multi.strings.append(pluralform) + return multi + + def _set_source_vars(self, source): + msgid = None + msgid_plural = None + if isinstance(source, str): + source = source.decode(self._encoding) + if isinstance(source, multistring): + source = source.strings + if isinstance(source, list): + msgid = quoteforpo(source[0]) + if len(source) > 1: + msgid_plural = quoteforpo(source[1]) + else: + msgid_plural = [] + else: + msgid = quoteforpo(source) + msgid_plural = [] + return msgid, msgid_plural + + def getsource(self): + """Returns the unescaped msgid""" + return self._get_source_vars(self.msgid, self.msgid_plural) + + def setsource(self, source): + """Sets the msgid to the given (unescaped) value. + + @param source: an unescaped source string. + """ + self.msgid, self.msgid_plural = self._set_source_vars(source) + source = property(getsource, setsource) + + def _get_prev_source(self): + """Returns the unescaped msgid""" + return self._get_source_vars(self.prev_msgid, self.prev_msgid_plural) + + def _set_prev_source(self, source): + """Sets the msgid to the given (unescaped) value. + + @param source: an unescaped source string. + """ + self.prev_msgid, self.prev_msgid_plural = self._set_source_vars(source) + prev_source = property(_get_prev_source, _set_prev_source) + + def gettarget(self): + """Returns the unescaped msgstr""" + if isinstance(self.msgstr, dict): + multi = multistring(map(unquotefrompo, self.msgstr.values()), self._encoding) + else: + multi = multistring(unquotefrompo(self.msgstr), self._encoding) + return multi + + def settarget(self, target): + """Sets the msgstr to the given (unescaped) value""" + self._rich_target = None + if isinstance(target, str): + target = target.decode(self._encoding) + if self.hasplural(): + if isinstance(target, multistring): + target = target.strings + elif isinstance(target, basestring): + target = [target] + elif isinstance(target, (dict, list)): + if len(target) == 1: + target = target[0] + else: + raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target)) + templates = self.msgstr + if isinstance(templates, list): + templates = {0: templates} + if isinstance(target, list): + self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))]) + elif isinstance(target, dict): + self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()]) + else: + self.msgstr = quoteforpo(target) + target = property(gettarget, settarget) + + def getnotes(self, origin=None): + """Return comments based on origin value (programmer, developer, source code and translator)""" + if origin == None: + comments = u"".join([comment[2:] for comment in self.othercomments]) + comments += u"".join([comment[3:] for comment in self.automaticcomments]) + elif origin == "translator": + comments = u"".join ([comment[2:] for comment in self.othercomments]) + elif origin in ["programmer", "developer", "source code"]: + comments = u"".join([comment[3:] for comment in self.automaticcomments]) + else: + raise ValueError("Comment type not valid") + # Let's drop the last newline + return comments[:-1] + + def addnote(self, text, origin=None, position="append"): + """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote""" + # ignore empty strings and strings without non-space characters + if not (text and text.strip()): + return + text = data.forceunicode(text) + commentlist = self.othercomments + linestart = "# " + if origin in ["programmer", "developer", "source code"]: + autocomments = True + commentlist = self.automaticcomments + linestart = "#. " + text = text.split("\n") + if position == "append": + commentlist += [linestart + line + "\n" for line in text] + else: + newcomments = [linestart + line + "\n" for line in text] + newcomments += [line for line in commentlist] + if autocomments: + self.automaticcomments = newcomments + else: + self.othercomments = newcomments + + def removenotes(self): + """Remove all the translator's notes (other comments)""" + self.othercomments = [] + + def __deepcopy__(self, memo={}): + # Make an instance to serve as the copy + new_unit = self.__class__() + # We'll be testing membership frequently, so make a set from + # self.__shallow__ + shallow = set(self.__shallow__) + # Make deep copies of all members which are not in shallow + for key, value in self.__dict__.iteritems(): + if key not in shallow: + setattr(new_unit, key, copy.deepcopy(value)) + # Make shallow copies of all members which are in shallow + for key in set(shallow): + setattr(new_unit, key, getattr(self, key)) + # Mark memo with ourself, so that we won't get deep copied + # again + memo[id(self)] = self + # Return our copied unit + return new_unit + + def copy(self): + return copy.deepcopy(self) + + def _msgidlen(self): + if self.hasplural(): + return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip()) + else: + return len(unquotefrompo(self.msgid).strip()) + + def _msgstrlen(self): + if isinstance(self.msgstr, dict): + combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()]) + return len(combinedstr.strip()) + else: + return len(unquotefrompo(self.msgstr).strip()) + + def merge(self, otherpo, overwrite=False, comments=True, authoritative=False): + """Merges the otherpo (with the same msgid) into this one. + + Overwrite non-blank self.msgstr only if overwrite is True + merge comments only if comments is True + """ + + def mergelists(list1, list2, split=False): + #decode where necessary + if unicode in [type(item) for item in list2] + [type(item) for item in list1]: + for position, item in enumerate(list1): + if isinstance(item, str): + list1[position] = item.decode("utf-8") + for position, item in enumerate(list2): + if isinstance(item, str): + list2[position] = item.decode("utf-8") + + #Determine the newline style of list1 + lineend = "" + if list1 and list1[0]: + for candidate in ["\n", "\r", "\n\r"]: + if list1[0].endswith(candidate): + lineend = candidate + if not lineend: + lineend = "" + else: + lineend = "\n" + + #Split if directed to do so: + if split: + splitlist1 = [] + splitlist2 = [] + prefix = "#" + for item in list1: + splitlist1.extend(item.split()[1:]) + prefix = item.split()[0] + for item in list2: + splitlist2.extend(item.split()[1:]) + prefix = item.split()[0] + list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1]) + else: + #Normal merge, but conform to list1 newline style + if list1 != list2: + for item in list2: + if lineend: + item = item.rstrip() + lineend + # avoid duplicate comment lines (this might cause some problems) + if item not in list1 or len(item) < 5: + list1.append(item) + if not isinstance(otherpo, pounit): + super(pounit, self).merge(otherpo, overwrite, comments) + return + if comments: + mergelists(self.othercomments, otherpo.othercomments) + mergelists(self.typecomments, otherpo.typecomments) + if not authoritative: + # We don't bring across otherpo.automaticcomments as we consider ourself + # to be the the authority. Same applies to otherpo.msgidcomments + mergelists(self.automaticcomments, otherpo.automaticcomments) + mergelists(self.msgidcomments, otherpo.msgidcomments) + mergelists(self.sourcecomments, otherpo.sourcecomments, split=True) + if not self.istranslated() or overwrite: + # Remove kde-style comments from the translation (if any). + if self._extract_msgidcomments(otherpo.target): + otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments()+ '\n', '') + self.target = otherpo.target + if self.source != otherpo.source or self.getcontext() != otherpo.getcontext(): + self.markfuzzy() + else: + self.markfuzzy(otherpo.isfuzzy()) + elif not otherpo.istranslated(): + if self.source != otherpo.source: + self.markfuzzy() + else: + if self.target != otherpo.target: + self.markfuzzy() + + def isheader(self): + #return (self._msgidlen() == 0) and (self._msgstrlen() > 0) and (len(self.msgidcomments) == 0) + #rewritten here for performance: + return (is_null(self.msgid) + and not is_null(self.msgstr) + and self.msgidcomments == [] + and is_null(self.msgctxt) + ) + + def isblank(self): + if self.isheader() or len(self.msgidcomments): + return False + if (self._msgidlen() == 0) and (self._msgstrlen() == 0) and (is_null(self.msgctxt)): + return True + return False + # TODO: remove: + # Before, the equivalent of the following was the final return statement: + # return len(self.source.strip()) == 0 + + def hastypecomment(self, typecomment): + """Check whether the given type comment is present""" + # check for word boundaries properly by using a regular expression... + return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0 + + def hasmarkedcomment(self, commentmarker): + """Check whether the given comment marker is present as # (commentmarker) ...""" + commentmarker = "(%s)" % commentmarker + for comment in self.othercomments: + if comment.replace("#", "", 1).strip().startswith(commentmarker): + return True + return False + + def settypecomment(self, typecomment, present=True): + """Alters whether a given typecomment is present""" + if self.hastypecomment(typecomment) != present: + if present: + self.typecomments.append("#, %s\n" % typecomment) + else: + # this should handle word boundaries properly ... + typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments) + self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments) + + def isfuzzy(self): + return self.hastypecomment("fuzzy") + + def markfuzzy(self, present=True): + self.settypecomment("fuzzy", present) + + def isobsolete(self): + return self.obsolete + + def makeobsolete(self): + """Makes this unit obsolete""" + self.obsolete = True + if self.msgctxt: + self.obsoletemsgctxt = self.msgctxt + if self.msgid: + self.obsoletemsgid = self.msgid + self.msgid = [] + if self.msgidcomments: + self.obsoletemsgidcomments = self.msgidcomments + self.msgidcomments = [] + if self.msgid_plural: + self.obsoletemsgid_plural = self.msgid_plural + self.msgid_plural = [] + if self.msgstr: + self.obsoletemsgstr = self.msgstr + self.msgstr = [] + self.sourcecomments = [] + self.automaticcomments = [] + + def resurrect(self): + """Makes an obsolete unit normal""" + self.obsolete = False + if self.obsoletemsgctxt: + self.msgid = self.obsoletemsgctxt + self.obsoletemsgctxt = [] + if self.obsoletemsgid: + self.msgid = self.obsoletemsgid + self.obsoletemsgid = [] + if self.obsoletemsgidcomments: + self.msgidcomments = self.obsoletemsgidcomments + self.obsoletemsgidcomments = [] + if self.obsoletemsgid_plural: + self.msgid_plural = self.obsoletemsgid_plural + self.obsoletemsgid_plural = [] + if self.obsoletemsgstr: + self.msgstr = self.obsoletemsgstr + self.obsoletemgstr = [] + + def hasplural(self): + """returns whether this pounit contains plural strings...""" + return len(self.msgid_plural) > 0 + + def parse(self, src): + return poparser.parse_unit(poparser.ParseState(cStringIO.StringIO(src), pounit), self) + + def _getmsgpartstr(self, partname, partlines, partcomments=""): + if isinstance(partlines, dict): + partkeys = partlines.keys() + partkeys.sort() + return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys]) + partstr = partname + " " + partstartline = 0 + if len(partlines) > 0 and len(partcomments) == 0: + partstr += partlines[0] + partstartline = 1 + elif len(partcomments) > 0: + if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0: + # if there is a blank leader line, it must come before the comment + partstr += partlines[0] + '\n' + # but if the whole string is blank, leave it in + if len(partlines) > 1: + partstartline += 1 + else: + # All partcomments should start on a newline + partstr += '""\n' + # combine comments into one if more than one + if len(partcomments) > 1: + combinedcomment = [] + for comment in partcomments: + comment = unquotefrompo([comment]) + if comment.startswith("_:"): + comment = comment[len("_:"):] + if comment.endswith("\\n"): + comment = comment[:-len("\\n")] + #Before we used to strip. Necessary in some cases? + combinedcomment.append(comment) + partcomments = quoteforpo("_:%s" % "".join(combinedcomment)) + # comments first, no blank leader line needed + partstr += "\n".join(partcomments) + partstr = quote.rstripeol(partstr) + else: + partstr += '""' + partstr += '\n' + # add the rest + for partline in partlines[partstartline:]: + partstr += partline + '\n' + return partstr + + def _encodeifneccessary(self, output): + """encodes unicode strings and returns other strings unchanged""" + if isinstance(output, unicode): + encoding = encodingToUse(getattr(self, "encoding", "UTF-8")) + return output.encode(encoding) + return output + + def __str__(self): + """convert to a string. double check that unicode is handled somehow here""" + output = self._getoutput() + return self._encodeifneccessary(output) + + def _getoutput(self): + """return this po element as a string""" + def add_prev_msgid_lines(lines, header, var): + if len(var) > 0: + lines.append("#| %s %s\n" % (header, var[0])) + lines.extend("#| %s\n" % line for line in var[1:]) + + def add_prev_msgid_info(lines): + add_prev_msgid_lines(lines, 'msgctxt', self.prev_msgctxt) + add_prev_msgid_lines(lines, 'msgid', self.prev_msgid) + add_prev_msgid_lines(lines, 'msgid_plural', self.prev_msgid_plural) + + lines = [] + lines.extend(self.othercomments) + if self.isobsolete(): + lines.extend(self.typecomments) + obsoletelines = [] + if self.obsoletemsgctxt: + obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt)) + obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments)) + if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments: + obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments)) + obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr)) + for index, obsoleteline in enumerate(obsoletelines): + # We need to account for a multiline msgid or msgstr here + obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "') + lines.extend(obsoletelines) + lines = [self._encodeifneccessary(line) for line in lines] + return "".join(lines) + # if there's no msgid don't do msgid and string, unless we're the header + # this will also discard any comments other than plain othercomments... + if is_null(self.msgid): + if not (self.isheader() or self.getcontext() or self.sourcecomments): + return "".join(lines) + lines.extend(self.automaticcomments) + lines.extend(self.sourcecomments) + lines.extend(self.typecomments) + add_prev_msgid_info(lines) + if self.msgctxt: + lines.append(self._getmsgpartstr("msgctxt", self.msgctxt)) + lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments)) + if self.msgid_plural or self.msgid_pluralcomments: + lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments)) + lines.append(self._getmsgpartstr("msgstr", self.msgstr)) + lines = [self._encodeifneccessary(line) for line in lines] + postr = "".join(lines) + return postr + + def getlocations(self): + """Get a list of locations from sourcecomments in the PO unit + + rtype: List + return: A list of the locations with '#: ' stripped + + """ + locations = [] + for sourcecomment in self.sourcecomments: + locations += quote.rstripeol(sourcecomment)[3:].split() + return locations + + def addlocation(self, location): + """Add a location to sourcecomments in the PO unit + + @param location: Text location e.g. 'file.c:23' does not include #: + @type location: String + + """ + self.sourcecomments.append("#: %s\n" % location) + + def _extract_msgidcomments(self, text=None): + """Extract KDE style msgid comments from the unit. + + @rtype: String + @return: Returns the extracted msgidcomments found in this unit's msgid. + """ + + if not text: + text = unquotefrompo(self.msgidcomments) + return text.split('\n')[0].replace('_: ', '', 1) + + def setmsgidcomment(self, msgidcomment): + if msgidcomment: + self.msgidcomments = ['"_: %s\\n"' % msgidcomment] + else: + self.msgidcomments = [] + + msgidcomment = property(_extract_msgidcomments, setmsgidcomment) + + def getcontext(self): + """Get the message context.""" + return unquotefrompo(self.msgctxt) + self._extract_msgidcomments() + + def getid(self): + """Returns a unique identifier for this unit.""" + context = self.getcontext() + # Gettext does not consider the plural to determine duplicates, only + # the msgid. For generation of .mo files, we might want to use this + # code to generate the entry for the hash table, but for now, it is + # commented out for conformance to gettext. +# id = '\0'.join(self.source.strings) + id = self.source + if self.msgidcomments: + id = u"_: %s\n%s" % (context, id) + elif context: + id = u"%s\04%s" % (context, id) + return id + +class pofile(pocommon.pofile): + """A .po file containing various units""" + UnitClass = pounit + + def __init__(self, inputfile=None, encoding=None, unitclass=pounit): + """Construct a pofile, optionally reading in from inputfile. + encoding can be specified but otherwise will be read from the PO header""" + self.UnitClass = unitclass + pocommon.pofile.__init__(self, unitclass=unitclass) + self.units = [] + self.filename = '' + self._encoding = encodingToUse(encoding) + if inputfile is not None: + self.parse(inputfile) + + def changeencoding(self, newencoding): + """Deprecated: changes the encoding on the file.""" + # This should not be here but in poheader. It also shouldn't mangle the + # header itself, but use poheader methods. All users are removed, so + # we can deprecate after one release. + raise DeprecationWarning + + self._encoding = encodingToUse(newencoding) + if not self.units: + return + header = self.header() + if not header or header.isblank(): + return + charsetline = None + headerstr = unquotefrompo(header.msgstr) + for line in headerstr.split("\n"): + if not ":" in line: + continue + key, value = line.strip().split(":", 1) + if key.strip() != "Content-Type": + continue + charsetline = line + if charsetline is None: + headerstr += "Content-Type: text/plain; charset=%s" % self._encoding + else: + charset = re.search("charset=([^ ]*)", charsetline) + if charset is None: + newcharsetline = charsetline + if not newcharsetline.strip().endswith(";"): + newcharsetline += ";" + newcharsetline += " charset=%s" % self._encoding + else: + charset = charset.group(1) + newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1) + headerstr = headerstr.replace(charsetline, newcharsetline, 1) + header.msgstr = quoteforpo(headerstr) + + def parse(self, input): + """Parses the given file or file source string.""" + try: + if hasattr(input, 'name'): + self.filename = input.name + elif not getattr(self, 'filename', ''): + self.filename = '' + if isinstance(input, str): + input = cStringIO.StringIO(input) + poparser.parse_units(poparser.ParseState(input, pounit), self) + except Exception, e: + raise base.ParseError(e) + + def removeduplicates(self, duplicatestyle="merge"): + """Make sure each msgid is unique ; merge comments etc from duplicates into original""" + # TODO: can we handle consecutive calls to removeduplicates()? What + # about files already containing msgctxt? - test + id_dict = {} + uniqueunits = [] + # TODO: this is using a list as the pos aren't hashable, but this is slow. + # probably not used frequently enough to worry about it, though. + markedpos = [] + def addcomment(thepo): + thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations())) + markedpos.append(thepo) + for thepo in self.units: + id = thepo.getid() + if thepo.isheader() and not thepo.getlocations(): + # header msgids shouldn't be merged... + uniqueunits.append(thepo) + elif id in id_dict: + if duplicatestyle == "merge": + if id: + id_dict[id].merge(thepo) + else: + addcomment(thepo) + uniqueunits.append(thepo) + elif duplicatestyle == "msgctxt": + origpo = id_dict[id] + if origpo not in markedpos: + origpo.msgctxt.append('"%s"' % escapeforpo(" ".join(origpo.getlocations()))) + markedpos.append(thepo) + thepo.msgctxt.append('"%s"' % escapeforpo(" ".join(thepo.getlocations()))) + uniqueunits.append(thepo) + else: + if not id: + if duplicatestyle == "merge": + addcomment(thepo) + else: + thepo.msgctxt.append('"%s"' % escapeforpo(" ".join(thepo.getlocations()))) + id_dict[id] = thepo + uniqueunits.append(thepo) + self.units = uniqueunits + + def __str__(self): + """Convert to a string. double check that unicode is handled somehow here""" + output = self._getoutput() + if isinstance(output, unicode): + return output.encode(getattr(self, "encoding", "UTF-8")) + return output + + def _getoutput(self): + """convert the units back to lines""" + lines = [] + for unit in self.units: + unitsrc = str(unit) + "\n" + lines.append(unitsrc) + lines = "".join(self.encode(lines)).rstrip() + #After the last pounit we will have \n\n and we only want to end in \n: + if lines: + lines += "\n" + return lines + + def encode(self, lines): + """encode any unicode strings in lines in self._encoding""" + newlines = [] + encoding = self._encoding + if encoding is None or encoding.lower() == "charset": + encoding = 'UTF-8' + for line in lines: + if isinstance(line, unicode): + line = line.encode(encoding) + newlines.append(line) + return newlines + + def decode(self, lines): + """decode any non-unicode strings in lines with self._encoding""" + newlines = [] + for line in lines: + if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset": + try: + line = line.decode(self._encoding) + except UnicodeError, e: + raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line)) + newlines.append(line) + return newlines + + def unit_iter(self): + for unit in self.units: + if not (unit.isheader() or unit.isobsolete()): + yield unit |