1 files changed, 845 insertions, 0 deletions
diff --git a/translate-toolkit-1.5.1/translate/storage/pypo.py b/translate-toolkit-1.5.1/translate/storage/pypo.py
new file mode 100644
index 0000000..885b1a2
--- /dev/null
+++ b/translate-toolkit-1.5.1/translate/storage/pypo.py
@@ -0,0 +1,845 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright 2002-2009 Zuza Software Foundation
+#
+# This file is part of the Translate Toolkit.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+"""classes that hold units of .po files (pounit) or entire files (pofile)
+gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)"""
+
+from __future__ import generators
+from translate.misc.multistring import multistring
+from translate.misc import quote
+from translate.misc import textwrap
+from translate.lang import data
+from translate.storage import pocommon, base
+import re
+import copy
+import cStringIO
+import poparser
+
+lsep = "\n#: "
+"""Seperator for #: entries"""
+
+# general functions for quoting / unquoting po strings
+
+po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'}
+po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()])
+
+def escapeforpo(line):
+    """Escapes a line for po format. assumes no \n occurs in the line.
+
+    @param line: unescaped text
+    """
+    special_locations = []
+    for special_key in po_escape_map:
+        special_locations.extend(quote.find_all(line, special_key))
+    special_locations = dict.fromkeys(special_locations).keys()
+    special_locations.sort()
+    escaped_line = ""
+    last_location = 0
+    for location in special_locations:
+        escaped_line += line[last_location:location]
+        escaped_line += po_escape_map[line[location:location+1]]
+        last_location = location+1
+    escaped_line += line[last_location:]
+    return escaped_line
+
+def unescapehandler(escape):
+
+    return po_unescape_map.get(escape, escape)
+
+def wrapline(line):
+    """Wrap text for po files."""
+    wrappedlines = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False)
+
+    # Lines should not start with a space...
+    if len(wrappedlines) > 1:
+        for index, line in enumerate(wrappedlines[1:]):
+            if line.startswith(' '):
+                # Remove the space at the beginning of the line:
+                wrappedlines[index+1] = line[1:]
+
+                # Append a space to the previous line:
+                wrappedlines[index] += ' '
+    return wrappedlines
+
+def quoteforpo(text):
+    """quotes the given text for a PO file, returning quoted and escaped lines"""
+    polines = []
+    if text is None:
+        return polines
+    lines = text.split("\n")
+    if len(lines) > 1 or (len(lines) == 1 and len(lines[0]) > 71):
+        if len(lines) != 2 or lines[1]:
+            polines.extend(['""'])
+        for line in lines[:-1]:
+            #TODO: We should only wrap after escaping
+            lns = wrapline(line)
+            if len(lns) > 0:
+                for ln in lns[:-1]:
+                    polines.extend(['"' + escapeforpo(ln) + '"'])
+                if lns[-1]:
+                    polines.extend(['"' + escapeforpo(lns[-1]) + '\\n"'])
+            else:
+                polines.extend(['"\\n"'])
+    if lines[-1]:
+        polines.extend(['"' + escapeforpo(line) + '"' for line in wrapline(lines[-1])])
+    return polines
+
+def extractpoline(line):
+    """Remove quote and unescape line from po file.
+
+    @param line: a quoted line from a po file (msgid or msgstr)
+    """
+    extracted = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)[0]
+    return extracted
+
+def unquotefrompo(postr):
+    return u"".join([extractpoline(line) for line in postr])
+
+def encodingToUse(encoding):
+    """Tests whether the given encoding is known in the python runtime, or returns utf-8.
+    This function is used to ensure that a valid encoding is always used."""
+    if encoding == "CHARSET" or encoding == None:
+        return 'utf-8'
+    return encoding
+#    if encoding is None: return False
+#    return True
+#    try:
+#        tuple = codecs.lookup(encoding)
+#    except LookupError:
+#        return False
+#    return True
+
+def is_null(lst):
+    return lst == [] or len(lst) == 1 and lst[0] == '""'
+
+def extractstr(string):
+    left = string.find('"')
+    right = string.rfind('"')
+    if right > -1:
+        return string[left:right+1]
+    else:
+        return string[left:] + '"'
+
+class pounit(pocommon.pounit):
+    # othercomments = []      #   # this is another comment
+    # automaticcomments = []  #   #. comment extracted from the source code
+    # sourcecomments = []     #   #: sourcefile.xxx:35
+    # prev_msgctxt = []       #   #| The previous values that msgctxt and msgid held
+    # prev_msgid = []         # 
+    # prev_msgid_plural = []  # 
+    # typecomments = []       #   #, fuzzy
+    # msgidcomments = []      #   _: within msgid
+    # msgctxt
+    # msgid = []
+    # msgstr = []
+
+    # Our homegrown way to indicate what must be copied in a shallow
+    # fashion
+    __shallow__ = ['_store']
+
+    def __init__(self, source=None, encoding="UTF-8"):
+        self._encoding = encodingToUse(encoding)
+        self.obsolete = False
+        self._initallcomments(blankall=True)
+        self.prev_msgctxt = []
+        self.prev_msgid = []
+        self.prev_msgid_plural = []
+        self.msgctxt = []
+        self.msgid = []
+        self.msgid_pluralcomments = []
+        self.msgid_plural = []
+        self.msgstr = []
+        self.obsoletemsgctxt = []
+        self.obsoletemsgid = []
+        self.obsoletemsgid_pluralcomments = []
+        self.obsoletemsgid_plural = []
+        self.obsoletemsgstr = []
+        pocommon.pounit.__init__(self, source)
+
+    def _initallcomments(self, blankall=False):
+        """Initialises allcomments"""
+        if blankall:
+            self.othercomments = []
+            self.automaticcomments = []
+            self.sourcecomments = []
+            self.typecomments = []
+            self.msgidcomments = []
+            self.obsoletemsgidcomments = []
+
+    def _get_all_comments(self):
+        return [self.othercomments,
+                self.automaticcomments,
+                self.sourcecomments,
+                self.typecomments,
+                self.msgidcomments,
+                self.obsoletemsgidcomments]
+
+    allcomments = property(_get_all_comments)
+
+    def _get_source_vars(self, msgid, msgid_plural):
+        multi = multistring(unquotefrompo(msgid), self._encoding)
+        if self.hasplural():
+            pluralform = unquotefrompo(msgid_plural)
+            if isinstance(pluralform, str):
+                pluralform = pluralform.decode(self._encoding)
+            multi.strings.append(pluralform)
+        return multi
+
+    def _set_source_vars(self, source):
+        msgid = None
+        msgid_plural = None
+        if isinstance(source, str):
+            source = source.decode(self._encoding)
+        if isinstance(source, multistring):
+            source = source.strings
+        if isinstance(source, list):
+            msgid = quoteforpo(source[0])
+            if len(source) > 1:
+                msgid_plural = quoteforpo(source[1])
+            else:
+                msgid_plural = []
+        else:
+            msgid = quoteforpo(source)
+            msgid_plural = []
+        return msgid, msgid_plural
+
+    def getsource(self):
+        """Returns the unescaped msgid"""
+        return self._get_source_vars(self.msgid, self.msgid_plural)
+
+    def setsource(self, source):
+        """Sets the msgid to the given (unescaped) value.
+
+        @param source: an unescaped source string.
+        """
+        self.msgid, self.msgid_plural = self._set_source_vars(source)
+    source = property(getsource, setsource)
+
+    def _get_prev_source(self):
+        """Returns the unescaped msgid"""
+        return self._get_source_vars(self.prev_msgid, self.prev_msgid_plural)
+
+    def _set_prev_source(self, source):
+        """Sets the msgid to the given (unescaped) value.
+
+        @param source: an unescaped source string.
+        """
+        self.prev_msgid, self.prev_msgid_plural = self._set_source_vars(source)
+    prev_source = property(_get_prev_source, _set_prev_source)
+
+    def gettarget(self):
+        """Returns the unescaped msgstr"""
+        if isinstance(self.msgstr, dict):
+            multi = multistring(map(unquotefrompo, self.msgstr.values()), self._encoding)
+        else:
+            multi = multistring(unquotefrompo(self.msgstr), self._encoding)
+        return multi
+
+    def settarget(self, target):
+        """Sets the msgstr to the given (unescaped) value"""
+        self._rich_target = None
+        if isinstance(target, str):
+            target = target.decode(self._encoding)
+        if self.hasplural():
+            if isinstance(target, multistring):
+                target = target.strings
+            elif isinstance(target, basestring):
+                target = [target]
+        elif isinstance(target, (dict, list)):
+            if len(target) == 1:
+                target = target[0]
+            else:
+                raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
+        templates = self.msgstr
+        if isinstance(templates, list):
+            templates = {0: templates}
+        if isinstance(target, list):
+            self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))])
+        elif isinstance(target, dict):
+            self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()])
+        else:
+            self.msgstr = quoteforpo(target)
+    target = property(gettarget, settarget)
+
+    def getnotes(self, origin=None):
+        """Return comments based on origin value (programmer, developer, source code and translator)"""
+        if origin == None:
+            comments = u"".join([comment[2:] for comment in self.othercomments])
+            comments += u"".join([comment[3:] for comment in self.automaticcomments])
+        elif origin == "translator":
+            comments = u"".join ([comment[2:] for comment in self.othercomments])
+        elif origin in ["programmer", "developer", "source code"]:
+            comments = u"".join([comment[3:] for comment in self.automaticcomments])
+        else:
+            raise ValueError("Comment type not valid")
+        # Let's drop the last newline
+        return comments[:-1]
+
+    def addnote(self, text, origin=None, position="append"):
+        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote"""
+        # ignore empty strings and strings without non-space characters
+        if not (text and text.strip()):
+            return
+        text = data.forceunicode(text)
+        commentlist = self.othercomments
+        linestart = "# "
+        if origin in ["programmer", "developer", "source code"]:
+            autocomments = True
+            commentlist = self.automaticcomments
+            linestart = "#. "
+        text = text.split("\n")
+        if position == "append":
+            commentlist += [linestart + line + "\n" for line in text]
+        else:
+            newcomments = [linestart + line + "\n" for line in text]
+            newcomments += [line for line in commentlist]
+            if autocomments:
+                self.automaticcomments = newcomments
+            else:
+                self.othercomments = newcomments
+
+    def removenotes(self):
+        """Remove all the translator's notes (other comments)"""
+        self.othercomments = []
+
+    def __deepcopy__(self, memo={}):
+        # Make an instance to serve as the copy
+        new_unit = self.__class__()
+        # We'll be testing membership frequently, so make a set from
+        # self.__shallow__
+        shallow = set(self.__shallow__)
+        # Make deep copies of all members which are not in shallow
+        for key, value in self.__dict__.iteritems():
+            if key not in shallow:
+                setattr(new_unit, key, copy.deepcopy(value))
+        # Make shallow copies of all members which are in shallow
+        for key in set(shallow):
+            setattr(new_unit, key, getattr(self, key))
+        # Mark memo with ourself, so that we won't get deep copied
+        # again
+        memo[id(self)] = self
+        # Return our copied unit
+        return new_unit
+
+    def copy(self):
+        return copy.deepcopy(self)
+
+    def _msgidlen(self):
+        if self.hasplural():
+            return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip())
+        else:
+            return len(unquotefrompo(self.msgid).strip())
+
+    def _msgstrlen(self):
+        if isinstance(self.msgstr, dict):
+            combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()])
+            return len(combinedstr.strip())
+        else:
+            return len(unquotefrompo(self.msgstr).strip())
+
+    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
+        """Merges the otherpo (with the same msgid) into this one.
+
+        Overwrite non-blank self.msgstr only if overwrite is True
+        merge comments only if comments is True
+        """
+
+        def mergelists(list1, list2, split=False):
+            #decode where necessary
+            if unicode in [type(item) for item in list2] + [type(item) for item in list1]:
+                for position, item in enumerate(list1):
+                    if isinstance(item, str):
+                        list1[position] = item.decode("utf-8")
+                for position, item in enumerate(list2):
+                    if isinstance(item, str):
+                        list2[position] = item.decode("utf-8")
+
+            #Determine the newline style of list1
+            lineend = ""
+            if list1 and list1[0]:
+                for candidate in ["\n", "\r", "\n\r"]:
+                    if list1[0].endswith(candidate):
+                        lineend = candidate
+                if not lineend:
+                    lineend = ""
+            else:
+                lineend = "\n"
+
+            #Split if directed to do so:
+            if split:
+                splitlist1 = []
+                splitlist2 = []
+                prefix = "#"
+                for item in list1:
+                    splitlist1.extend(item.split()[1:])
+                    prefix = item.split()[0]
+                for item in list2:
+                    splitlist2.extend(item.split()[1:])
+                    prefix = item.split()[0]
+                list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1])
+            else:
+                #Normal merge, but conform to list1 newline style
+                if list1 != list2:
+                    for item in list2:
+                        if lineend:
+                            item = item.rstrip() + lineend
+                        # avoid duplicate comment lines (this might cause some problems)
+                        if item not in list1 or len(item) < 5:
+                            list1.append(item)
+        if not isinstance(otherpo, pounit):
+            super(pounit, self).merge(otherpo, overwrite, comments)
+            return
+        if comments:
+            mergelists(self.othercomments, otherpo.othercomments)
+            mergelists(self.typecomments, otherpo.typecomments)
+            if not authoritative:
+                # We don't bring across otherpo.automaticcomments as we consider ourself
+                # to be the the authority.  Same applies to otherpo.msgidcomments
+                mergelists(self.automaticcomments, otherpo.automaticcomments)
+                mergelists(self.msgidcomments, otherpo.msgidcomments)
+                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
+        if not self.istranslated() or overwrite:
+            # Remove kde-style comments from the translation (if any).
+            if self._extract_msgidcomments(otherpo.target):
+                otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments()+ '\n', '')
+            self.target = otherpo.target
+            if self.source != otherpo.source or self.getcontext() != otherpo.getcontext():
+                self.markfuzzy()
+            else:
+                self.markfuzzy(otherpo.isfuzzy())
+        elif not otherpo.istranslated():
+            if self.source != otherpo.source:
+                self.markfuzzy()
+        else:
+            if self.target != otherpo.target:
+                self.markfuzzy()
+
+    def isheader(self):
+        #return (self._msgidlen() == 0) and (self._msgstrlen() > 0) and (len(self.msgidcomments) == 0)
+        #rewritten here for performance:
+        return (is_null(self.msgid)
+                        and not is_null(self.msgstr)
+                        and self.msgidcomments == []
+                        and is_null(self.msgctxt)
+        )
+
+    def isblank(self):
+        if self.isheader() or len(self.msgidcomments):
+            return False
+        if (self._msgidlen() == 0) and (self._msgstrlen() == 0) and (is_null(self.msgctxt)):
+            return True
+        return False
+        # TODO: remove:
+        # Before, the equivalent of the following was the final return statement:
+        # return len(self.source.strip()) == 0
+
+    def hastypecomment(self, typecomment):
+        """Check whether the given type comment is present"""
+        # check for word boundaries properly by using a regular expression...
+        return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0
+
+    def hasmarkedcomment(self, commentmarker):
+        """Check whether the given comment marker is present as # (commentmarker) ..."""
+        commentmarker = "(%s)" % commentmarker
+        for comment in self.othercomments:
+            if comment.replace("#", "", 1).strip().startswith(commentmarker):
+                return True
+        return False
+
+    def settypecomment(self, typecomment, present=True):
+        """Alters whether a given typecomment is present"""
+        if self.hastypecomment(typecomment) != present:
+            if present:
+                self.typecomments.append("#, %s\n" % typecomment)
+            else:
+                # this should handle word boundaries properly ...
+                typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
+                self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)
+
+    def isfuzzy(self):
+        return self.hastypecomment("fuzzy")
+
+    def markfuzzy(self, present=True):
+        self.settypecomment("fuzzy", present)
+
+    def isobsolete(self):
+        return self.obsolete
+
+    def makeobsolete(self):
+        """Makes this unit obsolete"""
+        self.obsolete = True
+        if self.msgctxt:
+            self.obsoletemsgctxt = self.msgctxt
+        if self.msgid:
+            self.obsoletemsgid = self.msgid
+            self.msgid = []
+        if self.msgidcomments:
+            self.obsoletemsgidcomments = self.msgidcomments
+            self.msgidcomments = []
+        if self.msgid_plural:
+            self.obsoletemsgid_plural = self.msgid_plural
+            self.msgid_plural = []
+        if self.msgstr:
+            self.obsoletemsgstr = self.msgstr
+            self.msgstr = []
+        self.sourcecomments = []
+        self.automaticcomments = []
+
+    def resurrect(self):
+        """Makes an obsolete unit normal"""
+        self.obsolete = False
+        if self.obsoletemsgctxt:
+            self.msgid = self.obsoletemsgctxt
+            self.obsoletemsgctxt = []
+        if self.obsoletemsgid:
+            self.msgid = self.obsoletemsgid
+            self.obsoletemsgid = []
+        if self.obsoletemsgidcomments:
+            self.msgidcomments = self.obsoletemsgidcomments
+            self.obsoletemsgidcomments = []
+        if self.obsoletemsgid_plural:
+            self.msgid_plural = self.obsoletemsgid_plural
+            self.obsoletemsgid_plural = []
+        if self.obsoletemsgstr:
+            self.msgstr = self.obsoletemsgstr
+            self.obsoletemgstr = []
+
+    def hasplural(self):
+        """returns whether this pounit contains plural strings..."""
+        return len(self.msgid_plural) > 0
+
+    def parse(self, src):
+        return poparser.parse_unit(poparser.ParseState(cStringIO.StringIO(src), pounit), self)
+
+    def _getmsgpartstr(self, partname, partlines, partcomments=""):
+        if isinstance(partlines, dict):
+            partkeys = partlines.keys()
+            partkeys.sort()
+            return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
+        partstr = partname + " "
+        partstartline = 0
+        if len(partlines) > 0 and len(partcomments) == 0:
+            partstr += partlines[0]
+            partstartline = 1
+        elif len(partcomments) > 0:
+            if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
+                # if there is a blank leader line, it must come before the comment
+                partstr += partlines[0] + '\n'
+                # but if the whole string is blank, leave it in
+                if len(partlines) > 1:
+                    partstartline += 1
+            else:
+                # All partcomments should start on a newline
+                partstr += '""\n'
+            # combine comments into one if more than one
+            if len(partcomments) > 1:
+                combinedcomment = []
+                for comment in partcomments:
+                    comment = unquotefrompo([comment])
+                    if comment.startswith("_:"):
+                        comment = comment[len("_:"):]
+                    if comment.endswith("\\n"):
+                        comment = comment[:-len("\\n")]
+                    #Before we used to strip. Necessary in some cases?
+                    combinedcomment.append(comment)
+                partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
+            # comments first, no blank leader line needed
+            partstr += "\n".join(partcomments)
+            partstr = quote.rstripeol(partstr)
+        else:
+            partstr += '""'
+        partstr += '\n'
+        # add the rest
+        for partline in partlines[partstartline:]:
+            partstr += partline + '\n'
+        return partstr
+
+    def _encodeifneccessary(self, output):
+        """encodes unicode strings and returns other strings unchanged"""
+        if isinstance(output, unicode):
+            encoding = encodingToUse(getattr(self, "encoding", "UTF-8"))
+            return output.encode(encoding)
+        return output
+
+    def __str__(self):
+        """convert to a string. double check that unicode is handled somehow here"""
+        output = self._getoutput()
+        return self._encodeifneccessary(output)
+
+    def _getoutput(self):
+        """return this po element as a string"""
+        def add_prev_msgid_lines(lines, header, var):
+            if len(var) > 0:
+                lines.append("#| %s %s\n" % (header, var[0]))
+                lines.extend("#| %s\n" % line for line in var[1:])
+
+        def add_prev_msgid_info(lines):
+            add_prev_msgid_lines(lines, 'msgctxt', self.prev_msgctxt)
+            add_prev_msgid_lines(lines, 'msgid', self.prev_msgid)
+            add_prev_msgid_lines(lines, 'msgid_plural', self.prev_msgid_plural)
+
+        lines = []
+        lines.extend(self.othercomments)
+        if self.isobsolete():
+            lines.extend(self.typecomments)
+            obsoletelines = []
+            if self.obsoletemsgctxt:
+                obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
+            obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
+            if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
+                obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
+            obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
+            for index, obsoleteline in enumerate(obsoletelines):
+                # We need to account for a multiline msgid or msgstr here
+                obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
+            lines.extend(obsoletelines)
+            lines = [self._encodeifneccessary(line) for line in lines]
+            return "".join(lines)
+        # if there's no msgid don't do msgid and string, unless we're the header
+        # this will also discard any comments other than plain othercomments...
+        if is_null(self.msgid):
+            if not (self.isheader() or self.getcontext() or self.sourcecomments):
+                return "".join(lines)
+        lines.extend(self.automaticcomments)
+        lines.extend(self.sourcecomments)
+        lines.extend(self.typecomments)
+        add_prev_msgid_info(lines)
+        if self.msgctxt:
+            lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
+        lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
+        if self.msgid_plural or self.msgid_pluralcomments:
+            lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
+        lines.append(self._getmsgpartstr("msgstr", self.msgstr))
+        lines = [self._encodeifneccessary(line) for line in lines]
+        postr = "".join(lines)
+        return postr
+
+    def getlocations(self):
+        """Get a list of locations from sourcecomments in the PO unit
+
+        rtype: List
+        return: A list of the locations with '#: ' stripped
+
+        """
+        locations = []
+        for sourcecomment in self.sourcecomments:
+            locations += quote.rstripeol(sourcecomment)[3:].split()
+        return locations
+
+    def addlocation(self, location):
+        """Add a location to sourcecomments in the PO unit
+
+        @param location: Text location e.g. 'file.c:23' does not include #:
+        @type location: String
+
+        """
+        self.sourcecomments.append("#: %s\n" % location)
+
+    def _extract_msgidcomments(self, text=None):
+        """Extract KDE style msgid comments from the unit.
+
+        @rtype: String
+        @return: Returns the extracted msgidcomments found in this unit's msgid.
+        """
+
+        if not text:
+            text = unquotefrompo(self.msgidcomments)
+        return text.split('\n')[0].replace('_: ', '', 1)
+
+    def setmsgidcomment(self, msgidcomment):
+        if msgidcomment:
+            self.msgidcomments = ['"_: %s\\n"' % msgidcomment]
+        else:
+            self.msgidcomments = []
+
+    msgidcomment = property(_extract_msgidcomments, setmsgidcomment)
+    
+    def getcontext(self):
+        """Get the message context."""
+        return unquotefrompo(self.msgctxt) + self._extract_msgidcomments()
+
+    def getid(self):
+        """Returns a unique identifier for this unit."""
+        context = self.getcontext()
+        # Gettext does not consider the plural to determine duplicates, only 
+        # the msgid. For generation of .mo files, we might want to use this
+        # code to generate the entry for the hash table, but for now, it is 
+        # commented out for conformance to gettext.
+#        id = '\0'.join(self.source.strings)
+        id = self.source
+        if self.msgidcomments:
+            id = u"_: %s\n%s" % (context, id)
+        elif context:
+            id = u"%s\04%s" % (context, id)
+        return id
+
+class pofile(pocommon.pofile):
+    """A .po file containing various units"""
+    UnitClass = pounit
+
+    def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
+        """Construct a pofile, optionally reading in from inputfile.
+        encoding can be specified but otherwise will be read from the PO header"""
+        self.UnitClass = unitclass
+        pocommon.pofile.__init__(self, unitclass=unitclass)
+        self.units = []
+        self.filename = ''
+        self._encoding = encodingToUse(encoding)
+        if inputfile is not None:
+            self.parse(inputfile)
+
+    def changeencoding(self, newencoding):
+        """Deprecated: changes the encoding on the file."""
+        # This should not be here but in poheader. It also shouldn't mangle the
+        # header itself, but use poheader methods. All users are removed, so
+        # we can deprecate after one release.
+        raise DeprecationWarning
+
+        self._encoding = encodingToUse(newencoding)
+        if not self.units:
+            return
+        header = self.header()
+        if not header or header.isblank():
+            return
+        charsetline = None
+        headerstr = unquotefrompo(header.msgstr)
+        for line in headerstr.split("\n"):
+            if not ":" in line:
+                continue
+            key, value = line.strip().split(":", 1)
+            if key.strip() != "Content-Type":
+                continue
+            charsetline = line
+        if charsetline is None:
+            headerstr += "Content-Type: text/plain; charset=%s" % self._encoding
+        else:
+            charset = re.search("charset=([^ ]*)", charsetline)
+            if charset is None:
+                newcharsetline = charsetline
+                if not newcharsetline.strip().endswith(";"):
+                    newcharsetline += ";"
+                newcharsetline += " charset=%s" % self._encoding
+            else:
+                charset = charset.group(1)
+                newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1)
+            headerstr = headerstr.replace(charsetline, newcharsetline, 1)
+        header.msgstr = quoteforpo(headerstr)
+
+    def parse(self, input):
+        """Parses the given file or file source string."""
+        try:
+            if hasattr(input, 'name'):
+                self.filename = input.name
+            elif not getattr(self, 'filename', ''):
+                self.filename = ''
+            if isinstance(input, str):
+                input = cStringIO.StringIO(input)
+            poparser.parse_units(poparser.ParseState(input, pounit), self)
+        except Exception, e:
+            raise base.ParseError(e)
+
+    def removeduplicates(self, duplicatestyle="merge"):
+        """Make sure each msgid is unique ; merge comments etc from duplicates into original"""
+        # TODO: can we handle consecutive calls to removeduplicates()? What
+        # about files already containing msgctxt? - test
+        id_dict = {}
+        uniqueunits = []
+        # TODO: this is using a list as the pos aren't hashable, but this is slow.
+        # probably not used frequently enough to worry about it, though.
+        markedpos = []
+        def addcomment(thepo):
+            thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations()))
+            markedpos.append(thepo)
+        for thepo in self.units:
+            id = thepo.getid()
+            if thepo.isheader() and not thepo.getlocations():
+                # header msgids shouldn't be merged...
+                uniqueunits.append(thepo)
+            elif id in id_dict:
+                if duplicatestyle == "merge":
+                    if id:
+                        id_dict[id].merge(thepo)
+                    else:
+                        addcomment(thepo)
+                        uniqueunits.append(thepo)
+                elif duplicatestyle == "msgctxt":
+                    origpo = id_dict[id]
+                    if origpo not in markedpos:
+                        origpo.msgctxt.append('"%s"' % escapeforpo(" ".join(origpo.getlocations())))
+                        markedpos.append(thepo)
+                    thepo.msgctxt.append('"%s"' % escapeforpo(" ".join(thepo.getlocations())))
+                    uniqueunits.append(thepo)
+            else:
+                if not id:
+                    if duplicatestyle == "merge":
+                        addcomment(thepo)
+                    else:
+                        thepo.msgctxt.append('"%s"' % escapeforpo(" ".join(thepo.getlocations())))
+                id_dict[id] = thepo
+                uniqueunits.append(thepo)
+        self.units = uniqueunits
+
+    def __str__(self):
+        """Convert to a string. double check that unicode is handled somehow here"""
+        output = self._getoutput()
+        if isinstance(output, unicode):
+            return output.encode(getattr(self, "encoding", "UTF-8"))
+        return output
+
+    def _getoutput(self):
+        """convert the units back to lines"""
+        lines = []
+        for unit in self.units:
+            unitsrc = str(unit) + "\n"
+            lines.append(unitsrc)
+        lines = "".join(self.encode(lines)).rstrip()
+        #After the last pounit we will have \n\n and we only want to end in \n:
+        if lines:
+            lines += "\n"
+        return lines
+
+    def encode(self, lines):
+        """encode any unicode strings in lines in self._encoding"""
+        newlines = []
+        encoding = self._encoding
+        if encoding is None or encoding.lower() == "charset":
+            encoding = 'UTF-8'
+        for line in lines:
+            if isinstance(line, unicode):
+                line = line.encode(encoding)
+            newlines.append(line)
+        return newlines
+
+    def decode(self, lines):
+        """decode any non-unicode strings in lines with self._encoding"""
+        newlines = []
+        for line in lines:
+            if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset":
+                try:
+                    line = line.decode(self._encoding)
+                except UnicodeError, e:
+                    raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line))
+            newlines.append(line)
+        return newlines
+
+    def unit_iter(self):
+        for unit in self.units:
+            if not (unit.isheader() or unit.isobsolete()):
+                yield unit