#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Classes that hold units of .po files (pounit) or entire files (pofile).

Gettext-style .po (or .pot) files are used in translations for KDE et al
(see kbabel)."""

from __future__ import generators
from translate.misc.multistring import multistring
from translate.misc import quote
from translate.misc import textwrap
from translate.lang import data
from translate.storage import pocommon, base
import re
import copy
import cStringIO
import poparser

lsep = "\n#: "
"""Separator for #: entries"""

# general functions for quoting / unquoting po strings

po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'}
po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()])

def escapeforpo(line):
    """Escapes a line for po format. Assumes no \n occurs in the line.

    @param line: unescaped text
    """
    special_locations = []
    for special_key in po_escape_map:
        special_locations.extend(quote.find_all(line, special_key))
    special_locations = dict.fromkeys(special_locations).keys()
    special_locations.sort()
    escaped_line = ""
    last_location = 0
    for location in special_locations:
        escaped_line += line[last_location:location]
        escaped_line += po_escape_map[line[location:location+1]]
        last_location = location + 1
    escaped_line += line[last_location:]
    return escaped_line

def unescapehandler(escape):
    return po_unescape_map.get(escape, escape)

def wrapline(line):
    """Wrap text for po files."""
    wrappedlines = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False)
    # Lines should not start with a space...
    if len(wrappedlines) > 1:
        for index, line in enumerate(wrappedlines[1:]):
            if line.startswith(' '):
                # Remove the space at the beginning of the line:
                wrappedlines[index+1] = line[1:]
                # Append a space to the previous line:
                wrappedlines[index] += ' '
    return wrappedlines

def quoteforpo(text):
    """quotes the given text for a PO file, returning quoted and escaped lines"""
    polines = []
    if text is None:
        return polines
    lines = text.split("\n")
    if len(lines) > 1 or (len(lines) == 1 and len(lines[0]) > 71):
        if len(lines) != 2 or lines[1]:
            polines.extend(['""'])
        for line in lines[:-1]:
            #TODO: We should only wrap after escaping
            lns = wrapline(line)
            if len(lns) > 0:
                for ln in lns[:-1]:
                    polines.extend(['"' + escapeforpo(ln) + '"'])
                if lns[-1]:
                    polines.extend(['"' + escapeforpo(lns[-1]) + '\\n"'])
                else:
                    polines.extend(['"\\n"'])
    if lines[-1]:
        polines.extend(['"' + escapeforpo(line) + '"' for line in wrapline(lines[-1])])
    return polines
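
# Illustrative sketch of the quoting round trip (assumed behaviour, shown as
# a comment so nothing executes at import time): quoteforpo() wraps and
# escapes a translation into the quoted lines stored on a unit, and
# unquotefrompo() (defined below) reverses it, e.g.
#   quoteforpo(u"Hello\nWorld")                    -> ['""', '"Hello\\n"', '"World"']
#   unquotefrompo(['""', '"Hello\\n"', '"World"']) -> u"Hello\nWorld"
# The exact wrapping of long lines depends on wrapline() above.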

def extractpoline(line):
    """Remove quote and unescape line from po file.

    @param line: a quoted line from a po file (msgid or msgstr)
    """
    extracted = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)[0]
    return extracted

def unquotefrompo(postr):
    return u"".join([extractpoline(line) for line in postr])

def encodingToUse(encoding):
    """Tests whether the given encoding is known in the python runtime, or returns utf-8.
    This function is used to ensure that a valid encoding is always used."""
    if encoding == "CHARSET" or encoding == None:
        return 'utf-8'
    return encoding
#    if encoding is None: return False
#    return True
#    try:
#        tuple = codecs.lookup(encoding)
#    except LookupError:
#        return False
#    return True

def is_null(lst):
    return lst == [] or len(lst) == 1 and lst[0] == '""'

def extractstr(string):
    left = string.find('"')
    right = string.rfind('"')
    if right > -1:
        return string[left:right+1]
    else:
        return string[left:] + '"'

class pounit(pocommon.pounit):
    # othercomments = []      #  this is another comment
    # automaticcomments = []  #. comment extracted from the source code
    # sourcecomments = []     #: sourcefile.xxx:35
    # prev_msgctxt = []       #| The previous values that msgctxt and msgid held
    # prev_msgid = []         #|
    # prev_msgid_plural = []  #|
    # typecomments = []       #, fuzzy
    # msgidcomments = []      # _: within msgid
    # msgctxt
    # msgid = []
    # msgstr = []

    # Our homegrown way to indicate what must be copied in a shallow
    # fashion
    __shallow__ = ['_store']

    def __init__(self, source=None, encoding="UTF-8"):
        self._encoding = encodingToUse(encoding)
        self.obsolete = False
        self._initallcomments(blankall=True)
        self.prev_msgctxt = []
        self.prev_msgid = []
        self.prev_msgid_plural = []
        self.msgctxt = []
        self.msgid = []
        self.msgid_pluralcomments = []
        self.msgid_plural = []
        self.msgstr = []
        self.obsoletemsgctxt = []
        self.obsoletemsgid = []
        self.obsoletemsgid_pluralcomments = []
        self.obsoletemsgid_plural = []
        self.obsoletemsgstr = []
        pocommon.pounit.__init__(self, source)

    def _initallcomments(self, blankall=False):
        """Initialises allcomments"""
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomments = []
            self.obsoletemsgidcomments = []

    def _get_all_comments(self):
        return [self.othercomments, self.automaticcomments, self.sourcecomments, self.typecomments, self.msgidcomments, self.obsoletemsgidcomments]

    allcomments = property(_get_all_comments)

    def _get_source_vars(self, msgid, msgid_plural):
        multi = multistring(unquotefrompo(msgid), self._encoding)
        if self.hasplural():
            pluralform = unquotefrompo(msgid_plural)
            if isinstance(pluralform, str):
                pluralform = pluralform.decode(self._encoding)
            multi.strings.append(pluralform)
        return multi

    def _set_source_vars(self, source):
        msgid = None
        msgid_plural = None
        if isinstance(source, str):
            source = source.decode(self._encoding)
        if isinstance(source, multistring):
            source = source.strings
        if isinstance(source, list):
            msgid = quoteforpo(source[0])
            if len(source) > 1:
                msgid_plural = quoteforpo(source[1])
            else:
                msgid_plural = []
        else:
            msgid = quoteforpo(source)
            msgid_plural = []
        return msgid, msgid_plural

    def getsource(self):
        """Returns the unescaped msgid"""
        return self._get_source_vars(self.msgid, self.msgid_plural)

    def setsource(self, source):
        """Sets the msgid to the given (unescaped) value.

        @param source: an unescaped source string.
        """
        self.msgid, self.msgid_plural = self._set_source_vars(source)

    source = property(getsource, setsource)
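
    # Illustrative sketch of the source property (assumed behaviour, not
    # executed): unit.source = u"%d file" stores the quoted form
    # unit.msgid == ['"%d file"'], while a two-element list such as
    # [u"%d file", u"%d files"] also fills unit.msgid_plural; reading
    # unit.source returns the unescaped multistring again.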
""" self.msgid, self.msgid_plural = self._set_source_vars(source) source = property(getsource, setsource) def _get_prev_source(self): """Returns the unescaped msgid""" return self._get_source_vars(self.prev_msgid, self.prev_msgid_plural) def _set_prev_source(self, source): """Sets the msgid to the given (unescaped) value. @param source: an unescaped source string. """ self.prev_msgid, self.prev_msgid_plural = self._set_source_vars(source) prev_source = property(_get_prev_source, _set_prev_source) def gettarget(self): """Returns the unescaped msgstr""" if isinstance(self.msgstr, dict): multi = multistring(map(unquotefrompo, self.msgstr.values()), self._encoding) else: multi = multistring(unquotefrompo(self.msgstr), self._encoding) return multi def settarget(self, target): """Sets the msgstr to the given (unescaped) value""" self._rich_target = None if isinstance(target, str): target = target.decode(self._encoding) if self.hasplural(): if isinstance(target, multistring): target = target.strings elif isinstance(target, basestring): target = [target] elif isinstance(target, (dict, list)): if len(target) == 1: target = target[0] else: raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target)) templates = self.msgstr if isinstance(templates, list): templates = {0: templates} if isinstance(target, list): self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))]) elif isinstance(target, dict): self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()]) else: self.msgstr = quoteforpo(target) target = property(gettarget, settarget) def getnotes(self, origin=None): """Return comments based on origin value (programmer, developer, source code and translator)""" if origin == None: comments = u"".join([comment[2:] for comment in self.othercomments]) comments += u"".join([comment[3:] for comment in self.automaticcomments]) elif origin == "translator": comments = u"".join ([comment[2:] for comment in self.othercomments]) elif origin in ["programmer", "developer", "source code"]: comments = u"".join([comment[3:] for comment in self.automaticcomments]) else: raise ValueError("Comment type not valid") # Let's drop the last newline return comments[:-1] def addnote(self, text, origin=None, position="append"): """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote""" # ignore empty strings and strings without non-space characters if not (text and text.strip()): return text = data.forceunicode(text) commentlist = self.othercomments linestart = "# " autocomments = False if origin in ["programmer", "developer", "source code"]: autocomments = True commentlist = self.automaticcomments linestart = "#. 
" text = text.split("\n") if position == "append": commentlist += [linestart + line + "\n" for line in text] else: newcomments = [linestart + line + "\n" for line in text] newcomments += [line for line in commentlist] if autocomments: self.automaticcomments = newcomments else: self.othercomments = newcomments def removenotes(self): """Remove all the translator's notes (other comments)""" self.othercomments = [] def __deepcopy__(self, memo={}): # Make an instance to serve as the copy new_unit = self.__class__() # We'll be testing membership frequently, so make a set from # self.__shallow__ shallow = set(self.__shallow__) # Make deep copies of all members which are not in shallow for key, value in self.__dict__.iteritems(): if key not in shallow: setattr(new_unit, key, copy.deepcopy(value)) # Make shallow copies of all members which are in shallow for key in set(shallow): setattr(new_unit, key, getattr(self, key)) # Mark memo with ourself, so that we won't get deep copied # again memo[id(self)] = self # Return our copied unit return new_unit def copy(self): return copy.deepcopy(self) def _msgidlen(self): if self.hasplural(): return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip()) else: return len(unquotefrompo(self.msgid).strip()) def _msgstrlen(self): if isinstance(self.msgstr, dict): combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()]) return len(combinedstr.strip()) else: return len(unquotefrompo(self.msgstr).strip()) def merge(self, otherpo, overwrite=False, comments=True, authoritative=False): """Merges the otherpo (with the same msgid) into this one. Overwrite non-blank self.msgstr only if overwrite is True merge comments only if comments is True """ def mergelists(list1, list2, split=False): #decode where necessary if unicode in [type(item) for item in list2] + [type(item) for item in list1]: for position, item in enumerate(list1): if isinstance(item, str): list1[position] = item.decode("utf-8") for position, item in enumerate(list2): if isinstance(item, str): list2[position] = item.decode("utf-8") #Determine the newline style of list1 lineend = "" if list1 and list1[0]: for candidate in ["\n", "\r", "\n\r"]: if list1[0].endswith(candidate): lineend = candidate if not lineend: lineend = "" else: lineend = "\n" #Split if directed to do so: if split: splitlist1 = [] splitlist2 = [] prefix = "#" for item in list1: splitlist1.extend(item.split()[1:]) prefix = item.split()[0] for item in list2: splitlist2.extend(item.split()[1:]) prefix = item.split()[0] list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1]) else: #Normal merge, but conform to list1 newline style if list1 != list2: for item in list2: if lineend: item = item.rstrip() + lineend # avoid duplicate comment lines (this might cause some problems) if item not in list1 or len(item) < 5: list1.append(item) if not isinstance(otherpo, pounit): super(pounit, self).merge(otherpo, overwrite, comments) return if comments: mergelists(self.othercomments, otherpo.othercomments) mergelists(self.typecomments, otherpo.typecomments) if not authoritative: # We don't bring across otherpo.automaticcomments as we consider ourself # to be the the authority. 

    def isheader(self):
        #return (self._msgidlen() == 0) and (self._msgstrlen() > 0) and (len(self.msgidcomments) == 0)
        #rewritten here for performance:
        return (is_null(self.msgid)
                and not is_null(self.msgstr)
                and self.msgidcomments == []
                and is_null(self.msgctxt))

    def isblank(self):
        if self.isheader() or len(self.msgidcomments):
            return False
        if (self._msgidlen() == 0) and (self._msgstrlen() == 0) and (is_null(self.msgctxt)):
            return True
        return False
        # TODO: remove:
        # Before, the equivalent of the following was the final return statement:
        # return len(self.source.strip()) == 0

    def hastypecomment(self, typecomment):
        """Check whether the given type comment is present"""
        # check for word boundaries properly by using a regular expression...
        return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0

    def hasmarkedcomment(self, commentmarker):
        """Check whether the given comment marker is present as # (commentmarker) ..."""
        commentmarker = "(%s)" % commentmarker
        for comment in self.othercomments:
            if comment.replace("#", "", 1).strip().startswith(commentmarker):
                return True
        return False

    def settypecomment(self, typecomment, present=True):
        """Alters whether a given typecomment is present"""
        if self.hastypecomment(typecomment) != present:
            if present:
                self.typecomments.append("#, %s\n" % typecomment)
            else:
                # this should handle word boundaries properly ...
                typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
                self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)

    def isfuzzy(self):
        return self.hastypecomment("fuzzy")

    def markfuzzy(self, present=True):
        self.settypecomment("fuzzy", present)
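
    # Illustrative sketch (assumed behaviour): markfuzzy() appends a
    # "#, fuzzy\n" type comment so isfuzzy() becomes True, and
    # markfuzzy(False) strips the flag again through settypecomment().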

    def isobsolete(self):
        return self.obsolete

    def makeobsolete(self):
        """Makes this unit obsolete"""
        self.obsolete = True
        if self.msgctxt:
            self.obsoletemsgctxt = self.msgctxt
        if self.msgid:
            self.obsoletemsgid = self.msgid
            self.msgid = []
        if self.msgidcomments:
            self.obsoletemsgidcomments = self.msgidcomments
            self.msgidcomments = []
        if self.msgid_plural:
            self.obsoletemsgid_plural = self.msgid_plural
            self.msgid_plural = []
        if self.msgstr:
            self.obsoletemsgstr = self.msgstr
            self.msgstr = []
        self.sourcecomments = []
        self.automaticcomments = []

    def resurrect(self):
        """Makes an obsolete unit normal"""
        self.obsolete = False
        if self.obsoletemsgctxt:
            self.msgctxt = self.obsoletemsgctxt
            self.obsoletemsgctxt = []
        if self.obsoletemsgid:
            self.msgid = self.obsoletemsgid
            self.obsoletemsgid = []
        if self.obsoletemsgidcomments:
            self.msgidcomments = self.obsoletemsgidcomments
            self.obsoletemsgidcomments = []
        if self.obsoletemsgid_plural:
            self.msgid_plural = self.obsoletemsgid_plural
            self.obsoletemsgid_plural = []
        if self.obsoletemsgstr:
            self.msgstr = self.obsoletemsgstr
            self.obsoletemsgstr = []

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        return len(self.msgid_plural) > 0

    def parse(self, src):
        return poparser.parse_unit(poparser.ParseState(cStringIO.StringIO(src), pounit), self)

    def _getmsgpartstr(self, partname, partlines, partcomments=""):
        if isinstance(partlines, dict):
            partkeys = partlines.keys()
            partkeys.sort()
            return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
        partstr = partname + " "
        partstartline = 0
        if len(partlines) > 0 and len(partcomments) == 0:
            partstr += partlines[0]
            partstartline = 1
        elif len(partcomments) > 0:
            if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
                # if there is a blank leader line, it must come before the comment
                partstr += partlines[0] + '\n'
                # but if the whole string is blank, leave it in
                if len(partlines) > 1:
                    partstartline += 1
            else:
                # All partcomments should start on a newline
                partstr += '""\n'
            # combine comments into one if more than one
            if len(partcomments) > 1:
                combinedcomment = []
                for comment in partcomments:
                    comment = unquotefrompo([comment])
                    if comment.startswith("_:"):
                        comment = comment[len("_:"):]
                    if comment.endswith("\\n"):
                        comment = comment[:-len("\\n")]
                    #Before we used to strip. Necessary in some cases?
                    combinedcomment.append(comment)
                partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
            # comments first, no blank leader line needed
            partstr += "\n".join(partcomments)
            partstr = quote.rstripeol(partstr)
        else:
            partstr += '""'
        partstr += '\n'
        # add the rest
        for partline in partlines[partstartline:]:
            partstr += partline + '\n'
        return partstr

    def _encodeifneccessary(self, output):
        """encodes unicode strings and returns other strings unchanged"""
        if isinstance(output, unicode):
            encoding = encodingToUse(getattr(self, "encoding", "UTF-8"))
            return output.encode(encoding)
        return output

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        return self._encodeifneccessary(output)
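
    # Illustrative sketch of the serialised form (assumed output): a unit
    # with source u"Hello" and target u"Bonjour" renders roughly as
    #   msgid "Hello"
    #   msgstr "Bonjour"
    # with comments, msgctxt and plural forms emitted as extra lines when
    # present.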

    def _getoutput(self):
        """return this po element as a string"""

        def add_prev_msgid_lines(lines, header, var):
            if len(var) > 0:
                lines.append("#| %s %s\n" % (header, var[0]))
                lines.extend("#| %s\n" % line for line in var[1:])

        def add_prev_msgid_info(lines):
            add_prev_msgid_lines(lines, 'msgctxt', self.prev_msgctxt)
            add_prev_msgid_lines(lines, 'msgid', self.prev_msgid)
            add_prev_msgid_lines(lines, 'msgid_plural', self.prev_msgid_plural)

        lines = []
        lines.extend(self.othercomments)
        if self.isobsolete():
            lines.extend(self.typecomments)
            obsoletelines = []
            if self.obsoletemsgctxt:
                obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
            obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
            if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
                obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
            obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
            for index, obsoleteline in enumerate(obsoletelines):
                # We need to account for a multiline msgid or msgstr here
                obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
            lines.extend(obsoletelines)
            lines = [self._encodeifneccessary(line) for line in lines]
            return "".join(lines)
        # if there's no msgid don't do msgid and string, unless we're the header
        # this will also discard any comments other than plain othercomments...
        if is_null(self.msgid):
            if not (self.isheader() or self.getcontext() or self.sourcecomments):
                return "".join(lines)
        lines.extend(self.automaticcomments)
        lines.extend(self.sourcecomments)
        lines.extend(self.typecomments)
        add_prev_msgid_info(lines)
        if self.msgctxt:
            lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
        lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
        if self.msgid_plural or self.msgid_pluralcomments:
            lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
        lines.append(self._getmsgpartstr("msgstr", self.msgstr))
        lines = [self._encodeifneccessary(line) for line in lines]
        postr = "".join(lines)
        return postr

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit

        @rtype: List
        @return: A list of the locations with '#: ' stripped
        """
        locations = []
        for sourcecomment in self.sourcecomments:
            locations += quote.rstripeol(sourcecomment)[3:].split()
        return locations

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit

        @param location: Text location e.g. 'file.c:23' does not include #:
        @type location: String
        """
        self.sourcecomments.append("#: %s\n" % location)
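
    # Illustrative sketch (assumed behaviour): addlocation("main.c:42")
    # appends "#: main.c:42\n" to sourcecomments, and getlocations() then
    # returns ["main.c:42"].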
""" if not text: text = unquotefrompo(self.msgidcomments) return text.split('\n')[0].replace('_: ', '', 1) def setmsgidcomment(self, msgidcomment): if msgidcomment: self.msgidcomments = ['"_: %s\\n"' % msgidcomment] else: self.msgidcomments = [] msgidcomment = property(_extract_msgidcomments, setmsgidcomment) def getcontext(self): """Get the message context.""" return unquotefrompo(self.msgctxt) + self._extract_msgidcomments() def getid(self): """Returns a unique identifier for this unit.""" context = self.getcontext() # Gettext does not consider the plural to determine duplicates, only # the msgid. For generation of .mo files, we might want to use this # code to generate the entry for the hash table, but for now, it is # commented out for conformance to gettext. # id = '\0'.join(self.source.strings) id = self.source if self.msgidcomments: id = u"_: %s\n%s" % (context, id) elif context: id = u"%s\04%s" % (context, id) return id class pofile(pocommon.pofile): """A .po file containing various units""" UnitClass = pounit def __init__(self, inputfile=None, encoding=None, unitclass=pounit): """Construct a pofile, optionally reading in from inputfile. encoding can be specified but otherwise will be read from the PO header""" self.UnitClass = unitclass pocommon.pofile.__init__(self, unitclass=unitclass) self.units = [] self.filename = '' self._encoding = encodingToUse(encoding) if inputfile is not None: self.parse(inputfile) def changeencoding(self, newencoding): """Deprecated: changes the encoding on the file.""" # This should not be here but in poheader. It also shouldn't mangle the # header itself, but use poheader methods. All users are removed, so # we can deprecate after one release. raise DeprecationWarning self._encoding = encodingToUse(newencoding) if not self.units: return header = self.header() if not header or header.isblank(): return charsetline = None headerstr = unquotefrompo(header.msgstr) for line in headerstr.split("\n"): if not ":" in line: continue key, value = line.strip().split(":", 1) if key.strip() != "Content-Type": continue charsetline = line if charsetline is None: headerstr += "Content-Type: text/plain; charset=%s" % self._encoding else: charset = re.search("charset=([^ ]*)", charsetline) if charset is None: newcharsetline = charsetline if not newcharsetline.strip().endswith(";"): newcharsetline += ";" newcharsetline += " charset=%s" % self._encoding else: charset = charset.group(1) newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1) headerstr = headerstr.replace(charsetline, newcharsetline, 1) header.msgstr = quoteforpo(headerstr) def parse(self, input): """Parses the given file or file source string.""" try: if hasattr(input, 'name'): self.filename = input.name elif not getattr(self, 'filename', ''): self.filename = '' if isinstance(input, str): input = cStringIO.StringIO(input) poparser.parse_units(poparser.ParseState(input, pounit), self) except Exception, e: raise base.ParseError(e) def removeduplicates(self, duplicatestyle="merge"): """Make sure each msgid is unique ; merge comments etc from duplicates into original""" # TODO: can we handle consecutive calls to removeduplicates()? What # about files already containing msgctxt? - test id_dict = {} uniqueunits = [] # TODO: this is using a list as the pos aren't hashable, but this is slow. # probably not used frequently enough to worry about it, though. 

    def __str__(self):
        """Convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        if isinstance(output, unicode):
            return output.encode(getattr(self, "encoding", "UTF-8"))
        return output

    def _getoutput(self):
        """convert the units back to lines"""
        lines = []
        for unit in self.units:
            unitsrc = str(unit) + "\n"
            lines.append(unitsrc)
        lines = "".join(self.encode(lines)).rstrip()
        #After the last pounit we will have \n\n and we only want to end in \n:
        if lines:
            lines += "\n"
        return lines

    def encode(self, lines):
        """encode any unicode strings in lines in self._encoding"""
        newlines = []
        encoding = self._encoding
        if encoding is None or encoding.lower() == "charset":
            encoding = 'UTF-8'
        for line in lines:
            if isinstance(line, unicode):
                line = line.encode(encoding)
            newlines.append(line)
        return newlines

    def decode(self, lines):
        """decode any non-unicode strings in lines with self._encoding"""
        newlines = []
        for line in lines:
            if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset":
                try:
                    line = line.decode(self._encoding)
                except UnicodeError, e:
                    raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line))
            newlines.append(line)
        return newlines

    def unit_iter(self):
        for unit in self.units:
            if not (unit.isheader() or unit.isobsolete()):
                yield unit
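
# Illustrative usage sketch (assumed API, shown as a comment so nothing runs
# on import; the module name pypo is an assumption):
#
#   from translate.storage import pypo
#   posource = 'msgid "Hello"\nmsgstr "Bonjour"\n'
#   store = pypo.pofile(posource)
#   for unit in store.unit_iter():
#       print unit.source, "->", unit.target
#
# Parsing is delegated to poparser above, and str(store) serialises the
# units back to PO syntax via _getoutput().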