Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/translate-toolkit-1.5.1/translate/storage/pypo.py
blob: 8a6c4dccc6db3f865af86ed365c412623935b67f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""classes that hold units of .po files (pounit) or entire files (pofile)
gettext-style .po (or .pot) files are used in translations for KDE et al (see kbabel)"""

from __future__ import generators
from translate.misc.multistring import multistring
from translate.misc import quote
from translate.misc import textwrap
from translate.lang import data
from translate.storage import pocommon, base
import re
import copy
import cStringIO
import poparser

lsep = "\n#: "
"""Seperator for #: entries"""

# general functions for quoting / unquoting po strings

po_unescape_map = {"\\r": "\r", "\\t": "\t", '\\"': '"', '\\n': '\n', '\\\\': '\\'}
po_escape_map = dict([(value, key) for (key, value) in po_unescape_map.items()])

def escapeforpo(line):
    """Escapes a line for po format. assumes no \n occurs in the line.

    @param line: unescaped text
    """
    special_locations = []
    for special_key in po_escape_map:
        special_locations.extend(quote.find_all(line, special_key))
    special_locations = dict.fromkeys(special_locations).keys()
    special_locations.sort()
    escaped_line = ""
    last_location = 0
    for location in special_locations:
        escaped_line += line[last_location:location]
        escaped_line += po_escape_map[line[location:location+1]]
        last_location = location+1
    escaped_line += line[last_location:]
    return escaped_line

def unescapehandler(escape):

    return po_unescape_map.get(escape, escape)

def wrapline(line):
    """Wrap text for po files."""
    wrappedlines = textwrap.wrap(line, 76, replace_whitespace=False, expand_tabs=False, drop_whitespace=False)

    # Lines should not start with a space...
    if len(wrappedlines) > 1:
        for index, line in enumerate(wrappedlines[1:]):
            if line.startswith(' '):
                # Remove the space at the beginning of the line:
                wrappedlines[index+1] = line[1:]

                # Append a space to the previous line:
                wrappedlines[index] += ' '
    return wrappedlines

def quoteforpo(text):
    """quotes the given text for a PO file, returning quoted and escaped lines"""
    polines = []
    if text is None:
        return polines
    lines = text.split("\n")
    if len(lines) > 1 or (len(lines) == 1 and len(lines[0]) > 71):
        if len(lines) != 2 or lines[1]:
            polines.extend(['""'])
        for line in lines[:-1]:
            #TODO: We should only wrap after escaping
            lns = wrapline(line)
            if len(lns) > 0:
                for ln in lns[:-1]:
                    polines.extend(['"' + escapeforpo(ln) + '"'])
                if lns[-1]:
                    polines.extend(['"' + escapeforpo(lns[-1]) + '\\n"'])
            else:
                polines.extend(['"\\n"'])
    if lines[-1]:
        polines.extend(['"' + escapeforpo(line) + '"' for line in wrapline(lines[-1])])
    return polines

def extractpoline(line):
    """Remove quote and unescape line from po file.

    @param line: a quoted line from a po file (msgid or msgstr)
    """
    extracted = quote.extractwithoutquotes(line, '"', '"', '\\', includeescapes=unescapehandler)[0]
    return extracted

def unquotefrompo(postr):
    return u"".join([extractpoline(line) for line in postr])

def encodingToUse(encoding):
    """Tests whether the given encoding is known in the python runtime, or returns utf-8.
    This function is used to ensure that a valid encoding is always used."""
    if encoding == "CHARSET" or encoding == None:
        return 'utf-8'
    return encoding
#    if encoding is None: return False
#    return True
#    try:
#        tuple = codecs.lookup(encoding)
#    except LookupError:
#        return False
#    return True

def is_null(lst):
    return lst == [] or len(lst) == 1 and lst[0] == '""'

def extractstr(string):
    left = string.find('"')
    right = string.rfind('"')
    if right > -1:
        return string[left:right+1]
    else:
        return string[left:] + '"'

class pounit(pocommon.pounit):
    # othercomments = []      #   # this is another comment
    # automaticcomments = []  #   #. comment extracted from the source code
    # sourcecomments = []     #   #: sourcefile.xxx:35
    # prev_msgctxt = []       #   #| The previous values that msgctxt and msgid held
    # prev_msgid = []         # 
    # prev_msgid_plural = []  # 
    # typecomments = []       #   #, fuzzy
    # msgidcomments = []      #   _: within msgid
    # msgctxt
    # msgid = []
    # msgstr = []

    # Our homegrown way to indicate what must be copied in a shallow
    # fashion
    __shallow__ = ['_store']

    def __init__(self, source=None, encoding="UTF-8"):
        self._encoding = encodingToUse(encoding)
        self.obsolete = False
        self._initallcomments(blankall=True)
        self.prev_msgctxt = []
        self.prev_msgid = []
        self.prev_msgid_plural = []
        self.msgctxt = []
        self.msgid = []
        self.msgid_pluralcomments = []
        self.msgid_plural = []
        self.msgstr = []
        self.obsoletemsgctxt = []
        self.obsoletemsgid = []
        self.obsoletemsgid_pluralcomments = []
        self.obsoletemsgid_plural = []
        self.obsoletemsgstr = []
        pocommon.pounit.__init__(self, source)

    def _initallcomments(self, blankall=False):
        """Initialises allcomments"""
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomments = []
            self.obsoletemsgidcomments = []

    def _get_all_comments(self):
        return [self.othercomments,
                self.automaticcomments,
                self.sourcecomments,
                self.typecomments,
                self.msgidcomments,
                self.obsoletemsgidcomments]

    allcomments = property(_get_all_comments)

    def _get_source_vars(self, msgid, msgid_plural):
        multi = multistring(unquotefrompo(msgid), self._encoding)
        if self.hasplural():
            pluralform = unquotefrompo(msgid_plural)
            if isinstance(pluralform, str):
                pluralform = pluralform.decode(self._encoding)
            multi.strings.append(pluralform)
        return multi

    def _set_source_vars(self, source):
        msgid = None
        msgid_plural = None
        if isinstance(source, str):
            source = source.decode(self._encoding)
        if isinstance(source, multistring):
            source = source.strings
        if isinstance(source, list):
            msgid = quoteforpo(source[0])
            if len(source) > 1:
                msgid_plural = quoteforpo(source[1])
            else:
                msgid_plural = []
        else:
            msgid = quoteforpo(source)
            msgid_plural = []
        return msgid, msgid_plural

    def getsource(self):
        """Returns the unescaped msgid"""
        return self._get_source_vars(self.msgid, self.msgid_plural)

    def setsource(self, source):
        """Sets the msgid to the given (unescaped) value.

        @param source: an unescaped source string.
        """
        self.msgid, self.msgid_plural = self._set_source_vars(source)
    source = property(getsource, setsource)

    def _get_prev_source(self):
        """Returns the unescaped msgid"""
        return self._get_source_vars(self.prev_msgid, self.prev_msgid_plural)

    def _set_prev_source(self, source):
        """Sets the msgid to the given (unescaped) value.

        @param source: an unescaped source string.
        """
        self.prev_msgid, self.prev_msgid_plural = self._set_source_vars(source)
    prev_source = property(_get_prev_source, _set_prev_source)

    def gettarget(self):
        """Returns the unescaped msgstr"""
        if isinstance(self.msgstr, dict):
            multi = multistring(map(unquotefrompo, self.msgstr.values()), self._encoding)
        else:
            multi = multistring(unquotefrompo(self.msgstr), self._encoding)
        return multi

    def settarget(self, target):
        """Sets the msgstr to the given (unescaped) value"""
        self._rich_target = None
        if isinstance(target, str):
            target = target.decode(self._encoding)
        if self.hasplural():
            if isinstance(target, multistring):
                target = target.strings
            elif isinstance(target, basestring):
                target = [target]
        elif isinstance(target, (dict, list)):
            if len(target) == 1:
                target = target[0]
            else:
                raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
        templates = self.msgstr
        if isinstance(templates, list):
            templates = {0: templates}
        if isinstance(target, list):
            self.msgstr = dict([(i, quoteforpo(target[i])) for i in range(len(target))])
        elif isinstance(target, dict):
            self.msgstr = dict([(i, quoteforpo(targetstring)) for i, targetstring in target.iteritems()])
        else:
            self.msgstr = quoteforpo(target)
    target = property(gettarget, settarget)

    def getnotes(self, origin=None):
        """Return comments based on origin value (programmer, developer, source code and translator)"""
        if origin == None:
            comments = u"".join([comment[2:] for comment in self.othercomments])
            comments += u"".join([comment[3:] for comment in self.automaticcomments])
        elif origin == "translator":
            comments = u"".join ([comment[2:] for comment in self.othercomments])
        elif origin in ["programmer", "developer", "source code"]:
            comments = u"".join([comment[3:] for comment in self.automaticcomments])
        else:
            raise ValueError("Comment type not valid")
        # Let's drop the last newline
        return comments[:-1]

    def addnote(self, text, origin=None, position="append"):
        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote"""
        # ignore empty strings and strings without non-space characters
        if not (text and text.strip()):
            return
        text = data.forceunicode(text)
        commentlist = self.othercomments
        linestart = "# "
        autocomments = False
        if origin in ["programmer", "developer", "source code"]:
            autocomments = True
            commentlist = self.automaticcomments
            linestart = "#. "
        text = text.split("\n")
        if position == "append":
            commentlist += [linestart + line + "\n" for line in text]
        else:
            newcomments = [linestart + line + "\n" for line in text]
            newcomments += [line for line in commentlist]
            if autocomments:
                self.automaticcomments = newcomments
            else:
                self.othercomments = newcomments

    def removenotes(self):
        """Remove all the translator's notes (other comments)"""
        self.othercomments = []

    def __deepcopy__(self, memo={}):
        # Make an instance to serve as the copy
        new_unit = self.__class__()
        # We'll be testing membership frequently, so make a set from
        # self.__shallow__
        shallow = set(self.__shallow__)
        # Make deep copies of all members which are not in shallow
        for key, value in self.__dict__.iteritems():
            if key not in shallow:
                setattr(new_unit, key, copy.deepcopy(value))
        # Make shallow copies of all members which are in shallow
        for key in set(shallow):
            setattr(new_unit, key, getattr(self, key))
        # Mark memo with ourself, so that we won't get deep copied
        # again
        memo[id(self)] = self
        # Return our copied unit
        return new_unit

    def copy(self):
        return copy.deepcopy(self)

    def _msgidlen(self):
        if self.hasplural():
            return len(unquotefrompo(self.msgid).strip()) + len(unquotefrompo(self.msgid_plural).strip())
        else:
            return len(unquotefrompo(self.msgid).strip())

    def _msgstrlen(self):
        if isinstance(self.msgstr, dict):
            combinedstr = "\n".join([unquotefrompo(msgstr).strip() for msgstr in self.msgstr.itervalues()])
            return len(combinedstr.strip())
        else:
            return len(unquotefrompo(self.msgstr).strip())

    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True
        """

        def mergelists(list1, list2, split=False):
            #decode where necessary
            if unicode in [type(item) for item in list2] + [type(item) for item in list1]:
                for position, item in enumerate(list1):
                    if isinstance(item, str):
                        list1[position] = item.decode("utf-8")
                for position, item in enumerate(list2):
                    if isinstance(item, str):
                        list2[position] = item.decode("utf-8")

            #Determine the newline style of list1
            lineend = ""
            if list1 and list1[0]:
                for candidate in ["\n", "\r", "\n\r"]:
                    if list1[0].endswith(candidate):
                        lineend = candidate
                if not lineend:
                    lineend = ""
            else:
                lineend = "\n"

            #Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                prefix = "#"
                for item in list1:
                    splitlist1.extend(item.split()[1:])
                    prefix = item.split()[0]
                for item in list2:
                    splitlist2.extend(item.split()[1:])
                    prefix = item.split()[0]
                list1.extend(["%s %s%s" % (prefix, item, lineend) for item in splitlist2 if not item in splitlist1])
            else:
                #Normal merge, but conform to list1 newline style
                if list1 != list2:
                    for item in list2:
                        if lineend:
                            item = item.rstrip() + lineend
                        # avoid duplicate comment lines (this might cause some problems)
                        if item not in list1 or len(item) < 5:
                            list1.append(item)
        if not isinstance(otherpo, pounit):
            super(pounit, self).merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # We don't bring across otherpo.automaticcomments as we consider ourself
                # to be the the authority.  Same applies to otherpo.msgidcomments
                mergelists(self.automaticcomments, otherpo.automaticcomments)
                mergelists(self.msgidcomments, otherpo.msgidcomments)
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any).
            if self._extract_msgidcomments(otherpo.target):
                otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments()+ '\n', '')
            self.target = otherpo.target
            if self.source != otherpo.source or self.getcontext() != otherpo.getcontext():
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        else:
            if self.target != otherpo.target:
                self.markfuzzy()

    def isheader(self):
        #return (self._msgidlen() == 0) and (self._msgstrlen() > 0) and (len(self.msgidcomments) == 0)
        #rewritten here for performance:
        return (is_null(self.msgid)
                        and not is_null(self.msgstr)
                        and self.msgidcomments == []
                        and is_null(self.msgctxt)
        )

    def isblank(self):
        if self.isheader() or len(self.msgidcomments):
            return False
        if (self._msgidlen() == 0) and (self._msgstrlen() == 0) and (is_null(self.msgctxt)):
            return True
        return False
        # TODO: remove:
        # Before, the equivalent of the following was the final return statement:
        # return len(self.source.strip()) == 0

    def hastypecomment(self, typecomment):
        """Check whether the given type comment is present"""
        # check for word boundaries properly by using a regular expression...
        return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0

    def hasmarkedcomment(self, commentmarker):
        """Check whether the given comment marker is present as # (commentmarker) ..."""
        commentmarker = "(%s)" % commentmarker
        for comment in self.othercomments:
            if comment.replace("#", "", 1).strip().startswith(commentmarker):
                return True
        return False

    def settypecomment(self, typecomment, present=True):
        """Alters whether a given typecomment is present"""
        if self.hastypecomment(typecomment) != present:
            if present:
                self.typecomments.append("#, %s\n" % typecomment)
            else:
                # this should handle word boundaries properly ...
                typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
                self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)

    def isfuzzy(self):
        return self.hastypecomment("fuzzy")

    def markfuzzy(self, present=True):
        self.settypecomment("fuzzy", present)

    def isobsolete(self):
        return self.obsolete

    def makeobsolete(self):
        """Makes this unit obsolete"""
        self.obsolete = True
        if self.msgctxt:
            self.obsoletemsgctxt = self.msgctxt
        if self.msgid:
            self.obsoletemsgid = self.msgid
            self.msgid = []
        if self.msgidcomments:
            self.obsoletemsgidcomments = self.msgidcomments
            self.msgidcomments = []
        if self.msgid_plural:
            self.obsoletemsgid_plural = self.msgid_plural
            self.msgid_plural = []
        if self.msgstr:
            self.obsoletemsgstr = self.msgstr
            self.msgstr = []
        self.sourcecomments = []
        self.automaticcomments = []

    def resurrect(self):
        """Makes an obsolete unit normal"""
        self.obsolete = False
        if self.obsoletemsgctxt:
            self.msgid = self.obsoletemsgctxt
            self.obsoletemsgctxt = []
        if self.obsoletemsgid:
            self.msgid = self.obsoletemsgid
            self.obsoletemsgid = []
        if self.obsoletemsgidcomments:
            self.msgidcomments = self.obsoletemsgidcomments
            self.obsoletemsgidcomments = []
        if self.obsoletemsgid_plural:
            self.msgid_plural = self.obsoletemsgid_plural
            self.obsoletemsgid_plural = []
        if self.obsoletemsgstr:
            self.msgstr = self.obsoletemsgstr
            self.obsoletemgstr = []

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        return len(self.msgid_plural) > 0

    def parse(self, src):
        return poparser.parse_unit(poparser.ParseState(cStringIO.StringIO(src), pounit), self)

    def _getmsgpartstr(self, partname, partlines, partcomments=""):
        if isinstance(partlines, dict):
            partkeys = partlines.keys()
            partkeys.sort()
            return "".join([self._getmsgpartstr("%s[%d]" % (partname, partkey), partlines[partkey], partcomments) for partkey in partkeys])
        partstr = partname + " "
        partstartline = 0
        if len(partlines) > 0 and len(partcomments) == 0:
            partstr += partlines[0]
            partstartline = 1
        elif len(partcomments) > 0:
            if len(partlines) > 0 and len(unquotefrompo(partlines[:1])) == 0:
                # if there is a blank leader line, it must come before the comment
                partstr += partlines[0] + '\n'
                # but if the whole string is blank, leave it in
                if len(partlines) > 1:
                    partstartline += 1
            else:
                # All partcomments should start on a newline
                partstr += '""\n'
            # combine comments into one if more than one
            if len(partcomments) > 1:
                combinedcomment = []
                for comment in partcomments:
                    comment = unquotefrompo([comment])
                    if comment.startswith("_:"):
                        comment = comment[len("_:"):]
                    if comment.endswith("\\n"):
                        comment = comment[:-len("\\n")]
                    #Before we used to strip. Necessary in some cases?
                    combinedcomment.append(comment)
                partcomments = quoteforpo("_:%s" % "".join(combinedcomment))
            # comments first, no blank leader line needed
            partstr += "\n".join(partcomments)
            partstr = quote.rstripeol(partstr)
        else:
            partstr += '""'
        partstr += '\n'
        # add the rest
        for partline in partlines[partstartline:]:
            partstr += partline + '\n'
        return partstr

    def _encodeifneccessary(self, output):
        """encodes unicode strings and returns other strings unchanged"""
        if isinstance(output, unicode):
            encoding = encodingToUse(getattr(self, "encoding", "UTF-8"))
            return output.encode(encoding)
        return output

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        return self._encodeifneccessary(output)

    def _getoutput(self):
        """return this po element as a string"""
        def add_prev_msgid_lines(lines, header, var):
            if len(var) > 0:
                lines.append("#| %s %s\n" % (header, var[0]))
                lines.extend("#| %s\n" % line for line in var[1:])

        def add_prev_msgid_info(lines):
            add_prev_msgid_lines(lines, 'msgctxt', self.prev_msgctxt)
            add_prev_msgid_lines(lines, 'msgid', self.prev_msgid)
            add_prev_msgid_lines(lines, 'msgid_plural', self.prev_msgid_plural)

        lines = []
        lines.extend(self.othercomments)
        if self.isobsolete():
            lines.extend(self.typecomments)
            obsoletelines = []
            if self.obsoletemsgctxt:
                obsoletelines.append(self._getmsgpartstr("#~ msgctxt", self.obsoletemsgctxt))
            obsoletelines.append(self._getmsgpartstr("#~ msgid", self.obsoletemsgid, self.obsoletemsgidcomments))
            if self.obsoletemsgid_plural or self.obsoletemsgid_pluralcomments:
                obsoletelines.append(self._getmsgpartstr("#~ msgid_plural", self.obsoletemsgid_plural, self.obsoletemsgid_pluralcomments))
            obsoletelines.append(self._getmsgpartstr("#~ msgstr", self.obsoletemsgstr))
            for index, obsoleteline in enumerate(obsoletelines):
                # We need to account for a multiline msgid or msgstr here
                obsoletelines[index] = obsoleteline.replace('\n"', '\n#~ "')
            lines.extend(obsoletelines)
            lines = [self._encodeifneccessary(line) for line in lines]
            return "".join(lines)
        # if there's no msgid don't do msgid and string, unless we're the header
        # this will also discard any comments other than plain othercomments...
        if is_null(self.msgid):
            if not (self.isheader() or self.getcontext() or self.sourcecomments):
                return "".join(lines)
        lines.extend(self.automaticcomments)
        lines.extend(self.sourcecomments)
        lines.extend(self.typecomments)
        add_prev_msgid_info(lines)
        if self.msgctxt:
            lines.append(self._getmsgpartstr("msgctxt", self.msgctxt))
        lines.append(self._getmsgpartstr("msgid", self.msgid, self.msgidcomments))
        if self.msgid_plural or self.msgid_pluralcomments:
            lines.append(self._getmsgpartstr("msgid_plural", self.msgid_plural, self.msgid_pluralcomments))
        lines.append(self._getmsgpartstr("msgstr", self.msgstr))
        lines = [self._encodeifneccessary(line) for line in lines]
        postr = "".join(lines)
        return postr

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit

        rtype: List
        return: A list of the locations with '#: ' stripped

        """
        locations = []
        for sourcecomment in self.sourcecomments:
            locations += quote.rstripeol(sourcecomment)[3:].split()
        return locations

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit

        @param location: Text location e.g. 'file.c:23' does not include #:
        @type location: String

        """
        self.sourcecomments.append("#: %s\n" % location)

    def _extract_msgidcomments(self, text=None):
        """Extract KDE style msgid comments from the unit.

        @rtype: String
        @return: Returns the extracted msgidcomments found in this unit's msgid.
        """

        if not text:
            text = unquotefrompo(self.msgidcomments)
        return text.split('\n')[0].replace('_: ', '', 1)

    def setmsgidcomment(self, msgidcomment):
        if msgidcomment:
            self.msgidcomments = ['"_: %s\\n"' % msgidcomment]
        else:
            self.msgidcomments = []

    msgidcomment = property(_extract_msgidcomments, setmsgidcomment)
    
    def getcontext(self):
        """Get the message context."""
        return unquotefrompo(self.msgctxt) + self._extract_msgidcomments()

    def getid(self):
        """Returns a unique identifier for this unit."""
        context = self.getcontext()
        # Gettext does not consider the plural to determine duplicates, only 
        # the msgid. For generation of .mo files, we might want to use this
        # code to generate the entry for the hash table, but for now, it is 
        # commented out for conformance to gettext.
#        id = '\0'.join(self.source.strings)
        id = self.source
        if self.msgidcomments:
            id = u"_: %s\n%s" % (context, id)
        elif context:
            id = u"%s\04%s" % (context, id)
        return id

class pofile(pocommon.pofile):
    """A .po file containing various units"""
    UnitClass = pounit

    def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
        """Construct a pofile, optionally reading in from inputfile.
        encoding can be specified but otherwise will be read from the PO header"""
        self.UnitClass = unitclass
        pocommon.pofile.__init__(self, unitclass=unitclass)
        self.units = []
        self.filename = ''
        self._encoding = encodingToUse(encoding)
        if inputfile is not None:
            self.parse(inputfile)

    def changeencoding(self, newencoding):
        """Deprecated: changes the encoding on the file."""
        # This should not be here but in poheader. It also shouldn't mangle the
        # header itself, but use poheader methods. All users are removed, so
        # we can deprecate after one release.
        raise DeprecationWarning

        self._encoding = encodingToUse(newencoding)
        if not self.units:
            return
        header = self.header()
        if not header or header.isblank():
            return
        charsetline = None
        headerstr = unquotefrompo(header.msgstr)
        for line in headerstr.split("\n"):
            if not ":" in line:
                continue
            key, value = line.strip().split(":", 1)
            if key.strip() != "Content-Type":
                continue
            charsetline = line
        if charsetline is None:
            headerstr += "Content-Type: text/plain; charset=%s" % self._encoding
        else:
            charset = re.search("charset=([^ ]*)", charsetline)
            if charset is None:
                newcharsetline = charsetline
                if not newcharsetline.strip().endswith(";"):
                    newcharsetline += ";"
                newcharsetline += " charset=%s" % self._encoding
            else:
                charset = charset.group(1)
                newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1)
            headerstr = headerstr.replace(charsetline, newcharsetline, 1)
        header.msgstr = quoteforpo(headerstr)

    def parse(self, input):
        """Parses the given file or file source string."""
        try:
            if hasattr(input, 'name'):
                self.filename = input.name
            elif not getattr(self, 'filename', ''):
                self.filename = ''
            if isinstance(input, str):
                input = cStringIO.StringIO(input)
            poparser.parse_units(poparser.ParseState(input, pounit), self)
        except Exception, e:
            raise base.ParseError(e)

    def removeduplicates(self, duplicatestyle="merge"):
        """Make sure each msgid is unique ; merge comments etc from duplicates into original"""
        # TODO: can we handle consecutive calls to removeduplicates()? What
        # about files already containing msgctxt? - test
        id_dict = {}
        uniqueunits = []
        # TODO: this is using a list as the pos aren't hashable, but this is slow.
        # probably not used frequently enough to worry about it, though.
        markedpos = []
        def addcomment(thepo):
            thepo.msgidcomments.append('"_: %s\\n"' % " ".join(thepo.getlocations()))
            markedpos.append(thepo)
        for thepo in self.units:
            id = thepo.getid()
            if thepo.isheader() and not thepo.getlocations():
                # header msgids shouldn't be merged...
                uniqueunits.append(thepo)
            elif id in id_dict:
                if duplicatestyle == "merge":
                    if id:
                        id_dict[id].merge(thepo)
                    else:
                        addcomment(thepo)
                        uniqueunits.append(thepo)
                elif duplicatestyle == "msgctxt":
                    origpo = id_dict[id]
                    if origpo not in markedpos:
                        origpo.msgctxt.append('"%s"' % escapeforpo(" ".join(origpo.getlocations())))
                        markedpos.append(thepo)
                    thepo.msgctxt.append('"%s"' % escapeforpo(" ".join(thepo.getlocations())))
                    uniqueunits.append(thepo)
            else:
                if not id:
                    if duplicatestyle == "merge":
                        addcomment(thepo)
                    else:
                        thepo.msgctxt.append('"%s"' % escapeforpo(" ".join(thepo.getlocations())))
                id_dict[id] = thepo
                uniqueunits.append(thepo)
        self.units = uniqueunits

    def __str__(self):
        """Convert to a string. double check that unicode is handled somehow here"""
        output = self._getoutput()
        if isinstance(output, unicode):
            return output.encode(getattr(self, "encoding", "UTF-8"))
        return output

    def _getoutput(self):
        """convert the units back to lines"""
        lines = []
        for unit in self.units:
            unitsrc = str(unit) + "\n"
            lines.append(unitsrc)
        lines = "".join(self.encode(lines)).rstrip()
        #After the last pounit we will have \n\n and we only want to end in \n:
        if lines:
            lines += "\n"
        return lines

    def encode(self, lines):
        """encode any unicode strings in lines in self._encoding"""
        newlines = []
        encoding = self._encoding
        if encoding is None or encoding.lower() == "charset":
            encoding = 'UTF-8'
        for line in lines:
            if isinstance(line, unicode):
                line = line.encode(encoding)
            newlines.append(line)
        return newlines

    def decode(self, lines):
        """decode any non-unicode strings in lines with self._encoding"""
        newlines = []
        for line in lines:
            if isinstance(line, str) and self._encoding is not None and self._encoding.lower() != "charset":
                try:
                    line = line.decode(self._encoding)
                except UnicodeError, e:
                    raise UnicodeError("Error decoding line with encoding %r: %s. Line is %r" % (self._encoding, e, line))
            newlines.append(line)
        return newlines

    def unit_iter(self):
        for unit in self.units:
            if not (unit.isheader() or unit.isobsolete()):
                yield unit