Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/translate-toolkit-1.5.1/translate/storage/fpo.py
blob: 579317d9a85a270d2a9e8aa5659155e661cafa46 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Classes for the support of Gettext .po and .pot files.

This implementation assumes that cpo is working. This should not be used
directly, but can be used once cpo has been established to work."""

#TODO:
# - handle headerless PO files better
# - previous msgid and msgctxt
# - accept only unicodes everywhere

from translate.misc.multistring import multistring
from translate.lang import data
from translate.storage import pocommon, base, cpo
import re
import copy
import cStringIO

lsep = " "
"""Seperator for #: entries"""

basic_header = r'''msgid ""
msgstr ""
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
'''

def encodingToUse(encoding):
    """Tests whether the given encoding is known in the python runtime, or returns utf-8.
    This function is used to ensure that a valid encoding is always used."""
    if encoding == "CHARSET" or encoding == None:
        return 'utf-8'
    return encoding

class pounit(pocommon.pounit):
    # othercomments = []      #   # this is another comment
    # automaticcomments = []  #   #. comment extracted from the source code
    # sourcecomments = []     #   #: sourcefile.xxx:35
    # prev_msgctxt = []       #   #| The previous values that msgctxt and msgid held
    # prev_msgid = []         # 
    # prev_msgid_plural = []  # 
    # typecomments = []       #   #, fuzzy
    # msgidcomment = u""      #   _: within msgid
    # msgctxt
    # msgid = []
    # msgstr = []

    # Our homegrown way to indicate what must be copied in a shallow
    # fashion
    __shallow__ = ['_store']

    def __init__(self, source=None, encoding="UTF-8"):
        pocommon.pounit.__init__(self, source)
        self._encoding = encodingToUse(encoding)
        self.obsolete = False
        self._initallcomments(blankall=True)
        self._msgctxt = u""

        self.target = u""

    def _initallcomments(self, blankall=False):
        """Initialises allcomments"""
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomment = u""

    def getsource(self):
        return self._source

    def setsource(self, source):
        self._rich_source = None
#        assert isinstance(source, unicode)
        source = data.forceunicode(source or u"")
        source = source or u""
        if isinstance(source, multistring):
            self._source = source
        elif isinstance(source, unicode):
            self._source = source
        else:
            #unicode, list, dict
            self._source = multistring(source)
    source = property(getsource, setsource)


    def gettarget(self):
        """Returns the unescaped msgstr"""
        return self._target

    def settarget(self, target):
        """Sets the msgstr to the given (unescaped) value"""
        self._rich_target = None
#        assert isinstance(target, unicode)
#        target = data.forceunicode(target)
        if self.hasplural():
            if isinstance(target, multistring):
                self._target = target
            else:
                #unicode, list, dict
                self._target = multistring(target)
        elif isinstance(target, (dict, list)):
            if len(target) == 1:
                self._target = target[0]
            else:
                raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target))
        else:
            self._target = target
    target = property(gettarget, settarget)

    def getnotes(self, origin=None):
        """Return comments based on origin value (programmer, developer, source code and translator)"""
        if origin == None:
            comments = u"\n".join(self.othercomments)
            comments += u"\n".join(self.automaticcomments)
        elif origin == "translator":
            comments = u"\n".join (self.othercomments)
        elif origin in ["programmer", "developer", "source code"]:
            comments = u"\n".join(self.automaticcomments)
        else:
            raise ValueError("Comment type not valid")
        return comments

    def addnote(self, text, origin=None, position="append"):
        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote"""
        # ignore empty strings and strings without non-space characters
        if not (text and text.strip()):
            return
        text = data.forceunicode(text)
        commentlist = self.othercomments
        if origin in ["programmer", "developer", "source code"]:
            autocomments = True
            commentlist = self.automaticcomments
        if text.endswith(u'\n'):
            text = text[:-1]
        text = text.split(u"\n")
        if position == "append":
            commentlist.extend(text)
        else:
            newcomments = text
            newcomments.extend(commentlist)
            if autocomments:
                self.automaticcomments = newcomments
            else:
                self.othercomments = newcomments

    def removenotes(self):
        """Remove all the translator's notes (other comments)"""
        self.othercomments = []

    def __deepcopy__(self, memo={}):
        # Make an instance to serve as the copy
        new_unit = self.__class__()
        # We'll be testing membership frequently, so make a set from
        # self.__shallow__
        shallow = set(self.__shallow__)
        # Make deep copies of all members which are not in shallow
        for key, value in self.__dict__.iteritems():
            if key not in shallow:
                setattr(new_unit, key, copy.deepcopy(value))
        # Make shallow copies of all members which are in shallow
        for key in set(shallow):
            setattr(new_unit, key, getattr(self, key))
        # Mark memo with ourself, so that we won't get deep copied
        # again
        memo[id(self)] = self
        # Return our copied unit
        return new_unit

    def copy(self):
        return copy.deepcopy(self)

    def _msgidlen(self):
        if self.hasplural():
            len("".join([string.strip() for string in self.source.strings]))
        else:
            return len(self.source.strip())

    def _msgstrlen(self):
        if self.hasplural():
            len("".join([string.strip() for string in self.target.strings]))
        else:
            return len(self.target.strip())

    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True
        """

        def mergelists(list1, list2, split=False):
            #Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                for item in list1:
                    splitlist1.extend(item.split())
                for item in list2:
                    splitlist2.extend(item.split())
                list1.extend([item for item in splitlist2 if not item in splitlist1])
            else:
                #Normal merge, but conform to list1 newline style
                if list1 != list2:
                    for item in list2:
                        # avoid duplicate comment lines (this might cause some problems)
                        if item not in list1 or len(item) < 5:
                            list1.append(item)

        if not isinstance(otherpo, pounit):
            super(pounit, self).merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # We don't bring across otherpo.automaticcomments as we consider ourself
                # to be the the authority.  Same applies to otherpo.msgidcomments
                mergelists(self.automaticcomments, otherpo.automaticcomments)
#                mergelists(self.msgidcomments, otherpo.msgidcomments) #XXX?
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any). XXX - remove
            if pocommon.extract_msgid_comment(otherpo.target):
                otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments()+ '\n', '')
            self.target = otherpo.target
            if self.source != otherpo.source or self.getcontext() != otherpo.getcontext():
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        else:
            if self.target != otherpo.target:
                self.markfuzzy()

    def isheader(self):
        #TODO: fix up nicely
        return not self.getid() and len(self.target) > 0

    def isblank(self):
        if self.isheader() or self.msgidcomment:
            return False
        if (self._msgidlen() == 0) and (self._msgstrlen() == 0) and len(self._msgctxt) == 0:
            return True
        return False

    def hastypecomment(self, typecomment):
        """Check whether the given type comment is present"""
        # check for word boundaries properly by using a regular expression...
        return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0

    def hasmarkedcomment(self, commentmarker):
        """Check whether the given comment marker is present as # (commentmarker) ..."""
#        raise DeprecationWarning
        commentmarker = "(%s)" % commentmarker
        for comment in self.othercomments:
            if comment.startswith(commentmarker):
                return True
        return False

    def settypecomment(self, typecomment, present=True):
        """Alters whether a given typecomment is present"""
        if self.hastypecomment(typecomment) != present:
            if present:
                self.typecomments.append("#, %s\n" % typecomment)
            else:
                # this should handle word boundaries properly ...
                typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments)
                self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments)

    def istranslated(self):
        return super(pounit, self).istranslated() and not self.isobsolete()

    def istranslatable(self):
        return not (self.isheader() or self.isblank() or self.obsolete)

    def isfuzzy(self):
        return self.hastypecomment("fuzzy")

    def markfuzzy(self, present=True):
        self.settypecomment("fuzzy", present)

    def isobsolete(self):
        return self.obsolete

    def makeobsolete(self):
        """Makes this unit obsolete"""
        self.obsolete = True
        self.sourcecomments = []
        self.automaticcomments = []

    def resurrect(self):
        """Makes an obsolete unit normal"""
        self.obsolete = False

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        source = self.source
        return isinstance(source, multistring) and len(source.strings) > 1

    def parse(self, src):
        raise DeprecationWarning("Should not be parsing with a unit")
        return poparser.parse_unit(poparser.ParseState(cStringIO.StringIO(src), pounit), self)

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        _cpo_unit = cpo.pounit.buildfromunit(self)
        return str(_cpo_unit)

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit

        rtype: List
        return: A list of the locations with '#: ' stripped

        """
        #TODO: rename to .locations
        return self.sourcecomments

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit

        @param location: Text location e.g. 'file.c:23' does not include #:
        @type location: String
        """
        self.sourcecomments.extend(location.split())

    def _extract_msgidcomments(self, text=None):
        """Extract KDE style msgid comments from the unit.

        @rtype: String
        @return: Returns the extracted msgidcomments found in this unit's msgid.
        """
        if text:
            return pocommon.extract_msgid_comment(text)
        else:
            return self.msgidcomment

    def getcontext(self):
        """Get the message context."""
        return self._msgctxt + self.msgidcomment

    def getid(self):
        """Returns a unique identifier for this unit."""
        context = self.getcontext()
        # Gettext does not consider the plural to determine duplicates, only 
        # the msgid. For generation of .mo files, we might want to use this
        # code to generate the entry for the hash table, but for now, it is 
        # commented out for conformance to gettext.
#        id = '\0'.join(self.source.strings)
        id = self.source
        if self.msgidcomment:
            id = u"_: %s\n%s" % (context, id)
        elif context:
            id = u"%s\04%s" % (context, id)
        return id

    def buildfromunit(cls, unit):
        """Build a native unit from a foreign unit, preserving as much
        information as possible."""
        if type(unit) == cls and hasattr(unit, "copy") and callable(unit.copy):
            return unit.copy()
        elif isinstance(unit, pocommon.pounit):
            newunit = cls(unit.source)
            newunit.target = unit.target
            #context
            newunit.msgidcomment = unit._extract_msgidcomments()
            if not newunit.msgidcomment:
                newunit._msgctxt = unit.getcontext()

            locations = unit.getlocations()
            if locations:
                newunit.addlocations(locations)
            notes = unit.getnotes("developer")
            if notes:
                newunit.addnote(notes, "developer")
            notes = unit.getnotes("translator")
            if notes:
                newunit.addnote(notes, "translator")
            if unit.isobsolete():
                newunit.makeobsolete()
            newunit.markfuzzy(unit.isfuzzy())
            for tc in ['python-format', 'c-format', 'php-format']:
                if unit.hastypecomment(tc):
                    newunit.settypecomment(tc)
                    break
            return newunit
        else:
            return base.TranslationUnit.buildfromunit(unit)
    buildfromunit = classmethod(buildfromunit)

class pofile(pocommon.pofile):
    """A .po file containing various units"""
    UnitClass = pounit

    def __init__(self, inputfile=None, encoding=None, unitclass=pounit):
        """Construct a pofile, optionally reading in from inputfile.
        encoding can be specified but otherwise will be read from the PO header"""
        self.UnitClass = unitclass
        pocommon.pofile.__init__(self, unitclass=unitclass)
        self.units = []
        self.filename = ''
        self._encoding = encodingToUse(encoding)
        if inputfile is not None:
            self.parse(inputfile)

    def changeencoding(self, newencoding):
        """Deprecated: changes the encoding on the file."""
        # This should not be here but in poheader. It also shouldn't mangle the
        # header itself, but use poheader methods. All users are removed, so
        # we can deprecate after one release.
        raise DeprecationWarning

        self._encoding = encodingToUse(newencoding)
        if not self.units:
            return
        header = self.header()
        if not header or header.isblank():
            return
        charsetline = None
        headerstr = header.target
        for line in headerstr.split("\n"):
            if not ":" in line:
                continue
            key, value = line.strip().split(":", 1)
            if key.strip() != "Content-Type":
                continue
            charsetline = line
        if charsetline is None:
            headerstr += "Content-Type: text/plain; charset=%s" % self._encoding
        else:
            charset = re.search("charset=([^ ]*)", charsetline)
            if charset is None:
                newcharsetline = charsetline
                if not newcharsetline.strip().endswith(";"):
                    newcharsetline += ";"
                newcharsetline += " charset=%s" % self._encoding
            else:
                charset = charset.group(1)
                newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1)
            headerstr = headerstr.replace(charsetline, newcharsetline, 1)
        header.target = headerstr

    def _build_self_from_cpo(self):
        """Builds up this store from the internal cpo store.

        A user must ensure that self._cpo_store already exists, and that it is
        deleted afterwards."""
        for unit in self._cpo_store.units:
            self.addunit(self.UnitClass.buildfromunit(unit))
        self._encoding = self._cpo_store._encoding

    def _build_cpo_from_self(self):
        """Builds the internal cpo store from the data in self.

        A user must ensure that self._cpo_store does not exist, and should
        delete it after using it."""
        self._cpo_store = cpo.pofile()
        for unit in self.units:
            if not unit.isblank():
                self._cpo_store.addunit(cpo.pofile.UnitClass.buildfromunit(unit))
        if not self._cpo_store.header():
            #only add a temporary header
            self._cpo_store.makeheader(charset="utf-8", encoding="8bit")


    def parse(self, input):
        """Parses the given file or file source string."""
        try:
            if hasattr(input, 'name'):
                self.filename = input.name
            elif not getattr(self, 'filename', ''):
                self.filename = ''
            tmp_header_added = False
#            if isinstance(input, str) and '"Content-Type: text/plain; charset=' not in input[:200]:
#                input = basic_header + input
#                tmp_header_added = True
            self._cpo_store = cpo.pofile(input)
            self._build_self_from_cpo()
            del self._cpo_store
            if tmp_header_added:
                self.units = self.units[1:]
        except Exception, e:
            raise base.ParseError(e)

    def removeduplicates(self, duplicatestyle="merge"):
        """Make sure each msgid is unique ; merge comments etc from duplicates into original"""
        # TODO: can we handle consecutive calls to removeduplicates()? What
        # about files already containing msgctxt? - test
        id_dict = {}
        uniqueunits = []
        # TODO: this is using a list as the pos aren't hashable, but this is slow.
        # probably not used frequently enough to worry about it, though.
        markedpos = []
        def addcomment(thepo):
            thepo.msgidcomment = " ".join(thepo.getlocations())
            markedpos.append(thepo)
        for thepo in self.units:
            id = thepo.getid()
            if thepo.isheader() and not thepo.getlocations():
                # header msgids shouldn't be merged...
                uniqueunits.append(thepo)
            elif id in id_dict:
                if duplicatestyle == "merge":
                    if id:
                        id_dict[id].merge(thepo)
                    else:
                        addcomment(thepo)
                        uniqueunits.append(thepo)
                elif duplicatestyle == "msgctxt":
                    origpo = id_dict[id]
                    if origpo not in markedpos:
                        origpo._msgctxt += " ".join(origpo.getlocations())
                        markedpos.append(thepo)
                    thepo._msgctxt += " ".join(thepo.getlocations())
                    uniqueunits.append(thepo)
            else:
                if not id:
                    if duplicatestyle == "merge":
                        addcomment(thepo)
                    else:
                        thepo._msgctxt += u" ".join(thepo.getlocations())
                id_dict[id] = thepo
                uniqueunits.append(thepo)
        self.units = uniqueunits

    def __str__(self):
        """Convert to a string. double check that unicode is handled somehow here"""
        self._cpo_store = cpo.pofile(encoding=self._encoding)
        self._build_cpo_from_self()
        output = str(self._cpo_store)
        del self._cpo_store
        return output