#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright 2002-2009 Zuza Software Foundation # # This file is part of the Translate Toolkit. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . """Classes for the support of Gettext .po and .pot files. This implementation assumes that cpo is working. This should not be used directly, but can be used once cpo has been established to work.""" #TODO: # - handle headerless PO files better # - previous msgid and msgctxt # - accept only unicodes everywhere from translate.misc.multistring import multistring from translate.lang import data from translate.storage import pocommon, base, cpo import re import copy import cStringIO lsep = " " """Seperator for #: entries""" basic_header = r'''msgid "" msgstr "" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" ''' def encodingToUse(encoding): """Tests whether the given encoding is known in the python runtime, or returns utf-8. This function is used to ensure that a valid encoding is always used.""" if encoding == "CHARSET" or encoding == None: return 'utf-8' return encoding class pounit(pocommon.pounit): # othercomments = [] # # this is another comment # automaticcomments = [] # #. comment extracted from the source code # sourcecomments = [] # #: sourcefile.xxx:35 # prev_msgctxt = [] # #| The previous values that msgctxt and msgid held # prev_msgid = [] # # prev_msgid_plural = [] # # typecomments = [] # #, fuzzy # msgidcomment = u"" # _: within msgid # msgctxt # msgid = [] # msgstr = [] # Our homegrown way to indicate what must be copied in a shallow # fashion __shallow__ = ['_store'] def __init__(self, source=None, encoding="UTF-8"): pocommon.pounit.__init__(self, source) self._encoding = encodingToUse(encoding) self.obsolete = False self._initallcomments(blankall=True) self._msgctxt = u"" self.target = u"" def _initallcomments(self, blankall=False): """Initialises allcomments""" if blankall: self.othercomments = [] self.automaticcomments = [] self.sourcecomments = [] self.typecomments = [] self.msgidcomment = u"" def getsource(self): return self._source def setsource(self, source): self._rich_source = None # assert isinstance(source, unicode) source = data.forceunicode(source or u"") source = source or u"" if isinstance(source, multistring): self._source = source elif isinstance(source, unicode): self._source = source else: #unicode, list, dict self._source = multistring(source) source = property(getsource, setsource) def gettarget(self): """Returns the unescaped msgstr""" return self._target def settarget(self, target): """Sets the msgstr to the given (unescaped) value""" self._rich_target = None # assert isinstance(target, unicode) # target = data.forceunicode(target) if self.hasplural(): if isinstance(target, multistring): self._target = target else: #unicode, list, dict self._target = multistring(target) elif isinstance(target, (dict, list)): if len(target) == 1: self._target = target[0] else: raise ValueError("po msgid element has no plural but msgstr has %d elements (%s)" % (len(target), target)) else: self._target = target target = property(gettarget, settarget) def getnotes(self, origin=None): """Return comments based on origin value (programmer, developer, source code and translator)""" if origin == None: comments = u"\n".join(self.othercomments) comments += u"\n".join(self.automaticcomments) elif origin == "translator": comments = u"\n".join (self.othercomments) elif origin in ["programmer", "developer", "source code"]: comments = u"\n".join(self.automaticcomments) else: raise ValueError("Comment type not valid") return comments def addnote(self, text, origin=None, position="append"): """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote""" # ignore empty strings and strings without non-space characters if not (text and text.strip()): return text = data.forceunicode(text) commentlist = self.othercomments if origin in ["programmer", "developer", "source code"]: autocomments = True commentlist = self.automaticcomments if text.endswith(u'\n'): text = text[:-1] text = text.split(u"\n") if position == "append": commentlist.extend(text) else: newcomments = text newcomments.extend(commentlist) if autocomments: self.automaticcomments = newcomments else: self.othercomments = newcomments def removenotes(self): """Remove all the translator's notes (other comments)""" self.othercomments = [] def __deepcopy__(self, memo={}): # Make an instance to serve as the copy new_unit = self.__class__() # We'll be testing membership frequently, so make a set from # self.__shallow__ shallow = set(self.__shallow__) # Make deep copies of all members which are not in shallow for key, value in self.__dict__.iteritems(): if key not in shallow: setattr(new_unit, key, copy.deepcopy(value)) # Make shallow copies of all members which are in shallow for key in set(shallow): setattr(new_unit, key, getattr(self, key)) # Mark memo with ourself, so that we won't get deep copied # again memo[id(self)] = self # Return our copied unit return new_unit def copy(self): return copy.deepcopy(self) def _msgidlen(self): if self.hasplural(): len("".join([string.strip() for string in self.source.strings])) else: return len(self.source.strip()) def _msgstrlen(self): if self.hasplural(): len("".join([string.strip() for string in self.target.strings])) else: return len(self.target.strip()) def merge(self, otherpo, overwrite=False, comments=True, authoritative=False): """Merges the otherpo (with the same msgid) into this one. Overwrite non-blank self.msgstr only if overwrite is True merge comments only if comments is True """ def mergelists(list1, list2, split=False): #Split if directed to do so: if split: splitlist1 = [] splitlist2 = [] for item in list1: splitlist1.extend(item.split()) for item in list2: splitlist2.extend(item.split()) list1.extend([item for item in splitlist2 if not item in splitlist1]) else: #Normal merge, but conform to list1 newline style if list1 != list2: for item in list2: # avoid duplicate comment lines (this might cause some problems) if item not in list1 or len(item) < 5: list1.append(item) if not isinstance(otherpo, pounit): super(pounit, self).merge(otherpo, overwrite, comments) return if comments: mergelists(self.othercomments, otherpo.othercomments) mergelists(self.typecomments, otherpo.typecomments) if not authoritative: # We don't bring across otherpo.automaticcomments as we consider ourself # to be the the authority. Same applies to otherpo.msgidcomments mergelists(self.automaticcomments, otherpo.automaticcomments) # mergelists(self.msgidcomments, otherpo.msgidcomments) #XXX? mergelists(self.sourcecomments, otherpo.sourcecomments, split=True) if not self.istranslated() or overwrite: # Remove kde-style comments from the translation (if any). XXX - remove if pocommon.extract_msgid_comment(otherpo.target): otherpo.target = otherpo.target.replace('_: ' + otherpo._extract_msgidcomments()+ '\n', '') self.target = otherpo.target if self.source != otherpo.source or self.getcontext() != otherpo.getcontext(): self.markfuzzy() else: self.markfuzzy(otherpo.isfuzzy()) elif not otherpo.istranslated(): if self.source != otherpo.source: self.markfuzzy() else: if self.target != otherpo.target: self.markfuzzy() def isheader(self): #TODO: fix up nicely return not self.getid() and len(self.target) > 0 def isblank(self): if self.isheader() or self.msgidcomment: return False if (self._msgidlen() == 0) and (self._msgstrlen() == 0) and len(self._msgctxt) == 0: return True return False def hastypecomment(self, typecomment): """Check whether the given type comment is present""" # check for word boundaries properly by using a regular expression... return sum(map(lambda tcline: len(re.findall("\\b%s\\b" % typecomment, tcline)), self.typecomments)) != 0 def hasmarkedcomment(self, commentmarker): """Check whether the given comment marker is present as # (commentmarker) ...""" # raise DeprecationWarning commentmarker = "(%s)" % commentmarker for comment in self.othercomments: if comment.startswith(commentmarker): return True return False def settypecomment(self, typecomment, present=True): """Alters whether a given typecomment is present""" if self.hastypecomment(typecomment) != present: if present: self.typecomments.append("#, %s\n" % typecomment) else: # this should handle word boundaries properly ... typecomments = map(lambda tcline: re.sub("\\b%s\\b[ \t,]*" % typecomment, "", tcline), self.typecomments) self.typecomments = filter(lambda tcline: tcline.strip() != "#,", typecomments) def istranslated(self): return super(pounit, self).istranslated() and not self.isobsolete() def istranslatable(self): return not (self.isheader() or self.isblank() or self.obsolete) def isfuzzy(self): return self.hastypecomment("fuzzy") def markfuzzy(self, present=True): self.settypecomment("fuzzy", present) def isobsolete(self): return self.obsolete def makeobsolete(self): """Makes this unit obsolete""" self.obsolete = True self.sourcecomments = [] self.automaticcomments = [] def resurrect(self): """Makes an obsolete unit normal""" self.obsolete = False def hasplural(self): """returns whether this pounit contains plural strings...""" source = self.source return isinstance(source, multistring) and len(source.strings) > 1 def parse(self, src): raise DeprecationWarning("Should not be parsing with a unit") return poparser.parse_unit(poparser.ParseState(cStringIO.StringIO(src), pounit), self) def __str__(self): """convert to a string. double check that unicode is handled somehow here""" _cpo_unit = cpo.pounit.buildfromunit(self) return str(_cpo_unit) def getlocations(self): """Get a list of locations from sourcecomments in the PO unit rtype: List return: A list of the locations with '#: ' stripped """ #TODO: rename to .locations return self.sourcecomments def addlocation(self, location): """Add a location to sourcecomments in the PO unit @param location: Text location e.g. 'file.c:23' does not include #: @type location: String """ self.sourcecomments.extend(location.split()) def _extract_msgidcomments(self, text=None): """Extract KDE style msgid comments from the unit. @rtype: String @return: Returns the extracted msgidcomments found in this unit's msgid. """ if text: return pocommon.extract_msgid_comment(text) else: return self.msgidcomment def getcontext(self): """Get the message context.""" return self._msgctxt + self.msgidcomment def getid(self): """Returns a unique identifier for this unit.""" context = self.getcontext() # Gettext does not consider the plural to determine duplicates, only # the msgid. For generation of .mo files, we might want to use this # code to generate the entry for the hash table, but for now, it is # commented out for conformance to gettext. # id = '\0'.join(self.source.strings) id = self.source if self.msgidcomment: id = u"_: %s\n%s" % (context, id) elif context: id = u"%s\04%s" % (context, id) return id def buildfromunit(cls, unit): """Build a native unit from a foreign unit, preserving as much information as possible.""" if type(unit) == cls and hasattr(unit, "copy") and callable(unit.copy): return unit.copy() elif isinstance(unit, pocommon.pounit): newunit = cls(unit.source) newunit.target = unit.target #context newunit.msgidcomment = unit._extract_msgidcomments() if not newunit.msgidcomment: newunit._msgctxt = unit.getcontext() locations = unit.getlocations() if locations: newunit.addlocations(locations) notes = unit.getnotes("developer") if notes: newunit.addnote(notes, "developer") notes = unit.getnotes("translator") if notes: newunit.addnote(notes, "translator") if unit.isobsolete(): newunit.makeobsolete() newunit.markfuzzy(unit.isfuzzy()) for tc in ['python-format', 'c-format', 'php-format']: if unit.hastypecomment(tc): newunit.settypecomment(tc) break return newunit else: return base.TranslationUnit.buildfromunit(unit) buildfromunit = classmethod(buildfromunit) class pofile(pocommon.pofile): """A .po file containing various units""" UnitClass = pounit def __init__(self, inputfile=None, encoding=None, unitclass=pounit): """Construct a pofile, optionally reading in from inputfile. encoding can be specified but otherwise will be read from the PO header""" self.UnitClass = unitclass pocommon.pofile.__init__(self, unitclass=unitclass) self.units = [] self.filename = '' self._encoding = encodingToUse(encoding) if inputfile is not None: self.parse(inputfile) def changeencoding(self, newencoding): """Deprecated: changes the encoding on the file.""" # This should not be here but in poheader. It also shouldn't mangle the # header itself, but use poheader methods. All users are removed, so # we can deprecate after one release. raise DeprecationWarning self._encoding = encodingToUse(newencoding) if not self.units: return header = self.header() if not header or header.isblank(): return charsetline = None headerstr = header.target for line in headerstr.split("\n"): if not ":" in line: continue key, value = line.strip().split(":", 1) if key.strip() != "Content-Type": continue charsetline = line if charsetline is None: headerstr += "Content-Type: text/plain; charset=%s" % self._encoding else: charset = re.search("charset=([^ ]*)", charsetline) if charset is None: newcharsetline = charsetline if not newcharsetline.strip().endswith(";"): newcharsetline += ";" newcharsetline += " charset=%s" % self._encoding else: charset = charset.group(1) newcharsetline = charsetline.replace("charset=%s" % charset, "charset=%s" % self._encoding, 1) headerstr = headerstr.replace(charsetline, newcharsetline, 1) header.target = headerstr def _build_self_from_cpo(self): """Builds up this store from the internal cpo store. A user must ensure that self._cpo_store already exists, and that it is deleted afterwards.""" for unit in self._cpo_store.units: self.addunit(self.UnitClass.buildfromunit(unit)) self._encoding = self._cpo_store._encoding def _build_cpo_from_self(self): """Builds the internal cpo store from the data in self. A user must ensure that self._cpo_store does not exist, and should delete it after using it.""" self._cpo_store = cpo.pofile() for unit in self.units: if not unit.isblank(): self._cpo_store.addunit(cpo.pofile.UnitClass.buildfromunit(unit)) if not self._cpo_store.header(): #only add a temporary header self._cpo_store.makeheader(charset="utf-8", encoding="8bit") def parse(self, input): """Parses the given file or file source string.""" try: if hasattr(input, 'name'): self.filename = input.name elif not getattr(self, 'filename', ''): self.filename = '' tmp_header_added = False # if isinstance(input, str) and '"Content-Type: text/plain; charset=' not in input[:200]: # input = basic_header + input # tmp_header_added = True self._cpo_store = cpo.pofile(input) self._build_self_from_cpo() del self._cpo_store if tmp_header_added: self.units = self.units[1:] except Exception, e: raise base.ParseError(e) def removeduplicates(self, duplicatestyle="merge"): """Make sure each msgid is unique ; merge comments etc from duplicates into original""" # TODO: can we handle consecutive calls to removeduplicates()? What # about files already containing msgctxt? - test id_dict = {} uniqueunits = [] # TODO: this is using a list as the pos aren't hashable, but this is slow. # probably not used frequently enough to worry about it, though. markedpos = [] def addcomment(thepo): thepo.msgidcomment = " ".join(thepo.getlocations()) markedpos.append(thepo) for thepo in self.units: id = thepo.getid() if thepo.isheader() and not thepo.getlocations(): # header msgids shouldn't be merged... uniqueunits.append(thepo) elif id in id_dict: if duplicatestyle == "merge": if id: id_dict[id].merge(thepo) else: addcomment(thepo) uniqueunits.append(thepo) elif duplicatestyle == "msgctxt": origpo = id_dict[id] if origpo not in markedpos: origpo._msgctxt += " ".join(origpo.getlocations()) markedpos.append(thepo) thepo._msgctxt += " ".join(thepo.getlocations()) uniqueunits.append(thepo) else: if not id: if duplicatestyle == "merge": addcomment(thepo) else: thepo._msgctxt += u" ".join(thepo.getlocations()) id_dict[id] = thepo uniqueunits.append(thepo) self.units = uniqueunits def __str__(self): """Convert to a string. double check that unicode is handled somehow here""" self._cpo_store = cpo.pofile(encoding=self._encoding) self._build_cpo_from_self() output = str(self._cpo_store) del self._cpo_store return output