#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""
Classes that hold units of .oo files (oounit) or entire files (oofile).

These are specific .oo files for localisation exported by OpenOffice.org - SDF
format (previously known as GSI files). For an overview of the format, see
U{http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html}

The behaviour in terms of escaping is explained in detail in the programming
comments.
"""
# FIXME: add simple test which reads in a file and writes it out again

import os
import re
from translate.misc import quote
from translate.misc import wStringIO
import warnings

# File normalisation

normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# 256-character translation table for byte strings: every character that is
# not in normalfilenamechars maps to "_".
normalizetable = ""
for i in map(chr, range(256)):
    if i in normalfilenamechars:
        normalizetable += i
    else:
        normalizetable += "_"


class unormalizechar(dict):
    """Translation mapping for unicode strings.

    Looking up the ordinal of a character listed in C{normalchars} returns
    that character; every other key returns C{u"_"}.
    """

    def __init__(self, normalchars):
        self.normalchars = {}
        for char in normalchars:
            self.normalchars[ord(char)] = char

    def __getitem__(self, key):
        return self.normalchars.get(key, u"_")

unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))


def normalizefilename(filename):
    """converts any non-alphanumeric (standard roman) characters to _

    Byte strings are translated with L{normalizetable}; anything else
    (i.e. unicode strings) with L{unormalizetable}.
    """
    if isinstance(filename, str):
        return filename.translate(normalizetable)
    else:
        return filename.translate(unormalizetable)


def makekey(ookey, long_keys):
    """converts an oo key tuple into a unique identifier

    @param ookey: an oo key
    @type ookey: tuple
    @param long_keys: Use long keys
    @type long_keys: Boolean
    @rtype: str
    @return: unique ascii identifier
    """
    project, sourcefile, resourcetype, groupid, localid, platform = ookey
    sourcefile = sourcefile.replace('\\', '/')
    if long_keys:
        sourcebase = os.path.join(project, sourcefile)
    else:
        # short keys only use the last path component of the source file
        sourceparts = sourcefile.split('/')
        sourcebase = "".join(sourceparts[-1:])
    # only insert the "." separator when both ids are present
    if len(groupid) == 0 or len(localid) == 0:
        fullid = groupid + localid
    else:
        fullid = groupid + "." + localid
    if resourcetype:
        fullid = fullid + "." + resourcetype
    key = "%s#%s" % (sourcebase, fullid)
    return normalizefilename(key)

# These are functions that deal with escaping and unescaping of the text fields
# of the SDF file. These should only be applied to the text column.
# The fields quickhelptext and title are assumed to carry no escaping.
#
# The escaping of all strings except those coming from .xhp (helpcontent2)
# sourcefiles work as follows:
#   (newline)         ->  \n
#   (carriage return) ->  \r
#   (tab)             ->  \t
# Backslash characters (\) and single quotes (') are not consistently escaped,
# and are therefore left as they are.
#
# For strings coming from .xhp (helpcontent2) sourcefiles the following
# characters are escaped inside XML tags only:
#   <  ->  \<  when used with lowercase tagnames (with some exceptions)
#   >  ->  \>  when used with lowercase tagnames (with some exceptions)
#   "  ->  \"  around XML properties
# The following is consistently escaped in .xhp strings (not only in XML tags):
#   \  ->  \\


def escape_text(text):
    """Escapes unit text to be suitable for writing to the SDF file.

    Newline, tab and carriage return become the two-character sequences
    C{\\n}, C{\\t} and C{\\r}.
    """
    return text.replace("\n", "\\n").replace("\t", "\\t").replace("\r", "\\r")


def unescape_text(text):
    """Unescapes SDF text to be suitable for unit consumption.

    Escaped backslashes (C{\\\\}) are parked as C{\\a} while the other
    sequences are unescaped, so that e.g. C{\\\\n} is not mistaken for an
    escaped newline; they are restored unchanged afterwards.
    """
    return text.replace("\\\\", "\a").replace("\\n", "\n").replace("\\t", "\t").\
           replace("\\r", "\r").replace("\a", "\\\\")

# Matches a lowercase XML/HTML-style tag, optionally closing or self-closing,
# with at most one lowercase attribute. Raw string so that "\-" reaches the
# regex engine as written instead of being an invalid string escape.
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')


def escape_help_text(text):
    """Escapes the help text as it would be in an SDF file.

    <, >, " are only escaped in <[[:lower:]]> tags. Some HTML tags make it in in
    lowercase so those are dealt with. Some OpenOffice.org help tags are not
    escaped.
    """
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        escapethistag = False
        for escape_tag in ["ahelp", "link", "item", "emph", "defaultinline", "switchinline", "caseinline", "variable", "bookmark_value", "image", "embedvar", "alt"]:
            # match both the opening form (possibly with an attribute) and the
            # exact closing form of each known help tag
            if tag.startswith("<%s" % escape_tag) or tag == "</%s>" % escape_tag:
                escapethistag = True
        # self-closing tags that are escaped as well
        if tag in ["<br/>", "<help-id-missing/>"]:
            escapethistag = True
        if escapethistag:
            escaped_tag = ("\\<" + tag[1:-1] + "\\>").replace('"', '\\"')
            text = text.replace(tag, escaped_tag)
    return text


def unescape_help_text(text):
    """Unescapes SDF help text to be suitable for unit consumption.

    Reverses the C{\\<}, C{\\>}, C{\\"} and C{\\\\} escapes, in that order.
    """
    return text.replace(r"\<", "<").replace(r"\>", ">").replace(r'\"', '"').replace(r"\\", "\\")


def encode_if_needed_utf8(text):
    """Encode a Unicode string to UTF-8; other values are returned as is"""
    if isinstance(text, unicode):
        return text.encode('UTF-8')
    return text


class ooline(object):
    """this represents one line, one translation in an .oo file"""

    def __init__(self, parts=None):
        """construct an ooline from its parts"""
        if parts is None:
            # note: self.text is a property, so this also initialises _text
            # (sourcefile is assigned before text, which settext relies on)
            self.project, self.sourcefile, self.dummy, self.resourcetype, \
                self.groupid, self.localid, self.helpid, self.platform, \
                self.width, self.languageid, self.text, self.helptext, \
                self.quickhelptext, self.title, self.timestamp = [""] * 15
        else:
            self.setparts(parts)

    def setparts(self, parts):
        """create a line from its tab-delimited parts

        A line that does not have exactly 15 parts triggers a warning and is
        padded with empty strings or truncated to 15 parts.
        """
        if len(parts) != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
                (len(parts), parts))
            newparts = list(parts)
            if len(newparts) < 15:
                newparts = newparts + [""] * (15-len(newparts))
            else:
                newparts = newparts[:15]
            parts = tuple(newparts)
        # the text column is stored raw (still escaped) in _text
        self.project, self.sourcefile, self.dummy, self.resourcetype, \
            self.groupid, self.localid, self.helpid, self.platform, \
            self.width, self.languageid, self._text, self.helptext, \
            self.quickhelptext, self.title, self.timestamp = parts

    def getparts(self):
        """return a tuple of the 15 parts in this line (text still escaped)"""
        return (self.project, self.sourcefile, self.dummy, self.resourcetype,
                self.groupid, self.localid, self.helpid, self.platform,
                self.width, self.languageid, self._text, self.helptext,
                self.quickhelptext, self.title, self.timestamp)

    def gettext(self):
        """Obtains the text column and handle escaping."""
        if self.sourcefile.endswith(".xhp"):
            return unescape_help_text(self._text)
        else:
            return unescape_text(self._text)

    def settext(self, text):
        """Sets the text column and handle escaping."""
        if self.sourcefile.endswith(".xhp"):
            self._text = escape_help_text(text)
        else:
            self._text = escape_text(text)
    text = property(gettext, settext)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """return a line in tab-delimited form"""
        parts = self.getparts()
        return "\t".join(parts)

    def getkey(self):
        """get the key that identifies the resource"""
        return (self.project, self.sourcefile, self.resourcetype, self.groupid,
                self.localid, self.platform)


class oounit:
    """this represents a number of translations of a resource"""

    def __init__(self):
        """construct the oounit"""
        # languageid -> ooline (a later line with the same id replaces it)
        self.languages = {}
        # all lines in the order they were added
        self.lines = []

    def addline(self, line):
        """add a line to the oounit"""
        self.languages[line.languageid] = line
        self.lines.append(line)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """return the lines in tab-delimited form"""
        return "\r\n".join([str(line) for line in self.lines])


class oofile:
    """this represents an entire .oo file"""
    UnitClass = oounit

    def __init__(self, input=None):
        """constructs the oofile"""
        self.oolines = []
        self.units = []
        # key tuple (see ooline.getkey) -> unit holding all lines with that key
        self.ookeys = {}
        self.filename = ""
        # language ids in order of first appearance
        self.languages = []
        if input is not None:
            self.parse(input)

    def addline(self, thisline):
        """adds a parsed line to the file"""
        key = thisline.getkey()
        element = self.ookeys.get(key, None)
        if element is None:
            # first line for this resource: start a new unit
            element = self.UnitClass()
            self.units.append(element)
            self.ookeys[key] = element
        element.addline(thisline)
        self.oolines.append(thisline)
        if thisline.languageid not in self.languages:
            self.languages.append(thisline.languageid)

    def parse(self, input):
        """parses lines and adds them to the file

        C{input} is either a file-like object (it is read completely and then
        closed) or a string holding the file contents.
        """
        if not self.filename:
            self.filename = getattr(input, 'name', '')
        if hasattr(input, "read"):
            # close the input even if reading fails
            try:
                src = input.read()
            finally:
                input.close()
        else:
            src = input
        for line in src.split("\n"):
            line = quote.rstripeol(line)
            if not line:
                continue
            parts = line.split("\t")
            thisline = ooline(parts)
            self.addline(thisline)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """converts all the lines back to tab-delimited form"""
        lines = []
        for oe in self.units:
            if len(oe.lines) > 2:
                warnings.warn("contains %d lines (should be 2 at most): languages %r" % (len(oe.lines), oe.languages))
                oekeys = [line.getkey() for line in oe.lines]
                warnings.warn("contains %d lines (should be 2 at most): keys %r" % (len(oe.lines), oekeys))
            oeline = str(oe) + "\r\n"
            lines.append(oeline)
        return "".join(lines)


class oomultifile:
    """this takes a huge GSI file and represents it as multiple smaller files..."""

    def __init__(self, filename, mode=None, multifilestyle="single"):
        """initialises oomultifile from a seekable inputfile or writable outputfile"""
        self.filename = filename
        if mode is None:
            # read an existing file, otherwise start writing a new one
            if os.path.exists(filename):
                mode = 'r'
            else:
                mode = 'w'
        self.mode = mode
        self.multifilestyle = multifilestyle
        self.multifilename = os.path.splitext(filename)[0]
        self.multifile = open(filename, mode)
        # subfile name -> list of (0-based) line numbers in the multifile
        self.subfilelines = {}
        if mode == "r":
            self.createsubfileindex()

    def createsubfileindex(self):
        """reads in all the lines and works out the subfiles"""
        for linenum, line in enumerate(self.multifile):
            subfile = self.getsubfilename(line)
            if subfile not in self.subfilelines:
                self.subfilelines[subfile] = []
            self.subfilelines[subfile].append(linenum)

    def getsubfilename(self, line):
        """looks up the subfile name for the line

        Raises ValueError if the line has fewer than two tabs.
        """
        if line.count("\t") < 2:
            raise ValueError("invalid tab-delimited line: %r" % line)
        lineparts = line.split("\t", 2)
        module, filename = lineparts[0], lineparts[1]
        if self.multifilestyle == "onefile":
            ooname = self.multifilename
        elif self.multifilestyle == "toplevel":
            ooname = module
        else:
            # any other style: one subfile per source directory
            filename = filename.replace("\\", "/")
            fileparts = [module] + filename.split("/")
            ooname = os.path.join(*fileparts[:-1])
        return ooname + os.extsep + "oo"

    def listsubfiles(self):
        """returns a list of subfiles in the file"""
        return self.subfilelines.keys()

    def __iter__(self):
        """iterates through the subfile names"""
        for subfile in self.listsubfiles():
            yield subfile

    def __contains__(self, pathname):
        """checks if this pathname is a valid subfile"""
        return pathname in self.subfilelines

    def getsubfilesrc(self, subfile):
        """returns the concatenated source lines matching the subfile"""
        lines = []
        # dict used as a set for fast line number membership tests
        requiredlines = dict.fromkeys(self.subfilelines[subfile])
        linenum = 0
        self.multifile.seek(0)
        for line in self.multifile:
            if linenum in requiredlines:
                lines.append(line)
            linenum += 1
        return "".join(lines)

    def openinputfile(self, subfile):
        """returns a pseudo-file object for the given subfile"""
        subfilesrc = self.getsubfilesrc(subfile)
        inputfile = wStringIO.StringIO(subfilesrc)
        inputfile.filename = subfile
        return inputfile

    def openoutputfile(self, subfile):
        """returns a pseudo-file object for the given subfile"""
        def onclose(contents):
            # callback handed to CatchStringOutput: writes the collected
            # contents into the multifile
            self.multifile.write(contents)
            self.multifile.flush()
        outputfile = wStringIO.CatchStringOutput(onclose)
        outputfile.filename = subfile
        return outputfile

    def getoofile(self, subfile):
        """returns an oofile built up from the given subfile's lines"""
        subfilesrc = self.getsubfilesrc(subfile)
        oosubfile = oofile()
        oosubfile.filename = subfile
        oosubfile.parse(subfilesrc)
        return oosubfile