#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""
Classes that hold units of .oo files (oounit) or entire files (oofile).
These are specific .oo files for localisation exported by OpenOffice.org - SDF
format (previously known as GSI files). For an overview of the format, see
U{http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html}
The behaviour in terms of escaping is explained in detail in the programming
comments.
"""
# FIXME: add simple test which reads in a file and writes it out again
import os
import re
from translate.misc import quote
from translate.misc import wStringIO
import warnings
# File normalisation
# The only characters that survive filename normalisation unchanged.
normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
# 256-entry translation table for byte strings: a normal character maps to
# itself, every other character maps to "_".
_tablechars = []
for i in map(chr, range(256)):
    if i not in normalfilenamechars:
        _tablechars.append("_")
    else:
        _tablechars.append(i)
normalizetable = "".join(_tablechars)
del _tablechars
class unormalizechar(dict):
    """Translation mapping used to normalise unicode strings.

    Looking up the code point of one of the normal characters gives that
    character back; looking up anything else gives u"_".
    """

    def __init__(self, normalchars):
        """Remember the normal characters, keyed by their code points.

        @param normalchars: the characters that must be left unchanged
        """
        # The lookup data lives in an attribute; the dict base itself
        # stays empty.
        self.normalchars = dict([(ord(char), char) for char in normalchars])

    def __getitem__(self, key):
        """Return the replacement character for the code point key."""
        return self.normalchars.get(key, u"_")
# Translation mapping for unicode strings, the counterpart of normalizetable
# (which is a plain 256 character table for byte strings). The normal
# characters are decoded first so that the mapping yields unicode characters.
unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))
def normalizefilename(filename):
    """Replace every character that is not a normal filename character by _.

    The characters listed in normalfilenamechars are kept, everything else
    becomes an underscore. Both byte strings and unicode strings are handled.

    @param filename: the name to normalise
    @return: the normalised name
    """
    if not isinstance(filename, str):
        # unicode strings are translated through a mapping, not a table
        return filename.translate(unormalizetable)
    return filename.translate(normalizetable)
def makekey(ookey, long_keys):
    """Build a unique identifier from an oo key tuple.

    The identifier consists of the source file (with the project and full
    path for long keys, otherwise just the last path component), a "#" and
    the group id, local id and resource type joined by dots. The result is
    passed through normalizefilename.

    @param ookey: an oo key: (project, sourcefile, resourcetype, groupid,
        localid, platform)
    @type ookey: tuple
    @param long_keys: Use long keys
    @type long_keys: Boolean
    @rtype: str
    @return: unique ascii identifier
    """
    project, sourcefile, resourcetype, groupid, localid, platform = ookey
    # source files are recorded with backslashes as path separators
    sourcefile = sourcefile.replace('\\', '/')
    if long_keys:
        sourcebase = os.path.join(project, sourcefile)
    else:
        sourcebase = sourcefile.split('/')[-1]
    if groupid and localid:
        fullid = "%s.%s" % (groupid, localid)
    else:
        # at most one of the two is present, so no separator is needed
        fullid = groupid + localid
    if resourcetype:
        fullid = "%s.%s" % (fullid, resourcetype)
    return normalizefilename("%s#%s" % (sourcebase, fullid))
# These are functions that deal with escaping and unescaping of the text fields
# of the SDF file. These should only be applied to the text column.
# The fields quickhelptext and title are assumed to carry no escaping.
#
# The escaping of all strings except those coming from .xhp (helpcontent2)
# sourcefiles works as follows:
# (newline) -> \n
# (carriage return) -> \r
# (tab) -> \t
# Backslash characters (\) and single quotes (') are not consistently escaped,
# and are therefore left as they are.
#
# For strings coming from .xhp (helpcontent2) sourcefiles the following
# characters are escaped inside XML tags only:
# < -> \< when used with lowercase tagnames (with some exceptions)
# > -> \> when used with lowercase tagnames (with some exceptions)
# " -> \" around XML properties
# The following is consistently escaped in .xhp strings (not only in XML tags):
# \ -> \\
def escape_text(text):
    """Escape unit text so that it can be stored in the text column of an
    SDF file.

    Newline, tab and carriage return characters are replaced by their two
    character backslash sequences; nothing else is changed.

    @param text: the unescaped text
    @return: the text as it should appear in the SDF file
    """
    for char, escaped in (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        text = text.replace(char, escaped)
    return text
# A backslash followed by a backslash, n, t or r. Scanning left to right
# pairs up doubled backslashes before a following escape letter is looked at.
_unescapere = re.compile(r"\\[\\ntr]")
# What each matched sequence turns into. A doubled backslash is deliberately
# kept as it is, since backslashes are not consistently escaped in SDF files.
_unescapes = {"\\\\": "\\\\", "\\n": "\n", "\\t": "\t", "\\r": "\r"}


def unescape_text(text):
    """Unescape the text column of an SDF file for use in a unit.

    The two character sequences backslash-n, backslash-t and backslash-r
    become a newline, tab and carriage return. A doubled backslash is left
    untouched and protects the character following it, so that two
    backslashes followed by "n" are not read as a newline.

    The text is handled in a single pass, so no placeholder character is
    needed and a BEL character that occurs in the text is preserved.

    @param text: the text as found in the SDF file
    @return: the unescaped text
    """
    return _unescapere.sub(lambda match: _unescapes[match.group(0)], text)
# Matches anything that looks like a lowercase XML tag: "<", an optional
# slash, a name made of lowercase letters, "_" or "-", optional
# attribute="value" text and an optional slash before the closing ">".
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')


def escape_help_text(text):
    """Escape help text the way it is stored in an SDF file.

    Every backslash is doubled. The characters <, > and " are escaped with a
    backslash, but only inside lowercase tags that are known to be escaped
    in SDF files; other tags are left as they are.

    @param text: the unescaped help text
    @return: the text as it should appear in the SDF file
    """
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        escapethistag = False
        for escape_tag in ["ahelp", "link", "item", "emph", "defaultinline", "switchinline", "caseinline", "variable", "bookmark_value", "image", "embedvar", "alt"]:
            # opening tags may carry attributes, closing tags never do
            if tag.startswith("<%s" % escape_tag) or tag == "</%s>" % escape_tag:
                escapethistag = True
                break
        # NOTE(review): the two literals below and the "</%s>" comparison
        # above were damaged in this copy of the file (markup-like text was
        # stripped, leaving an unterminated string and a comparison that
        # could never match). They are restored to what the surrounding
        # logic implies; confirm against the project's version history.
        if tag in ["<br/>", "<help-id-missing/>"]:
            escapethistag = True
        if escapethistag:
            # escape the angle brackets and any quotes inside the tag
            escaped_tag = ("\\<" + tag[1:-1] + "\\>").replace('"', '\\"')
            text = text.replace(tag, escaped_tag)
    return text
def unescape_help_text(text):
    """Unescape help text from an SDF file for use in a unit.

    The backslash is removed from escaped <, > and " characters and doubled
    backslashes are reduced to a single one.

    @param text: the help text as found in the SDF file
    @return: the unescaped text
    """
    # The doubled backslash has to come last, otherwise the backslash it
    # leaves behind could be mistaken for the start of another escape.
    for escaped, char in ((r"\<", "<"), (r"\>", ">"), (r'\"', '"'), (r"\\", "\\")):
        text = text.replace(escaped, char)
    return text
def encode_if_needed_utf8(text):
    """Encode a unicode string as UTF-8.

    Anything that is not a unicode string is returned unchanged.
    """
    if not isinstance(text, unicode):
        return text
    return text.encode('UTF-8')
class ooline(object):
    """One line of an .oo file, i.e. one translation of one resource.

    The fifteen tab-delimited columns of the line are available as
    attributes. The text column is stored the way it appears in the file (in
    _text); the text property gives access to the unescaped value.
    """

    def __init__(self, parts=None):
        """Construct an ooline from its parts.

        @param parts: the columns of the line; if omitted all columns start
            out empty
        """
        if parts is not None:
            self.setparts(parts)
            return
        (self.project, self.sourcefile, self.dummy, self.resourcetype,
         self.groupid, self.localid, self.helpid, self.platform,
         self.width, self.languageid, self.text, self.helptext,
         self.quickhelptext, self.title, self.timestamp) = [""] * 15

    def setparts(self, parts):
        """Set all the columns of the line from its tab-delimited parts.

        A warning is issued if there are not exactly 15 parts; missing
        columns are left empty and surplus parts are dropped.
        """
        numparts = len(parts)
        if numparts != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
                          (numparts, parts))
        columns = list(parts)[:15]
        columns.extend([""] * (15 - len(columns)))
        # the text column is stored as is, without touching its escaping
        (self.project, self.sourcefile, self.dummy, self.resourcetype,
         self.groupid, self.localid, self.helpid, self.platform,
         self.width, self.languageid, self._text, self.helptext,
         self.quickhelptext, self.title, self.timestamp) = tuple(columns)

    def getparts(self):
        """Return a tuple of the columns of this line, in file order."""
        return (
            self.project, self.sourcefile, self.dummy, self.resourcetype,
            self.groupid, self.localid, self.helpid, self.platform,
            self.width, self.languageid, self._text, self.helptext,
            self.quickhelptext, self.title, self.timestamp,
        )

    def gettext(self):
        """Return the text column with its escaping removed."""
        # help (.xhp) sources follow their own escaping rules
        if self.sourcefile.endswith(".xhp"):
            unescape = unescape_help_text
        else:
            unescape = unescape_text
        return unescape(self._text)

    def settext(self, text):
        """Store the given text in the text column, escaping it as needed."""
        if self.sourcefile.endswith(".xhp"):
            escape = escape_help_text
        else:
            escape = escape_text
        self._text = escape(text)

    text = property(gettext, settext)

    def __str__(self):
        """Convert to a string, making sure that unicode is encoded."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return the line in tab-delimited form."""
        return "\t".join(self.getparts())

    def getkey(self):
        """Return the key (a tuple) that identifies the resource."""
        return (self.project, self.sourcefile, self.resourcetype,
                self.groupid, self.localid, self.platform)
class oounit:
    """All the translations (lines) of a single resource."""

    def __init__(self):
        """Construct an oounit without any lines."""
        # the most recently added line for each language id
        self.languages = {}
        # every line in the order in which it was added
        self.lines = []

    def addline(self, line):
        """Add a line to the oounit, registering it under its language id."""
        self.languages[line.languageid] = line
        self.lines.append(line)

    def __str__(self):
        """Convert to a string, making sure that unicode is encoded."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Return the lines in tab-delimited form, one per output line."""
        return "\r\n".join(map(str, self.lines))
class oofile:
    """An entire .oo file: all its lines, grouped into units by key."""
    UnitClass = oounit

    def __init__(self, input=None):
        """Construct the oofile, parsing the input if any is given.

        @param input: a file-like object or a string with the file contents
        """
        self.oolines = []
        self.units = []
        self.ookeys = {}
        self.filename = ""
        self.languages = []
        if input is not None:
            self.parse(input)

    def addline(self, thisline):
        """Add a parsed line to the file.

        Lines that share a key end up in the same unit; a new unit is
        created the first time a key is seen.
        """
        key = thisline.getkey()
        if key in self.ookeys:
            unit = self.ookeys[key]
        else:
            unit = self.UnitClass()
            self.units.append(unit)
            self.ookeys[key] = unit
        unit.addline(thisline)
        self.oolines.append(thisline)
        languageid = thisline.languageid
        if languageid not in self.languages:
            self.languages.append(languageid)

    def parse(self, input):
        """Parse the lines of the input and add them to the file.

        @param input: a file-like object (which is read completely and then
            closed) or a string with the contents
        """
        if not self.filename:
            self.filename = getattr(input, 'name', '')
        if hasattr(input, "read"):
            src = input.read()
            input.close()
        else:
            src = input
        for line in src.split("\n"):
            line = quote.rstripeol(line)
            if line:
                # empty lines are skipped, all others become an ooline
                self.addline(ooline(line.split("\t")))

    def __str__(self):
        """Convert to a string, making sure that unicode is encoded."""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """Convert all the units back to tab-delimited form.

        Warnings are issued for every unit that holds more than two lines.
        """
        output = []
        for unit in self.units:
            numlines = len(unit.lines)
            if numlines > 2:
                warnings.warn("contains %d lines (should be 2 at most): languages %r" % (numlines, unit.languages))
                unitkeys = [line.getkey() for line in unit.lines]
                warnings.warn("contains %d lines (should be 2 at most): keys %r" % (numlines, unitkeys))
            output.append(str(unit) + "\r\n")
        return "".join(output)
class oomultifile:
    """Presents one huge GSI file as a collection of smaller subfiles."""

    def __init__(self, filename, mode=None, multifilestyle="single"):
        """Initialise the oomultifile from a seekable input file or a
        writable output file.

        @param filename: the name of the GSI file
        @param mode: the mode to open the file with; if omitted an existing
            file is read and a missing one is written
        @param multifilestyle: how lines are assigned to subfiles: "onefile",
            "toplevel" or anything else for one subfile per source directory
        """
        self.filename = filename
        if mode is None:
            if os.path.exists(filename):
                mode = 'r'
            else:
                mode = 'w'
        self.mode = mode
        self.multifilestyle = multifilestyle
        self.multifilename = os.path.splitext(filename)[0]
        self.multifile = open(filename, mode)
        # maps a subfile name to the numbers of the lines that belong to it
        self.subfilelines = {}
        if mode == "r":
            self.createsubfileindex()

    def createsubfileindex(self):
        """Read in all the lines and record which subfile each belongs to."""
        for linenum, line in enumerate(self.multifile):
            subfile = self.getsubfilename(line)
            self.subfilelines.setdefault(subfile, []).append(linenum)

    def getsubfilename(self, line):
        """Work out the name of the subfile that the line belongs to.

        @param line: a tab-delimited line
        @return: the subfile name, ending in the "oo" extension
        @raise ValueError: if the line contains fewer than two tabs
        """
        if line.count("\t") < 2:
            raise ValueError("invalid tab-delimited line: %r" % line)
        module, filename = line.split("\t", 2)[:2]
        style = self.multifilestyle
        if style == "onefile":
            ooname = self.multifilename
        elif style == "toplevel":
            ooname = module
        else:
            # the module followed by the directory part of the source file
            dirparts = filename.replace("\\", "/").split("/")[:-1]
            ooname = os.path.join(module, *dirparts)
        return ooname + os.extsep + "oo"

    def listsubfiles(self):
        """Return the names of the subfiles in the file."""
        return self.subfilelines.keys()

    def __iter__(self):
        """Iterate through the subfile names."""
        for subfile in self.listsubfiles():
            yield subfile

    def __contains__(self, pathname):
        """Check whether this pathname is a known subfile."""
        return pathname in self.subfilelines

    def getsubfilesrc(self, subfile):
        """Return the lines that belong to the subfile as a single string."""
        wanted = dict.fromkeys(self.subfilelines[subfile])
        # start again from the top: earlier reads leave the file at its end
        self.multifile.seek(0)
        return "".join([line for linenum, line in enumerate(self.multifile)
                        if linenum in wanted])

    def openinputfile(self, subfile):
        """Return a pseudo-file object to read the given subfile from."""
        inputfile = wStringIO.StringIO(self.getsubfilesrc(subfile))
        inputfile.filename = subfile
        return inputfile

    def openoutputfile(self, subfile):
        """Return a pseudo-file object to write the given subfile to.

        Whatever was written is passed on to the multifile when the
        pseudo-file is closed.
        """
        def onclose(contents):
            self.multifile.write(contents)
            self.multifile.flush()
        outputfile = wStringIO.CatchStringOutput(onclose)
        outputfile.filename = subfile
        return outputfile

    def getoofile(self, subfile):
        """Return an oofile built up from the lines of the given subfile."""
        oosubfile = oofile()
        oosubfile.filename = subfile
        oosubfile.parse(self.getsubfilesrc(subfile))
        return oosubfile