#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2007 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Manage the Wordfast Translation Memory format

Wordfast TM format is the Translation Memory format used by the
U{Wordfast} computer aided translation tool.

It is a bilingual base class derived format with L{WordfastTMFile}
and L{WordfastUnit} providing file and unit level access.

Wordfast tools
==============
Wordfast is a computer aided translation tool. It is an application built on
top of Microsoft Word and is implemented as a rather sophisticated set of
macros. Understanding that helps us understand many of the seemingly strange
choices around this format including: encoding, escaping and file naming.

Implementation
==============
The implementation covers the full requirements of a Wordfast TM file.
The files are simple Tab Separated Value (TSV) files that can be read
by Microsoft Excel and other spreadsheet programs. They use the .txt
extension which does make it more difficult to automatically identify
such files.

The dialect of the TSV files is specified by L{WordfastDialect}.

Encoding
--------
The files are UTF-16 or ISO-8859-1 (Latin1) encoded. These choices
are most likely because Microsoft Word is the base editing tool for Wordfast.

The format is tab separated so we are able to detect UTF-16 vs Latin-1
by searching for the occurrence of a UTF-16 tab character and then
continuing with the parsing.

Timestamps
----------
L{WordfastTime} allows for the correct management of the Wordfast
YYYYMMDD~HHMMSS timestamps. However, timestamps on individual units are
not updated when edited.

Header
------
L{WordfastHeader} provides header management support. The header
functionality is fully implemented through observing the behaviour of the
files in real use cases, input from the Wordfast programmers and
public documentation.

Escaping
--------
Wordfast TM implements a form of escaping that covers two aspects:
  1. Placeable: bold, formatting, etc. These are left as is and ignored.
     It is up to the editor and future placeable implementation to manage
     these.
  2. Escapes: items that may confuse Excel or translators are
     escaped as &'XX;. These are fully implemented and are converted to
     and from Unicode. By observing behaviour and reading documentation
     we were able to observe all possible escapes. Unfortunately the
     escaping differs slightly between Windows and Mac version. This
     might cause errors in future.
Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to
Wordfast escapes<_char_to_wf>}.

Extended Attributes
-------------------
The last 4 columns allow users to define and manage extended attributes.
These are left as is and are not directly managed by our implementation.
"""

import csv
import sys
import time

from translate.storage import base

WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version",
                        "target-lang", "license", "attr1list", "attr2list",
                        "attr3list", "attr4list", "attr5list"]
"""Field names for the Wordfast header"""

WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang",
                 "target", "attr1", "attr2", "attr3", "attr4"]
"""Field names for a Wordfast TU"""

WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": ""
}
"""Default or minimum header entries for a Wordfast file"""

# TODO Needs validation. The following need to be checked against a WF TM file
# to ensure that the correct Unicode values have been chosen for the
# characters. For now these look correct and have been taken from Windows
# CP1252 and Macintosh code points found for the respective character sets
# on Linux.
WF_ESCAPE_MAP = (
    ("&'26;", u"\u0026"),  # & - Ampersand (must be first to prevent escaping of escapes)
    ("&'82;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", u"\u2026"),  # … - Ellipsis
    ("&'91;", u"\u2018"),  # ‘ - left single quotation mark
    ("&'92;", u"\u2019"),  # ’ - right single quotation mark
    ("&'93;", u"\u201C"),  # “ - left double quotation mark
    ("&'94;", u"\u201D"),  # ” - right double quotation mark
    ("&'96;", u"\u2013"),  # – - en dash (validate)
    ("&'97;", u"\u2014"),  # — - em dash (validate)
    ("&'99;", u"\u2122"),  # ™ - Trade mark
    # Windows only
    ("&'A0;", u"\u00A0"),  # (NBSP) - Non breaking space
    ("&'A9;", u"\u00A9"),  # © - Copyright
    ("&'AE;", u"\u00AE"),  # ® - Registered
    ("&'BC;", u"\u00BC"),  # ¼
    ("&'BD;", u"\u00BD"),  # ½
    ("&'BE;", u"\u00BE"),  # ¾
    # Mac only
    ("&'A8;", u"\u00AE"),  # ® - Registered
    ("&'AA;", u"\u2122"),  # ™ - Trade mark
    ("&'C7;", u"\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # … - Horizontal Ellipsis
    ("&'CA;", u"\u00A0"),  # (NBSP) - Non breaking space
    ("&'D0;", u"\u2013"),  # – - en dash (validate)
    ("&'D1;", u"\u2014"),  # — - em dash (validate)
    ("&'D2;", u"\u201C"),  # “ - left double quotation mark
    ("&'D3;", u"\u201D"),  # ” - right double quotation mark
    ("&'D4;", u"\u2018"),  # ‘ - left single quotation mark
    ("&'D5;", u"\u2019"),  # ’ - right single quotation mark
    ("&'E2;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # „ - Double low-9 quotation mark
    # Other markers
    #("&'B;", u"\n"),  # Soft-break - XXX creates a problem with
    #                  # roundtripping could also be represented by \u2028
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""


def _char_to_wf(string):
    """Char -> Wordfast &'XX; escapes

    Full roundtripping is not possible because of the escaping of
    NEWLINE \\n and TAB \\t

    @param string: UTF-8 encoded byte string, or None/empty
    @type string: String
    @return: the string with every character listed in L{WF_ESCAPE_MAP}
    replaced by its &'XX; escape and newline/tab written as the two
    character sequences \\n and \\t; falsy input is returned unchanged
    """
    # FIXME there is no platform check to ensure that we use Mac encodings
    # when running on a Mac
    if string:
        # The ampersand is the first map entry, so it is escaped before any
        # other replacement introduces new "&'XX;" sequences.  Where a
        # character has both a Windows and a Mac code, the earlier (Windows)
        # entry wins because the character is gone by the time the later
        # entry is tried.
        for code, char in WF_ESCAPE_MAP:
            string = string.replace(char.encode('utf-8'), code)
        string = string.replace("\n", "\\n").replace("\t", "\\t")
    return string


def _wf_to_char(string):
    """Wordfast &'XX; escapes -> Char

    @param string: UTF-8 encoded byte string containing Wordfast escapes,
    or None/empty
    @type string: String
    @return: the string with all known &'XX; escapes replaced by the UTF-8
    encoding of their character and \\n / \\t sequences turned back into
    real newline and tab; falsy input is returned unchanged
    """
    if string:
        # Walk the map in reverse so that the ampersand escape (first entry)
        # is decoded LAST.  Decoding it first would turn an escaped literal
        # such as "&'26;'85;" into "&'85;" and then wrongly into an ellipsis.
        for code, char in reversed(WF_ESCAPE_MAP):
            string = string.replace(code, char.encode('utf-8'))
        string = string.replace("\\n", "\n").replace("\\t", "\t")
    return string


class WordfastDialect(csv.Dialect):
    """Describe the properties of a Wordfast generated TAB-delimited file."""
    delimiter = "\t"
    lineterminator = "\r\n"
    quoting = csv.QUOTE_NONE
    if sys.version_info < (2, 5, 0):
        # We need to define the following items for csv in Python < 2.5
        # Wordfast does not quote anything, since we escape \t anyway in
        # _char_to_wf this should not be a problem
        quoting = csv.QUOTE_MINIMAL
        doublequote = False
        skipinitialspace = False
        escapechar = None
        quotechar = '"'
csv.register_dialect("wordfast", WordfastDialect)


class WordfastTime(object):
    """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a timestamp from a Wordfast time string or a struct_time

        @param newtime: a Wordfast time string, a time.struct_time, or a
        falsy value for "no time"; any other type leaves the time unset
        @type newtime: String or time.struct_time
        """
        self._time = None
        if not newtime:
            self.time = None
        elif isinstance(newtime, basestring):
            self.timestring = newtime
        elif isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Get the time in the Wordfast time format

        @return: the formatted time, or None when no time is set
        """
        if not self._time:
            return None
        else:
            return time.strftime(WF_TIMEFORMAT, self._time)

    def set_timestring(self, timestring):
        """Set the struct_time object using a Wordfast time formatted string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        @raise ValueError: propagated from time.strptime when the string
        does not match L{WF_TIMEFORMAT}
        """
        self._time = time.strptime(timestring, WF_TIMEFORMAT)
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the struct_time object"""
        return self._time

    def set_time(self, newtime):
        """Set the struct_time object

        Anything that is not a time.struct_time resets the time to None.

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        if newtime and isinstance(newtime, time.struct_time):
            self._time = newtime
        else:
            self._time = None
    # NOTE: this class attribute is named "time"; the methods above still see
    # the "time" module because method bodies resolve names in module scope.
    time = property(get_time, set_time)

    def __str__(self):
        """Return the Wordfast time string, or "" when no time is set"""
        if not self.timestring:
            return ""
        else:
            return self.timestring


class WordfastHeader(object):
    """A wordfast translation memory header"""

    def __init__(self, header=None):
        """Create a header

        @param header: a dictionary keyed by L{WF_FIELDNAMES_HEADER}; when
        falsy a default header is created.  A truthy value that is not a
        dict is ignored and the header is left empty.
        @type header: Dict
        """
        # Must be a mapping: the setters below assign into it by key.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current time

        @return: a new dictionary; the module level defaults are not modified
        """
        # Copy the defaults so that headers do not share (and mutate) the
        # module level WF_FIELDNAMES_HEADER_DEFAULTS dictionary.
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (stored as is, not copied)"""
        self._header_dict = newheader
    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language in the header, prefixed with '%'"""
        self._header_dict['target-lang'] = '%%%s' % newlang
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the unit count in the header as %TU= and 8 zero padded digits"""
        self._header_dict['tucount'] = '%%TU=%08d' % count
    tucount = property(None, settucount)


class WordfastUnit(base.TranslationUnit):
    """A Wordfast translation memory unit"""

    def __init__(self, source=None):
        """Create a unit, optionally with a source string"""
        self._dict = {}
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Refresh the timestamp for the unit"""
        self._dict['date'] = WordfastTime(time.localtime()).timestring

    def getdict(self):
        """Get the dictionary of values for a Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped value stored under key as unicode

        @param key: 'source' or 'target'
        @return: None when the key is missing or None, "" for an empty
        value, otherwise the decoded text with Wordfast escapes resolved
        """
        if self._dict.get(key, None) is None:
            return None
        elif self._dict[key]:
            return _wf_to_char(self._dict[key]).decode('utf-8')
        else:
            return ""

    def _set_source_or_target(self, key, newvalue):
        """Store newvalue under key, Wordfast escaped and UTF-8 encoded

        The unit timestamp is refreshed only when the stored value changes.
        Setting None stores None and leaves the timestamp untouched.

        @param key: 'source' or 'target'
        @param newvalue: the new text, unicode or UTF-8 byte string, or None
        """
        if newvalue is None:
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        newvalue = _char_to_wf(newvalue)
        if key not in self._dict or newvalue != self._dict[key]:
            self._dict[key] = newvalue
            self._update_timestamp()

    def getsource(self):
        """Get the source text"""
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        """Set the source text"""
        return self._set_source_or_target('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Get the target text"""
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        """Set the target text"""
        return self._set_source_or_target('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Store the target language code for this unit (no '%' prefix)"""
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        return str(self._dict)

    def istranslated(self):
        """Return True only when both source and target are non-empty"""
        if not self._dict.get('source', None):
            return False
        return bool(self._dict.get('target', None))


class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""
    # NOTE(review): "_" is not defined or imported in this module; presumably
    # it is installed as a builtin (e.g. gettext.install) - confirm.
    Name = _("Wordfast Translation Memory")
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile."""
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'iso-8859-1'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """parses the given file or file source string

        @param input: a file like object (read and then closed) or a byte
        string holding the content of a Wordfast TM file
        @raise ValueError: when the content cannot be decoded with the
        detected encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            tmsrc = input.read()
            input.close()
            input = tmsrc
        # A UTF-16 tab in the first (header) line identifies UTF-16 files,
        # everything else is treated as Latin-1.
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            # Only decoding problems are translated; a bare except here would
            # also swallow KeyboardInterrupt and SystemExit.
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        rows = input.split("\n")
        # The first row is the header, all following rows are units.
        for header in csv.DictReader(rows[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast"):
            self.header = WordfastHeader(header)
        lines = csv.DictReader(rows[1:], fieldnames=WF_FIELDNAMES, dialect="wordfast")
        for line in lines:
            newunit = WordfastUnit()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the header and all translated units

        @return: the file content encoded in the encoding detected at parse
        time (ISO-8859-1 by default), falling back to UTF-16 when that
        encoding cannot represent the text; "" when no unit is translated
        """
        output = csv.StringIO()
        header_output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        # The header is written after the units because it carries the count
        # of units actually written.
        self.header.tucount = unit_count
        outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast")
        outheader.writerow(self.header.header)
        # getvalue() returns the whole buffer regardless of the current
        # position and, unlike reset(), is not specific to cStringIO.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')