#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2009 Zuza Software Foundation
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""Manage the OmegaT glossary format

OmegaT glossary format is used by the U{OmegaT} computer aided
translation tool.

It is a bilingual base class derived format with L{OmegaTFile}
and L{OmegaTUnit} providing file and unit level access.

Format Implementation
=====================
The OmegaT glossary format is a simple Tab Separated Value (TSV) file
with the columns: source, target, comment.

The dialect of the TSV files is specified by L{OmegaTDialect}.

Encoding
--------
The files are either UTF-8 or encoded using the system default.  UTF-8
encoded files use the .utf8 extension while system encoded files use
the .tab extension.
"""

import csv
import locale
import os.path
import sys
import time

from translate.storage import base

OMEGAT_FIELDNAMES = ["source", "target", "comment"]
"""Field names for an OmegaT glossary unit"""


class OmegaTDialect(csv.Dialect):
    """Describe the properties of an OmegaT generated TAB-delimited file."""
    delimiter = "\t"
    lineterminator = "\r\n"
    quoting = csv.QUOTE_NONE
    if sys.version_info < (2, 5, 0):
        # We need to define the following items for csv in Python < 2.5
        quoting = csv.QUOTE_MINIMAL  # OmegaT does not quote anything FIXME So why MINIMAL?
        doublequote = False
        skipinitialspace = False
        escapechar = None
        quotechar = '"'
csv.register_dialect("omegat", OmegaTDialect)


class OmegaTUnit(base.TranslationUnit):
    """An OmegaT translation memory unit"""

    def __init__(self, source=None):
        """Create a unit, optionally initialised with a source string.

        @param source: the source text of the unit, if any
        """
        # The backing dictionary must exist before the source property
        # setter is used, since the setter writes into it.
        self._dict = {}
        if source:
            self.source = source
        super(OmegaTUnit, self).__init__(source)

    def getdict(self):
        """Get the dictionary of values for a OmegaT line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a OmegaT line

        @param newdict: a new dictionary with OmegaT line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_field(self, key):
        """Return the value stored under key, decoded from UTF-8.

        Returns None when the key is absent and an empty string when the
        stored value is empty or None.
        """
        if key not in self._dict:
            return None
        elif self._dict[key]:
            return self._dict[key].decode('utf-8')
        else:
            return ""

    def _set_field(self, key, newvalue):
        """Store newvalue under key, encoding unicode values as UTF-8."""
        if newvalue is None:
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        if key not in self._dict or newvalue != self._dict[key]:
            self._dict[key] = newvalue

    def getnotes(self, origin=None):
        """Return the comment column; the origin argument is not used."""
        return self._get_field('comment')

    def getsource(self):
        """Return the source column."""
        return self._get_field('source')

    def setsource(self, newsource):
        """Set the source column and reset the cached rich source."""
        self._rich_source = None
        return self._set_field('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Return the target column."""
        return self._get_field('target')

    def settarget(self, newtarget):
        """Set the target column and reset the cached rich target."""
        self._rich_target = None
        return self._set_field('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Record the target language under the 'target-lang' key.

        This key is not one of L{OMEGAT_FIELDNAMES}, so it is not written
        out when the store is serialised.
        """
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        return str(self._dict)

    def istranslated(self):
        """Return True when the unit has a non-empty target."""
        return bool(self._dict.get('target', None))


class OmegaTFile(base.TranslationStore):
    """An OmegaT translation memory file"""
    # FIXME: uncomment this when we next open from string freeze
    #Name = _("OmegaT Glossary")
    Name = None
    Mimetypes = ["application/x-omegat-glossary"]
    Extensions = ["utf8"]

    def __init__(self, inputfile=None, unitclass=OmegaTUnit):
        """Construct an OmegaT glossary, optionally reading in from inputfile."""
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.extension = ''
        self._encoding = self._get_encoding()
        if inputfile is not None:
            self.parse(inputfile)

    def _get_encoding(self):
        """Return the name of the encoding used to read and write the file."""
        return 'utf-8'

    def parse(self, input):
        """Parse the given file or file source string

        @param input: a file-like object (anything with a read method) or
        a byte string holding the file contents
        @raise ValueError: if the contents cannot be decoded using the
        store's encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            # Close the handle even when reading fails.
            try:
                tmsrc = input.read()
            finally:
                input.close()
            input = tmsrc
        try:
            # Units store UTF-8 byte strings, so normalise to UTF-8 here.
            input = input.decode(self._encoding).encode('utf-8')
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must
            # propagate rather than be reported as an encoding problem.
            raise ValueError("OmegaT files are either UTF-8 encoded or use the default system encoding")
        lines = csv.DictReader(input.split("\n"), fieldnames=OMEGAT_FIELDNAMES,
                               dialect="omegat")
        for line in lines:
            newunit = OmegaTUnit()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the translated units as TSV in the store's encoding.

        Returns an empty string when no unit is translated.  If the text
        cannot be represented in the store's encoding, UTF-8 is used.
        """
        output = csv.StringIO()
        # Unit dictionaries may carry keys that are not glossary columns
        # ('target-lang' from settargetlang, or the None rest-key that
        # DictReader adds for surplus columns); ignore them instead of
        # letting DictWriter raise ValueError.
        writer = csv.DictWriter(output, fieldnames=OMEGAT_FIELDNAMES,
                                dialect="omegat", extrasaction="ignore")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        # getvalue() is available on every StringIO flavour, unlike the
        # cStringIO-only reset().
        decoded = output.getvalue().decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-8')


class OmegaTFileTab(OmegaTFile):
    """An OmegaT translation memory file in the default system encoding"""
    # FIXME: uncomment this when we next open from string freeze
    #Name = _("OmegaT Glossary")
    Name = None
    Mimetypes = ["application/x-omegat-glossary"]
    Extensions = ["tab"]

    def _get_encoding(self):
        """Return the system default encoding, or UTF-8 if it is unknown."""
        # getdefaultlocale() reports None for the encoding when it cannot
        # be determined; decode()/encode() would then fail with TypeError.
        return locale.getdefaultlocale()[1] or 'utf-8'