diff options
Diffstat (limited to 'translate-toolkit-1.5.1/translate/storage/statsdb.py')
-rw-r--r-- | translate-toolkit-1.5.1/translate/storage/statsdb.py | 654 |
1 files changed, 654 insertions, 0 deletions
diff --git a/translate-toolkit-1.5.1/translate/storage/statsdb.py b/translate-toolkit-1.5.1/translate/storage/statsdb.py new file mode 100644 index 0000000..cccbb5b --- /dev/null +++ b/translate-toolkit-1.5.1/translate/storage/statsdb.py @@ -0,0 +1,654 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright 2007-2009 Zuza Software Foundation +# +# This file is part of the Translate Toolkit. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see <http://www.gnu.org/licenses/>. + + +"""Module to provide a cache of statistics in a database. + +@organization: Zuza Software Foundation +@copyright: 2007 Zuza Software Foundation +@license: U{GPL <http://www.fsf.org/licensing/licenses/gpl.html>} +""" + +from UserDict import UserDict + +from translate import __version__ as toolkitversion +from translate.storage import factory +from translate.misc.multistring import multistring +from translate.lang.common import Common + +try: + from sqlite3 import dbapi2 +except ImportError: + from pysqlite2 import dbapi2 +import os.path +import re +import sys +import stat +import thread + +kdepluralre = re.compile("^_n: ") +brtagre = re.compile("<br\s*?/?>") +xmltagre = re.compile("<[^>]+>") +numberre = re.compile("\\D\\.\\D") + +state_strings = {0: "untranslated", 1: "translated", 2: "fuzzy"} + +def wordcount(string): + # TODO: po class should understand KDE style plurals + string = kdepluralre.sub("", string) + string = brtagre.sub("\n", string) + string = xmltagre.sub("", string) + string = numberre.sub(" ", string) + #TODO: This should still use the correct language to count in the target + #language + return len(Common.words(string)) + +def wordsinunit(unit): + """Counts the words in the unit's source and target, taking plurals into + account. The target words are only counted if the unit is translated.""" + (sourcewords, targetwords) = (0, 0) + if isinstance(unit.source, multistring): + sourcestrings = unit.source.strings + else: + sourcestrings = [unit.source or ""] + for s in sourcestrings: + sourcewords += wordcount(s) + if not unit.istranslated(): + return sourcewords, targetwords + if isinstance(unit.target, multistring): + targetstrings = unit.target.strings + else: + targetstrings = [unit.target or ""] + for s in targetstrings: + targetwords += wordcount(s) + return sourcewords, targetwords + +class Record(UserDict): + def __init__(self, record_keys, record_values=None, compute_derived_values = lambda x: x): + if record_values == None: + record_values = (0 for _i in record_keys) + self.record_keys = record_keys + self.data = dict(zip(record_keys, record_values)) + self._compute_derived_values = compute_derived_values + self._compute_derived_values(self) + + def to_tuple(self): + return tuple(self[key] for key in self.record_keys) + + def __add__(self, other): + result = Record(self.record_keys) + for key in self.keys(): + result[key] = self[key] + other[key] + self._compute_derived_values(self) + return result + + def __sub__(self, other): + result = Record(self.record_keys) + for key in self.keys(): + result[key] = self[key] - other[key] + self._compute_derived_values(self) + return result + + def as_string_for_db(self): + return ",".join([repr(x) for x in self.to_tuple()]) + +def transaction(f): + """Modifies f to commit database changes if it executes without exceptions. + Otherwise it rolls back the database. + + ALL publicly accessible methods in StatsCache MUST be decorated with this + decorator. + """ + + def decorated_f(self, *args, **kwargs): + try: + result = f(self, *args, **kwargs) + self.con.commit() + return result + except: + # If ANY exception is raised, we're left in an + # uncertain state and we MUST roll back any changes to avoid getting + # stuck in an inconsistent state. + if self.con: + self.con.rollback() + raise + return decorated_f + +UNTRANSLATED, TRANSLATED, FUZZY = 0, 1, 2 +def statefordb(unit): + """Returns the numeric database state for the unit.""" + if unit.istranslated(): + return TRANSLATED + if unit.isfuzzy() and unit.target: + return FUZZY + return UNTRANSLATED + +class FileTotals(object): + keys = ['translatedsourcewords', + 'fuzzysourcewords', + 'untranslatedsourcewords', + 'translated', + 'fuzzy', + 'untranslated', + 'translatedtargetwords'] + + def db_keys(self): + return ",".join(self.keys) + + def __init__(self, cur): + self.cur = cur + self.cur.execute(""" + CREATE TABLE IF NOT EXISTS filetotals( + fileid INTEGER PRIMARY KEY AUTOINCREMENT, + translatedsourcewords INTEGER NOT NULL, + fuzzysourcewords INTEGER NOT NULL, + untranslatedsourcewords INTEGER NOT NULL, + translated INTEGER NOT NULL, + fuzzy INTEGER NOT NULL, + untranslated INTEGER NOT NULL, + translatedtargetwords INTEGER NOT NULL);""") + + def new_record(cls, state_for_db=None, sourcewords=None, targetwords=None): + record = Record(cls.keys, compute_derived_values = cls._compute_derived_values) + if state_for_db is not None: + if state_for_db is UNTRANSLATED: + record['untranslated'] = 1 + record['untranslatedsourcewords'] = sourcewords + if state_for_db is TRANSLATED: + record['translated'] = 1 + record['translatedsourcewords'] = sourcewords + record['translatedtargetwords'] = targetwords + elif state_for_db is FUZZY: + record['fuzzy'] = 1 + record['fuzzysourcewords'] = sourcewords + return record + + new_record = classmethod(new_record) + + def _compute_derived_values(cls, record): + record["total"] = record["untranslated"] + \ + record["translated"] + \ + record["fuzzy"] + record["totalsourcewords"] = record["untranslatedsourcewords"] + \ + record["translatedsourcewords"] + \ + record["fuzzysourcewords"] + record["review"] = 0 + _compute_derived_values = classmethod(_compute_derived_values) + + def __getitem__(self, fileid): + result = self.cur.execute(""" + SELECT %(keys)s + FROM filetotals + WHERE fileid=?;""" % {'keys': self.db_keys()}, (fileid,)) + return Record(FileTotals.keys, result.fetchone(), self._compute_derived_values) + + def __setitem__(self, fileid, record): + self.cur.execute(""" + INSERT OR REPLACE into filetotals + VALUES (%(fileid)d, %(vals)s); + """ % {'fileid': fileid, 'vals': record.as_string_for_db()}) + + def __delitem__(self, fileid): + self.cur.execute(""" + DELETE FROM filetotals + WHERE fileid=?; + """, (fileid,)) + +def emptyfiletotals(): + """Returns a dictionary with all statistics initalised to 0.""" + return FileTotals.new_record() + +def emptyfilechecks(): + return {} + +def emptyfilestats(): + return {"total": [], "translated": [], "fuzzy": [], "untranslated": []} + +def emptyunitstats(): + return {"sourcewordcount": [], "targetwordcount": []} + +# We allow the caller to specify which value to return when errors_return_empty +# is True. We do this, since Poolte wants None to be returned when it calls +# get_mod_info directly, whereas we want an integer to be returned for +# uses of get_mod_info within this module. +# TODO: Get rid of empty_return when Pootle code is improved to not require +# this. +def get_mod_info(file_path): + file_stat = os.stat(file_path) + assert not stat.S_ISDIR(file_stat.st_mode) + return file_stat.st_mtime, file_stat.st_size + +def suggestion_extension(): + return os.path.extsep + 'pending' + +def suggestion_filename(filename): + return filename + suggestion_extension() + +# ALL PUBLICLY ACCESSIBLE METHODS MUST BE DECORATED WITH THE transaction DECORATOR. +class StatsCache(object): + """An object instantiated as a singleton for each statsfile that provides + access to the database cache from a pool of StatsCache objects.""" + _caches = {} + defaultfile = None + con = None + """This cache's connection""" + cur = None + """The current cursor""" + + def __new__(cls, statsfile=None): + current_thread = thread.get_ident() + def make_database(statsfile): + def connect(cache): + cache.con = dbapi2.connect(statsfile) + cache.cur = cache.con.cursor() + + def clear_old_data(cache): + try: + cache.cur.execute("""SELECT toolkitbuild FROM files""") + val = cache.cur.fetchone() + # If the database is empty, we have no idea whether its layout + # is correct, so we might as well delete it. + if val is None or val[0] < toolkitversion.build: + cache.con.close() + del cache + os.unlink(statsfile) + return True + return False + except dbapi2.OperationalError: + return False + + cache = cls._caches.setdefault(current_thread, {})[statsfile] = object.__new__(cls) + connect(cache) + if clear_old_data(cache): + connect(cache) + cache.create() + return cache + + if not statsfile: + if not cls.defaultfile: + userdir = os.path.expanduser("~") + cachedir = None + if os.name == "nt": + cachedir = os.path.join(userdir, "Translate Toolkit") + else: + cachedir = os.path.join(userdir, ".translate_toolkit") + if not os.path.exists(cachedir): + os.mkdir(cachedir) + cls.defaultfile = os.path.realpath(os.path.join(cachedir, "stats.db")) + statsfile = cls.defaultfile + else: + statsfile = os.path.realpath(statsfile) + # First see if a cache for this file already exists: + if current_thread in cls._caches and statsfile in cls._caches[current_thread]: + return cls._caches[current_thread][statsfile] + # No existing cache. Let's build a new one and keep a copy + return make_database(statsfile) + + @transaction + def create(self): + """Create all tables and indexes.""" + self.file_totals = FileTotals(self.cur) + + self.cur.execute("""CREATE TABLE IF NOT EXISTS files( + fileid INTEGER PRIMARY KEY AUTOINCREMENT, + path VARCHAR NOT NULL UNIQUE, + st_mtime INTEGER NOT NULL, + st_size INTEGER NOT NULL, + toolkitbuild INTEGER NOT NULL);""") + + self.cur.execute("""CREATE UNIQUE INDEX IF NOT EXISTS filepathindex + ON files (path);""") + + self.cur.execute("""CREATE TABLE IF NOT EXISTS units( + id INTEGER PRIMARY KEY AUTOINCREMENT, + unitid VARCHAR NOT NULL, + fileid INTEGER NOT NULL, + unitindex INTEGER NOT NULL, + source VARCHAR NOT NULL, + target VARCHAR, + state INTEGER, + sourcewords INTEGER, + targetwords INTEGER);""") + + self.cur.execute("""CREATE INDEX IF NOT EXISTS fileidindex + ON units(fileid);""") + + self.cur.execute("""CREATE TABLE IF NOT EXISTS checkerconfigs( + configid INTEGER PRIMARY KEY AUTOINCREMENT, + config VARCHAR);""") + + self.cur.execute("""CREATE INDEX IF NOT EXISTS configindex + ON checkerconfigs(config);""") + + self.cur.execute("""CREATE TABLE IF NOT EXISTS uniterrors( + errorid INTEGER PRIMARY KEY AUTOINCREMENT, + unitindex INTEGER NOT NULL, + fileid INTEGER NOT NULL, + configid INTEGER NOT NULL, + name VARCHAR NOT NULL, + message VARCHAR);""") + + self.cur.execute("""CREATE INDEX IF NOT EXISTS uniterrorindex + ON uniterrors(fileid, configid);""") + + @transaction + def _getfileid(self, filename, check_mod_info=True, store=None): + """return fileid representing the given file in the statscache. + + if file not in cache or has been updated since last record + update, recalculate stats. + + optional argument store can be used to avoid unnessecary + reparsing of already loaded translation files. + + store can be a TranslationFile object or a callback that returns one. + """ + realpath = os.path.realpath(filename) + self.cur.execute("""SELECT fileid, st_mtime, st_size FROM files + WHERE path=?;""", (realpath,)) + filerow = self.cur.fetchone() + mod_info = get_mod_info(realpath) + if filerow: + fileid = filerow[0] + if not check_mod_info: + # Update the mod_info of the file + self.cur.execute("""UPDATE files + SET st_mtime=?, st_size=? + WHERE fileid=?;""", (mod_info[0], mod_info[1], fileid)) + return fileid + if (filerow[1], filerow[2]) == mod_info: + return fileid + + # file wasn't in db at all, lets recache it + if callable(store): + store = store() + else: + store = store or factory.getobject(realpath) + + return self._cachestore(store, realpath, mod_info) + + def _getstoredcheckerconfig(self, checker): + """See if this checker configuration has been used before.""" + config = str(checker.config.__dict__) + self.cur.execute("""SELECT configid, config FROM checkerconfigs WHERE + config=?;""", (config,)) + configrow = self.cur.fetchone() + if not configrow or configrow[1] != config: + return None + else: + return configrow[0] + + @transaction + def _cacheunitstats(self, units, fileid, unitindex=None, file_totals_record=FileTotals.new_record()): + """Cache the statistics for the supplied unit(s).""" + unitvalues = [] + for index, unit in enumerate(units): + if unit.istranslatable(): + sourcewords, targetwords = wordsinunit(unit) + if unitindex: + index = unitindex + # what about plurals in .source and .target? + unitvalues.append((unit.getid(), fileid, index, \ + unit.source, unit.target, \ + sourcewords, targetwords, \ + statefordb(unit))) + file_totals_record = file_totals_record + FileTotals.new_record(statefordb(unit), sourcewords, targetwords) + # XXX: executemany is non-standard + self.cur.executemany("""INSERT INTO units + (unitid, fileid, unitindex, source, target, sourcewords, targetwords, state) + values (?, ?, ?, ?, ?, ?, ?, ?);""", + unitvalues) + self.file_totals[fileid] = file_totals_record + if unitindex: + return state_strings[statefordb(units[0])] + return "" + + @transaction + def _cachestore(self, store, realpath, mod_info): + """Calculates and caches the statistics of the given store + unconditionally.""" + self.cur.execute("""DELETE FROM files WHERE + path=?;""", (realpath,)) + self.cur.execute("""INSERT INTO files + (fileid, path, st_mtime, st_size, toolkitbuild) values (NULL, ?, ?, ?, ?);""", + (realpath, mod_info[0], mod_info[1], toolkitversion.build)) + fileid = self.cur.lastrowid + self.cur.execute("""DELETE FROM units WHERE + fileid=?""", (fileid,)) + self._cacheunitstats(store.units, fileid) + return fileid + + def filetotals(self, filename, store=None): + """Retrieves the statistics for the given file if possible, otherwise + delegates to cachestore().""" + return self.file_totals[self._getfileid(filename, store=store)] + + @transaction + def _cacheunitschecks(self, units, fileid, configid, checker, unitindex=None): + """Helper method for cachestorechecks() and recacheunit()""" + # We always want to store one dummy error to know that we have actually + # run the checks on this file with the current checker configuration + dummy = (-1, fileid, configid, "noerror", "") + unitvalues = [dummy] + # if we are doing a single unit, we want to return the checknames + errornames = [] + for index, unit in enumerate(units): + if unit.istranslatable(): + # Correctly assign the unitindex + if unitindex: + index = unitindex + failures = checker.run_filters(unit) + for checkname, checkmessage in failures.iteritems(): + unitvalues.append((index, fileid, configid, checkname, checkmessage)) + errornames.append("check-" + checkname) + checker.setsuggestionstore(None) + + if unitindex: + # We are only updating a single unit, so we don't want to add an + # extra noerror-entry + unitvalues.remove(dummy) + errornames.append("total") + + # XXX: executemany is non-standard + self.cur.executemany("""INSERT INTO uniterrors + (unitindex, fileid, configid, name, message) + values (?, ?, ?, ?, ?);""", + unitvalues) + return errornames + + @transaction + def _cachestorechecks(self, fileid, store, checker, configid): + """Calculates and caches the error statistics of the given store + unconditionally.""" + # Let's purge all previous failures because they will probably just + # fill up the database without much use. + self.cur.execute("""DELETE FROM uniterrors WHERE + fileid=?;""", (fileid,)) + self._cacheunitschecks(store.units, fileid, configid, checker) + return fileid + + def get_unit_stats(self, fileid, unitid): + values = self.cur.execute(""" + SELECT state, sourcewords, targetwords + FROM units + WHERE fileid=? AND unitid=? + """, (fileid, unitid)) + result = values.fetchone() + if result is not None: + return result + else: + print >> sys.stderr, """WARNING: Database in inconsistent state. + fileid %d and unitid %s have no entries in the table units.""" % (fileid, unitid) + # If values.fetchone() is None, then we return an empty list, + # to make FileTotals.new_record(*self.get_unit_stats(fileid, unitid)) + # do the right thing. + return [] + + @transaction + def recacheunit(self, filename, checker, unit): + """Recalculate all information for a specific unit. This is necessary + for updating all statistics when a translation of a unit took place, + for example. + + This method assumes that everything was up to date before (file totals, + checks, checker config, etc.""" + fileid = self._getfileid(filename, check_mod_info=False) + configid = self._get_config_id(fileid, checker) + unitid = unit.getid() + # get the unit index + totals_without_unit = self.file_totals[fileid] - \ + FileTotals.new_record(*self.get_unit_stats(fileid, unitid)) + self.cur.execute("""SELECT unitindex FROM units WHERE + fileid=? AND unitid=?;""", (fileid, unitid)) + unitindex = self.cur.fetchone()[0] + self.cur.execute("""DELETE FROM units WHERE + fileid=? AND unitid=?;""", (fileid, unitid)) + state = [self._cacheunitstats([unit], fileid, unitindex, totals_without_unit)] + # remove the current errors + self.cur.execute("""DELETE FROM uniterrors WHERE + fileid=? AND unitindex=?;""", (fileid, unitindex)) + if os.path.exists(suggestion_filename(filename)): + checker.setsuggestionstore(factory.getobject(suggestion_filename(filename), ignore=suggestion_extension())) + state.extend(self._cacheunitschecks([unit], fileid, configid, checker, unitindex)) + return state + + def _checkerrors(self, filename, fileid, configid, checker, store): + def geterrors(): + self.cur.execute("""SELECT + name, + unitindex + FROM uniterrors WHERE fileid=? and configid=? + ORDER BY unitindex;""", (fileid, configid)) + return self.cur.fetchone(), self.cur + + first, cur = geterrors() + if first is not None: + return first, cur + + # This could happen if we haven't done the checks before, or the + # file changed, or we are using a different configuration + if callable(store): + store = store() + else: + store = store or factory.getobject(filename) + + if os.path.exists(suggestion_filename(filename)): + checker.setsuggestionstore(factory.getobject(suggestion_filename(filename), ignore=suggestion_extension())) + self._cachestorechecks(fileid, store, checker, configid) + return geterrors() + + def _geterrors(self, filename, fileid, configid, checker, store): + result = [] + first, cur = self._checkerrors(filename, fileid, configid, checker, store) + result.append(first) + result.extend(cur.fetchall()) + return result + + @transaction + def _get_config_id(self, fileid, checker): + configid = self._getstoredcheckerconfig(checker) + if configid: + return configid + self.cur.execute("""INSERT INTO checkerconfigs + (configid, config) values (NULL, ?);""", + (str(checker.config.__dict__),)) + return self.cur.lastrowid + + def filechecks(self, filename, checker, store=None): + """Retrieves the error statistics for the given file if possible, + otherwise delegates to cachestorechecks().""" + fileid = self._getfileid(filename, store=store) + configid = self._get_config_id(fileid, checker) + values = self._geterrors(filename, fileid, configid, checker, store) + + errors = emptyfilechecks() + for value in values: + if value[1] == -1: + continue + checkkey = 'check-' + value[0] #value[0] is the error name + if not checkkey in errors: + errors[checkkey] = [] + errors[checkkey].append(value[1]) #value[1] is the unitindex + + return errors + + def file_fails_test(self, filename, checker, name): + fileid = self._getfileid(filename) + configid = self._get_config_id(fileid, checker) + self._checkerrors(filename, fileid, configid, checker, None) + self.cur.execute("""SELECT + name, + unitindex + FROM uniterrors + WHERE fileid=? and configid=? and name=?;""", (fileid, configid, name)) + return self.cur.fetchone() is not None + + def filestatestats(self, filename, store=None): + """Return a dictionary of unit stats mapping sets of unit + indices with those states""" + stats = emptyfilestats() + fileid = self._getfileid(filename, store=store) + + self.cur.execute("""SELECT + state, + unitindex + FROM units WHERE fileid=? + ORDER BY unitindex;""", (fileid,)) + values = self.cur.fetchall() + + for value in values: + stats[state_strings[value[0]]].append(value[1]) + stats["total"].append(value[1]) + + return stats + + def filestats(self, filename, checker, store=None): + """Return a dictionary of property names mapping sets of unit + indices with those properties.""" + stats = emptyfilestats() + stats.update(self.filechecks(filename, checker, store)) + stats.update(self.filestatestats(filename, store)) + return stats + + def unitstats(self, filename, _lang=None, store=None): + # For now, lang and store are unused. lang will allow the user to + # base stats information on the given language. See the commented + # line containing stats.update below. + """Return a dictionary of property names mapping to arrays which + map unit indices to property values. + + Please note that this is different from filestats, since filestats + supplies sets of unit indices with a given property, whereas this + method supplies arrays which map unit indices to given values.""" + stats = emptyunitstats() + + #stats.update(self.unitchecks(filename, lang, store)) + fileid = self._getfileid(filename, store=store) + + self.cur.execute("""SELECT + sourcewords, targetwords + FROM units WHERE fileid=? + ORDER BY unitindex;""", (fileid,)) + + for sourcecount, targetcount in self.cur.fetchall(): + stats["sourcewordcount"].append(sourcecount) + stats["targetwordcount"].append(targetcount) + + return stats |