Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Sascha Silbe <sascha-pgp@silbe.org> 2012-04-22 15:10:42 (GMT)
committer Sascha Silbe <sascha-pgp@silbe.org> 2012-04-22 16:56:13 (GMT)
commit 6c22132fd2f13b8a76a4fc6b84aa52af0efb5f44 (patch)
tree 55f1e3fa1a105cf6f750350c3ef73de5cdb76178
parent b3a5dadb60c362b44b865122fda10ca23a7a244d (diff)
Add support for rebuilding index from git
We're already storing full metadata in git, using the Xapian index only for performance reasons. By adding support for rebuilding or updating the index based on what's in git, we can recover from index corruption and interrupted execution (power loss, crash). As an additional benefit, adventurous users can synchronise their data stores using git fetch, forcing gdatastore to update the index by removing the last_object_id file. Versions of gdatastore before b3a5dad stored various data types as D-Bus types in the git commit messages. Compatibility with these versions is provided by removing the D-Bus type information on the fly before parsing.
-rw-r--r-- gdatastore/datastore.py 121
1 files changed, 113 insertions, 8 deletions
diff --git a/gdatastore/datastore.py b/gdatastore/datastore.py
index d74ebc3..b30f529 100644
--- a/gdatastore/datastore.py
+++ b/gdatastore/datastore.py
@@ -16,10 +16,12 @@
Gdatastore D-Bus service API
"""
+import ast
import hashlib
import logging
import os
import pprint
+import re
import shutil
from subprocess import Popen, PIPE
import tempfile
@@ -45,6 +47,11 @@ DBUS_SERVICE_SUGAR_V3 = 'org.laptop.sugar.DataStore'
DBUS_INTERFACE_SUGAR_V3 = 'org.laptop.sugar.DataStore2'
DBUS_PATH_SUGAR_V3 = '/org/laptop/sugar/DataStore2'
+_DBUS_METADATA_BASIC_RE = re.compile(
+ r"""dbus.(U?Int(16|32|64)|Double|String|ByteArray)\((?P<value>(-?[0-9]+(\.[0-9]*)?)|(u?('([^'\\]|\\.)*'|"([^"\\]|\\.)*")))(, variant_level=[0-9]+)?\)""")
+_DBUS_METADATA_DICTIONARY_RE = re.compile(
+ r"""dbus.Dictionary\((?P<value>\{.*\}), signature=dbus.Signature\('s[sv]'\)\)""")
+
class DataStoreError(Exception):
pass
@@ -415,6 +422,8 @@ class InternalApi(object):
logging.debug('max_versions=%r', self._max_versions)
self._index = Index(os.path.join(self._base_dir, 'index'))
self._migrate()
+ self._check_reindex()
+ logging.info('ready')
def add_callback(self, signal, callback):
if signal not in InternalApi.SIGNALS:
@@ -529,7 +538,8 @@ class InternalApi(object):
metadata['version_id'] = child_id
# TODO: check metadata for validity first (index?)
- self._store_entry(tree_id, child_id, parent_id, path, metadata)
+ self._log_store((tree_id, child_id))
+ self._store_entry_in_git(tree_id, child_id, parent_id, path, metadata)
self._index.store((tree_id, child_id), metadata)
self._invoke_callbacks('save', tree_id, child_id, parent_id, metadata)
@@ -542,7 +552,7 @@ class InternalApi(object):
logging.debug('stop()')
self._index.close()
- def _add_to_index(self, index_path, path):
+ def _add_to_git_index(self, index_path, path):
if os.path.isdir(path):
self._git_call('add', ['-A'], work_dir=path, index_path=index_path)
elif os.path.isfile(path):
@@ -583,9 +593,68 @@ class InternalApi(object):
os.makedirs(self._git_dir)
self._git_call('init', ['-q', '--bare'])
+ def _migrate(self):
+ if not os.path.exists(self._git_dir):
+ return self._create_repo()
+
+ def _check_reindex(self):
+ """Recreate or update index if necessary
+ """
+ last_object_id = self._get_last_object_id_from_log()
+ # Non-existence of the log (i.e. last_object_id=None) does not
+ # necessarily mean an empty data store: We could be upgrading
+ # from a previous version that didn't write the file, a file
+ # system corruption may have occurred or the user may have
+ # deleted the log to force reindexing. This operation is cheap
+ # enough on empty data stores that we don't care about the
+ # performance impact on valid, empty data stores.
+ if not last_object_id or not self._index.contains(last_object_id):
+ logging.info('Rebuilding index')
+ self._reindex()
+
+ def _reindex(self):
+ """Recreate or update index from git repository
+
+ Log the last object after finishing the rebuild.
+ """
+ last_object_id = None
+ for object_id in self._get_object_ids_from_git():
+ last_object_id = object_id
+ logging.debug('reindex(): checking entry %r', object_id)
+ if self._index.contains(object_id):
+ continue
+
+ logging.debug('reindex(): adding entry %r from git', object_id)
+ metadata = self._get_metadata_from_git(object_id)
+ self._index.store(object_id, metadata)
+
+ if last_object_id:
+ self._log_store(last_object_id)
+
def _format_commit_message(self, metadata):
return pprint.pformat(to_native(metadata))
+ def _parse_commit_message(self, commit_message):
+ try:
+ return ast.literal_eval(commit_message)
+ except ValueError:
+ return self._parse_commit_message_dbus(commit_message)
+
+ def _parse_commit_message_dbus(self, commit_message):
+ # Compatibility work-around to parse commit messages
+ # written by previous versions and containing dbus.Int()
+ # instead of plain integer literals.
+ num_subs = 1
+ while num_subs:
+ commit_message, num_subs = re.subn(_DBUS_METADATA_DICTIONARY_RE,
+ '\g<value>', commit_message)
+ num_subs = 1
+ while num_subs:
+ commit_message, num_subs = re.subn(_DBUS_METADATA_BASIC_RE,
+ '\g<value>', commit_message)
+
+ return ast.literal_eval(commit_message)
+
def _gen_uuid(self):
return str(uuid.uuid4())
@@ -609,11 +678,7 @@ class InternalApi(object):
for callback in self._callbacks.get(signal, []):
callback(*args)
- def _migrate(self):
- if not os.path.exists(self._git_dir):
- return self._create_repo()
-
- def _store_entry(self, tree_id, version_id, parent_id, path, metadata):
+ def _store_entry_in_git(self, tree_id, version_id, parent_id, path, metadata):
commit_message = self._format_commit_message(metadata)
tree_hash = self._write_tree(path)
commit_hash = self._git_call('commit-tree', [tree_hash],
@@ -630,11 +695,51 @@ class InternalApi(object):
index_dir = tempfile.mkdtemp(prefix='gdatastore-')
index_path = os.path.join(index_dir, 'index')
try:
- self._add_to_index(index_path, path)
+ self._add_to_git_index(index_path, path)
return self._git_call('write-tree', index_path=index_path).strip()
finally:
shutil.rmtree(index_dir)
+ def _get_object_ids_from_git(self):
+ args = ['--sort=committerdate', '--format=%(refname)',
+ 'refs/gdatastore/*/*']
+ return [tuple(line.rsplit('/', 2)[1:])
+ for line in self._git_call('for-each-ref', args).split()]
+
+ def _get_metadata_from_git(self, object_id):
+ args = ['commit', _format_ref(*object_id)]
+ commit_message = self._git_call('cat-file', args).split('\n\n', 1)[1]
+ return self._parse_commit_message(commit_message)
+
+ def _log_store(self, object_id):
+ """Record the fact that we tried to store the given object
+
+ Make sure we know on next start-up that the object with the
+ given object_id was the last one to be processed. Used for
+ checking the index on start-up and triggering a rebuild if
+ necessary.
+ """
+ log_name = os.path.join(self._base_dir, 'last_object_id')
+ tmp_name = log_name + '.tmp'
+ with open(tmp_name, 'w') as f:
+ f.write(repr(tuple(object_id)))
+ f.flush()
+ os.fsync(f.fileno())
+
+ os.rename(tmp_name, log_name)
+
+ def _get_last_object_id_from_log(self):
+ """Return the object_id saved by _log_store()
+
+ Return the object_id of the last object to be processed, as
+ written by _log_store(). If no such log exists, return None.
+ """
+ log_name = os.path.join(self._base_dir, 'last_object_id')
+ if not os.path.exists(log_name):
+ return None
+
+ return ast.literal_eval(open(log_name).read())
+
def calculate_checksum(path):
checksum = hashlib.sha1()