Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summary   ·   refs   ·   log   ·   tree   ·   commit   ·   diff   ·   stats
diff options
context:
space:
mode:
author Sascha Silbe <sascha-pgp@silbe.org> 2014-05-24 19:24:48 (GMT)
committer Sascha Silbe <sascha-pgp@silbe.org> 2014-05-24 19:24:48 (GMT)
commit 5dc851866520362f31fb498fbe0d03166f68dae4 (patch)
tree 413653d269062d357fe698f12f2b627f2568028e
parent fbd7bd8d36535c9ff93e5b2f4850b6c2727b9062 (diff)
Reindex entry on new commit (metadata update)
When re-indexing entries, take the git commit id into account so that we notice entries we already know about but where the metadata has changed. The first re-index run after an update will need to process all entries in order to add the commit_id value to the index. This may take a long time. This change is backwards compatible.
-rw-r--r-- gdatastore/datastore.py 31
-rw-r--r-- gdatastore/index.py 16
2 files changed, 30 insertions, 17 deletions
diff --git a/gdatastore/datastore.py b/gdatastore/datastore.py
index a33aca8..345ca77 100644
--- a/gdatastore/datastore.py
+++ b/gdatastore/datastore.py
@@ -542,8 +542,8 @@ class InternalApi(object):
old_metadata = self._index.retrieve(object_id)['metadata']
metadata['creation_time'] = old_metadata['creation_time']
- self._update_metadata_in_git(object_id, metadata)
- self._index.store(object_id, metadata)
+ commit_id = self._update_metadata_in_git(object_id, metadata)
+ self._index.store(object_id, metadata, commit_id)
self._invoke_callbacks('change_metadata', object_id, metadata)
def delete(self, object_id):
@@ -653,8 +653,9 @@ class InternalApi(object):
# TODO: check metadata for validity first (index?)
self._log_store((tree_id, child_id))
- self._store_entry_in_git(tree_id, child_id, parent_id, path, metadata)
- self._index.store((tree_id, child_id), metadata)
+ commit_id = self._store_entry_in_git(tree_id, child_id, parent_id,
+ path, metadata)
+ self._index.store((tree_id, child_id), metadata, commit_id)
self._invoke_callbacks('save', tree_id, child_id, parent_id, metadata)
if delete_after and path:
@@ -733,15 +734,17 @@ class InternalApi(object):
Log the last object after finishing the rebuild.
"""
last_object_id = None
- for object_id in self._get_object_ids_from_git():
+ for object_id, commit_id in self._get_object_ids_from_git():
last_object_id = object_id
- logging.debug('reindex(): checking entry %r', object_id)
- if self._index.contains(object_id):
+ logging.debug('reindex(): checking entry %r (commit %r)',
+ object_id, commit_id)
+ if self._index.contains(object_id, commit_id):
continue
- logging.debug('reindex(): adding entry %r from git', object_id)
+ logging.debug('reindex(): (re-)adding entry %r from git',
+ object_id)
metadata = self._get_metadata_from_git(object_id)
- self._index.store(object_id, metadata)
+ self._index.store(object_id, metadata, commit_id)
if last_object_id:
self._log_store(last_object_id)
@@ -801,6 +804,7 @@ class InternalApi(object):
input=commit_message).strip()
self._git_call('update-ref', [_format_ref(tree_id, version_id),
commit_hash])
+ return commit_hash
def _write_tree(self, path):
if not path:
@@ -823,6 +827,7 @@ class InternalApi(object):
commit_hash = self._git_call('commit-tree', ['-p', ref, tree_hash],
input=commit_message).strip()
self._git_call('update-ref', [ref, commit_hash])
+ return commit_hash
def _get_tree_hash(self, object_id):
args = ['commit', _format_ref(*object_id)]
@@ -830,10 +835,12 @@ class InternalApi(object):
return self._git_call('cat-file', args).split('\n', 1)[0].split(' ')[1]
def _get_object_ids_from_git(self):
- args = ['--sort=committerdate', '--format=%(refname)',
+ args = ['--sort=committerdate', '--format=%(refname) %(objectname)',
'refs/gdatastore/*/*']
- return [tuple(line.rsplit('/', 2)[1:])
- for line in self._git_call('for-each-ref', args).split()]
+ lines = self._git_call('for-each-ref', args).strip().split('\n')
+ ref_commits = [line.split(' ') for line in lines]
+ return [(tuple(ref.rsplit('/', 2)[1:]), commit_id)
+ for ref, commit_id in ref_commits]
def _get_metadata_from_git(self, object_id):
args = ['commit', _format_ref(*object_id)]
diff --git a/gdatastore/index.py b/gdatastore/index.py
index f60125f..849824f 100644
--- a/gdatastore/index.py
+++ b/gdatastore/index.py
@@ -42,6 +42,7 @@ _VALUE_VERSION_ID = 1
_VALUE_MTIME = 2
_VALUE_SIZE = 3
_VALUE_CTIME = 4
+_VALUE_COMMIT_ID = 5
_STANDARD_VALUES = {
'creation_time': {'number': _VALUE_CTIME, 'type': float},
'filesize': {'number': _VALUE_SIZE, 'type': int},
@@ -240,13 +241,16 @@ class Index(object):
self._database.close()
self._database = None
- def contains(self, object_id):
+ def contains(self, object_id, commit_id=None):
postings = self._database.postlist(_object_id_term(object_id))
try:
- _ = postings.next()
+ doc_id = postings.next().docid
except StopIteration:
return False
- return True
+ if not commit_id:
+ return True
+ document = self._database.get_document(doc_id)
+ return document.get_value(_VALUE_COMMIT_ID) == commit_id
def delete(self, object_id):
writable_db = self._get_writable_db()
@@ -331,9 +335,10 @@ class Index(object):
# global_doc_id = (local_doc_id - 1) * num_databases + db_index + 1
ds_index = (doc_id - 1) % len(self._data_stores)
return {'metadata': deserialise_metadata(document.get_data()),
- 'data_store': self._data_stores[ds_index]}
+ 'data_store': self._data_stores[ds_index],
+ 'commit_id': document.get_value(_VALUE_COMMIT_ID)}
- def store(self, object_id, properties):
+ def store(self, object_id, properties, commit_id):
logging.debug('store(%r, %r)', object_id, properties)
assert (properties['tree_id'], properties['version_id']) == object_id
id_term = _object_id_term(object_id)
@@ -344,6 +349,7 @@ class Index(object):
term_generator = TermGenerator()
term_generator.index_document(document, properties)
assert (document.get_value(_VALUE_TREE_ID), document.get_value(_VALUE_VERSION_ID)) == object_id
+ document.add_value(_VALUE_COMMIT_ID, commit_id)
writable_db = self._get_writable_db()
writable_db.replace_document(id_term, document)
writable_db.commit()