From 5dc851866520362f31fb498fbe0d03166f68dae4 Mon Sep 17 00:00:00 2001 From: Sascha Silbe Date: Sat, 24 May 2014 19:24:48 +0000 Subject: Reindex entry on new commit (metadata update) When re-indexing entries, take the git commit id into account so that we notice entries we already know about but where the metadata has changed. The first re-index run after an update will need to process all entries in order to add the commit_id value to the index. This may take a long time. This change is backwards compatible. --- diff --git a/gdatastore/datastore.py b/gdatastore/datastore.py index a33aca8..345ca77 100644 --- a/gdatastore/datastore.py +++ b/gdatastore/datastore.py @@ -542,8 +542,8 @@ class InternalApi(object): old_metadata = self._index.retrieve(object_id)['metadata'] metadata['creation_time'] = old_metadata['creation_time'] - self._update_metadata_in_git(object_id, metadata) - self._index.store(object_id, metadata) + commit_id = self._update_metadata_in_git(object_id, metadata) + self._index.store(object_id, metadata, commit_id) self._invoke_callbacks('change_metadata', object_id, metadata) def delete(self, object_id): @@ -653,8 +653,9 @@ class InternalApi(object): # TODO: check metadata for validity first (index?) self._log_store((tree_id, child_id)) - self._store_entry_in_git(tree_id, child_id, parent_id, path, metadata) - self._index.store((tree_id, child_id), metadata) + commit_id = self._store_entry_in_git(tree_id, child_id, parent_id, + path, metadata) + self._index.store((tree_id, child_id), metadata, commit_id) self._invoke_callbacks('save', tree_id, child_id, parent_id, metadata) if delete_after and path: @@ -733,15 +734,17 @@ class InternalApi(object): Log the last object after finishing the rebuild. """ last_object_id = None - for object_id in self._get_object_ids_from_git(): + for object_id, commit_id in self._get_object_ids_from_git(): last_object_id = object_id - logging.debug('reindex(): checking entry %r', object_id) - if self._index.contains(object_id): + logging.debug('reindex(): checking entry %r (commit %r)', + object_id, commit_id) + if self._index.contains(object_id, commit_id): continue - logging.debug('reindex(): adding entry %r from git', object_id) + logging.debug('reindex(): (re-)adding entry %r from git', + object_id) metadata = self._get_metadata_from_git(object_id) - self._index.store(object_id, metadata) + self._index.store(object_id, metadata, commit_id) if last_object_id: self._log_store(last_object_id) @@ -801,6 +804,7 @@ class InternalApi(object): input=commit_message).strip() self._git_call('update-ref', [_format_ref(tree_id, version_id), commit_hash]) + return commit_hash def _write_tree(self, path): if not path: @@ -823,6 +827,7 @@ class InternalApi(object): commit_hash = self._git_call('commit-tree', ['-p', ref, tree_hash], input=commit_message).strip() self._git_call('update-ref', [ref, commit_hash]) + return commit_hash def _get_tree_hash(self, object_id): args = ['commit', _format_ref(*object_id)] @@ -830,10 +835,12 @@ class InternalApi(object): return self._git_call('cat-file', args).split('\n', 1)[0].split(' ')[1] def _get_object_ids_from_git(self): - args = ['--sort=committerdate', '--format=%(refname)', + args = ['--sort=committerdate', '--format=%(refname) %(objectname)', 'refs/gdatastore/*/*'] - return [tuple(line.rsplit('/', 2)[1:]) - for line in self._git_call('for-each-ref', args).split()] + lines = self._git_call('for-each-ref', args).strip().split('\n') + ref_commits = [line.split(' ') for line in lines] + return [(tuple(ref.rsplit('/', 2)[1:]), commit_id) + for ref, commit_id in ref_commits] def _get_metadata_from_git(self, object_id): args = ['commit', _format_ref(*object_id)] diff --git a/gdatastore/index.py b/gdatastore/index.py index f60125f..849824f 100644 --- a/gdatastore/index.py +++ b/gdatastore/index.py @@ -42,6 +42,7 @@ _VALUE_VERSION_ID = 1 _VALUE_MTIME = 2 _VALUE_SIZE = 3 _VALUE_CTIME = 4 +_VALUE_COMMIT_ID = 5 _STANDARD_VALUES = { 'creation_time': {'number': _VALUE_CTIME, 'type': float}, 'filesize': {'number': _VALUE_SIZE, 'type': int}, @@ -240,13 +241,16 @@ class Index(object): self._database.close() self._database = None - def contains(self, object_id): + def contains(self, object_id, commit_id=None): postings = self._database.postlist(_object_id_term(object_id)) try: - _ = postings.next() + doc_id = postings.next().docid except StopIteration: return False - return True + if not commit_id: + return True + document = self._database.get_document(doc_id) + return document.get_value(_VALUE_COMMIT_ID) == commit_id def delete(self, object_id): writable_db = self._get_writable_db() @@ -331,9 +335,10 @@ class Index(object): # global_doc_id = (local_doc_id - 1) * num_databases + db_index + 1 ds_index = (doc_id - 1) % len(self._data_stores) return {'metadata': deserialise_metadata(document.get_data()), - 'data_store': self._data_stores[ds_index]} + 'data_store': self._data_stores[ds_index], + 'commit_id': document.get_value(_VALUE_COMMIT_ID)} - def store(self, object_id, properties): + def store(self, object_id, properties, commit_id): logging.debug('store(%r, %r)', object_id, properties) assert (properties['tree_id'], properties['version_id']) == object_id id_term = _object_id_term(object_id) @@ -344,6 +349,7 @@ class Index(object): term_generator = TermGenerator() term_generator.index_document(document, properties) assert (document.get_value(_VALUE_TREE_ID), document.get_value(_VALUE_VERSION_ID)) == object_id + document.add_value(_VALUE_COMMIT_ID, commit_id) writable_db = self._get_writable_db() writable_db.replace_document(id_term, document) writable_db.commit() -- cgit v0.9.1