commit ea06d07882613332afa18e1e0fe2323ba2580b06 (patch)
Merge:  a9ef157802cd7c3884814bc55b6cff1a5886c66e a41d2d5ca21115f3b76a789f5874f35dca089a3d
tree    7f50caccf827f209bebb05171cfb3e816f82d4f8
Author:    Benjamin Saller <bcsaller@objectrealms.net>  2007-07-13 04:33:19 (GMT)
Committer: Benjamin Saller <bcsaller@objectrealms.net>  2007-07-13 04:33:19 (GMT)

    Merge branch 'dualxap'
-rwxr-xr-x  bin/datastore-service                        8
-rwxr-xr-x  bin/index-service                          173
-rwxr-xr-x  bin/sample-client.py                        70
-rw-r--r--  etc/Makefile.am                              7
-rw-r--r--  etc/org.laptop.sugar.Indexer.service.in      3
-rw-r--r--  src/olpc/datastore/__init__.py               4
-rw-r--r--  src/olpc/datastore/backingstore.py         118
-rw-r--r--  src/olpc/datastore/converter.py              4
-rw-r--r--  src/olpc/datastore/datastore.py             97
-rw-r--r--  src/olpc/datastore/indexer.py               47
-rw-r--r--  src/olpc/datastore/model.py                528
-rw-r--r--  src/olpc/datastore/query.py                642
-rw-r--r--  src/olpc/datastore/xapianindex.py          390
-rw-r--r--  tests/Makefile                               4
-rwxr-xr-x  tests/cleaner.py                            40
-rw-r--r--  tests/milestone_1.txt                       24
-rw-r--r--  tests/milestone_2.txt                       16
-rw-r--r--  tests/mountpoints.txt                       15
-rw-r--r--  tests/properties.txt                        22
-rw-r--r--  tests/query.txt                            277
-rw-r--r--  tests/runalltests.py                        48
-rw-r--r--  tests/sugar_demo_may17.txt                   7
-rw-r--r--  tests/test_backingstore.py                  30
-rw-r--r--  tests/test_model.py                         34
-rw-r--r--  tests/test_xapianindex.py                   90
-rw-r--r--  tests/testutils.py                           5
-rw-r--r--  tests/xapianindex.txt                       73

27 files changed, 1097 insertions(+), 1679 deletions(-)
diff --git a/bin/datastore-service b/bin/datastore-service
index 4300619..532516b 100755
--- a/bin/datastore-service
+++ b/bin/datastore-service
@@ -4,7 +4,6 @@ import gobject
 import dbus.service
 import dbus.mainloop.glib
 from olpc.datastore import DataStore, DS_LOG_CHANNEL, backingstore
-from olpc.datastore.indexer import INDEX_SERVICE, INDEX_OBJECT_PATH
 import logging
 
 SYNC_INDEX = True
@@ -32,8 +31,6 @@ logging.basicConfig(level=logging.DEBUG,
                     filename = filename,
                     )
 # disable subsystem logging except where critical
-logging.getLogger('sqlalchemy').setLevel(logging.CRITICAL)
-logging.getLogger('lemur').setLevel(logging.CRITICAL)
 logger = logging.getLogger(DS_LOG_CHANNEL)
 
 # check for old lockfiles, the rules here are that we can't be
@@ -53,7 +50,7 @@ bus = dbus.SessionBus()
 ds = DataStore()
 ds.registerBackend(backingstore.FileBackingStore)
 ds.registerBackend(backingstore.InplaceFileBackingStore)
-ds.mount(repo_dir, {'querymanager_sync_index': SYNC_INDEX})
+ds.mount(repo_dir, {'indexmanager.sync_index': SYNC_INDEX})
 
 # and run it
 logger.info("Starting Datastore %s" % (repo_dir))
@@ -68,9 +65,6 @@ signal.signal(signal.SIGHUP, handle_shutdown)
 signal.signal(signal.SIGTERM, handle_shutdown)
 
 def main():
-    if SYNC_INDEX is False:
-        indexer = bus.get_object(INDEX_SERVICE, INDEX_OBJECT_PATH)
-
     try:
         mainloop.run()
     except KeyboardInterrupt:
         ds.stop()
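The mount options now carry a dotted 'indexmanager.' prefix instead of the old 'querymanager_' one, and backingstore.py (below) routes them through utils.options_for. The body of that helper is not part of this diff, so the following is only a sketch of the filtering behavior it appears to implement, including the invert flag used later to extract the non-prefixed remainder for the mount descriptor:

    def options_for(options, prefix, invert=False):
        # hypothetical reimplementation: keep (and strip) prefixed keys,
        # or, with invert=True, keep only the keys without the prefix
        d = {}
        for key, value in options.items():
            if key.startswith(prefix):
                if not invert: d[key[len(prefix):]] = value
            elif invert:
                d[key] = value
        return d

    # {'indexmanager.sync_index': True} would reach IndexManager.connect
    # as {'sync_index': True}
    print options_for({'indexmanager.sync_index': True}, 'indexmanager.')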
diff --git a/bin/index-service b/bin/index-service
deleted file mode 100755
index a2ff83c..0000000
--- a/bin/index-service
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python
-
-""" Async index service for the Datastore.
-
-Subscribes to the create/update/delete messages of the Datastore and
-performs the indexing. When this service is enabled the Datastore
-access the Xapian repository in read only mode.
-"""
-
-
-try: from ore.main import Application
-except ImportError: Application = object
-
-from olpc.datastore.datastore import DS_SERVICE, DS_OBJECT_PATH
-from olpc.datastore.datastore import DS_DBUS_INTERFACE
-from olpc.datastore.indexer import Indexer
-import dbus
-import dbus.mainloop.glib
-import logging
-import sys
-import os
-import signal
-
-profile = os.environ.get('SUGAR_PROFILE', 'default')
-base_dir = os.path.join(os.path.expanduser('~'), '.sugar', profile)
-repo_dir = os.path.join(base_dir, 'datastore')
-fulltext_dir = os.path.join(repo_dir, 'fulltext')
-
-log_dir = os.path.join(base_dir, "logs")
-if not os.path.exists(log_dir): os.makedirs(log_dir)
-
-os.chdir(repo_dir)
-
-# setup logger
-filename = None
-if not sys.stdin.isatty():
-    filename = os.path.join(log_dir, "indexer.log")
-logging.basicConfig(level=logging.DEBUG,
-                    format="%(asctime)-15s %(levelname)s: %(message)s",
-                    filename = filename,
-                    )
-
-logger = logging.getLogger('org.laptop.sugar.Indexer')
-logger.setLevel(logging.DEBUG)
-
-class IndexService(Application):
-    def manage_options(self):
-        self.parser.add_option("--olpc.fulltext.repo",
-                               dest="fulltext_dir",
-                               action="store", default='fulltext',
-                               help="""Location of the FullText Repository""")
-
-    def main(self):
-        logging.debug('Starting the index service at %s' % self.options.fulltext_dir)
-        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
-        bus = dbus.SessionBus()
-        self.fulltext = Indexer(self.options.fulltext_dir)
-        self.fulltext.use_fulltext = True
-
-        ds = bus.get_object(DS_SERVICE, DS_OBJECT_PATH)
-        self.ds = dbus.Interface(ds, dbus_interface=DS_DBUS_INTERFACE)
-
-        self.ds.connect_to_signal("Created", self.created,
-                                  dbus_interface=DS_DBUS_INTERFACE)
-
-        self.ds.connect_to_signal("Updated", self.updated,
-                                  dbus_interface=DS_DBUS_INTERFACE)
-
-        self.ds.connect_to_signal("Deleted", self.deleted,
-                                  dbus_interface=DS_DBUS_INTERFACE)
-
-        self.ds.connect_to_signal("Stopped", self.stopped,
-                                  dbus_interface=DS_DBUS_INTERFACE)
-
-        self.eventloop.run()
-
-    def get_textprops(self, uid):
-        # text properties also get full text indexing
-        # currently this is still searched with the 'fulltext'
-        # parameter of find()
-        textprops = {}
-        for k,v in self.ds.get_properties(uid, dict(type='text')).items():
-            textprops[str(k)] = v and str(v) or ''
-        return textprops
-
-    def created(self, uid):
-        """An object was created on the bus and we want to index it"""
-        # because the file isn't encoded anywhere accessible in the
-        # create call we must actually get the filename and trigger
-        # the indexing on that
-        filename = self.ds.get_filename(uid)
-        r = None
-        if filename:
-            mime_type = self.ds.get_properties(uid, {}).get('mime_type', None)
-            r = self.fulltext.fulltext_index(uid, filename, mime_type,
-                                             self.get_textprops(uid))
-        if r is True:
-            logger.debug("index creation of %s" % uid)
-        elif r is False:
-            logger.debug("unable to index creation of %s" % uid)
-        else:
-            logger.debug("nothing to index on creation of %s" % uid)
-
-    def updated(self, uid):
-        """An object was updated on the bus and we want to index it"""
-        # because the file isn't encoded anywhere accessible in the
-        # create call we must actually get the filename and trigger
-        # the indexing on that
-        filename = self.ds.get_filename(uid)
-        r = None
-        if filename:
-            mime_type = self.ds.get_properties(uid, {}).get('mime_type',
-                                                            None)
-            r = self.fulltext.fulltext_index(uid, filename, mime_type,
                                             self.get_textprops(uid))
-        if r is True:
-            logger.debug("index update of %s" % uid)
-        elif r is False:
-            logger.debug("unable to index update of %s" % uid)
-        else:
-            logger.debug("nothing to index on update of %s" % uid)
-
-    def deleted(self, uid):
-        """An object was updated on the bus and we want to index it"""
-        # because the file isn't encoded anywhere accessible in the
-        # create call we must actually get the filename and trigger
-        # the indexing on that
-        try:
-            self.fulltext.fulltext_unindex(uid)
-            logger.debug("unindex deletion of %s" % uid);
-        except KeyError: pass
-
-    def stopped(self):
-        """Respond to the datastore being stopped by shutting down
-        ourselves"""
-        self.fulltext.stop()
-        self.eventloop.quit()
-
-
-if __name__ == "__main__":
-    def handle_shutdown(signum, frame):
-        idx.stopped()
-        print "shutdown cleanly"
-        raise SystemExit("Shutting down on signal %s" % signum)
-
-    signal.signal(signal.SIGHUP, handle_shutdown)
-    signal.signal(signal.SIGTERM, handle_shutdown)
-
-    idx = IndexService()
-    #idx()
-    # w/o ore.main
-    import gobject
-    idx.eventloop = gobject.MainLoop()
-    class options(object): pass
-    o = options()
-    o.fulltext_dir = 'fulltext'
-    idx.options = o
-    try:
-        idx.main()
-    except:
-        # force logging this one
-        logger.setLevel(logging.DEBUG)
-        logger.debug("Problem in index service",
-                     exc_info=sys.exc_info())
-        idx.stopped()
diff --git a/bin/sample-client.py b/bin/sample-client.py
index bd609a7..2a2b19c 100755
--- a/bin/sample-client.py
+++ b/bin/sample-client.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
-from ore.main import Application
 import dbus
 import os
-import time
 
 def main():
     bus = dbus.SessionBus()
@@ -10,36 +8,62 @@ def main():
                                    "/org/laptop/sugar/DataStore")
     uid = datastore.create(dict(title="from dbus", author="Benjamin"),
                            os.path.abspath('tests/test.pdf'))
-    print "created uid", uid
+    print "created uid", uid, "with binary content"
 
-    #for u in datastore.find()[0]:
-    #    datastore.delete(u['uid'])
-    #return
-    # let the async indexer run
-    time.sleep(1.2)
-    #import pdb;pdb.set_trace()
-    print "find", datastore.find(dict(author="Benjamin", title="from"))
+    datastore.complete_indexing()
 
+    res, count = datastore.find(dict(fulltext="peek"))
-    if not res:
-        print "unable to index content"
-        #return
-    print "bcsaller", [item['uid'] for item in res]
+    assert count == 1, "failed to index content"
+    assert res[0]['uid'] == uid, "returned incorrect results"
+    print "found inside binary file :: PDF"
+
+    assert datastore.find(dict(fulltext="kfdshaksjd"))[1] == 0
+    print "successfully ignored bad searches"
+
+    # try the other mimetypes
+    datastore.update(uid, dict(title="updated title",
+                               mime_type="application/msword"),
+                     os.path.abspath('tests/test.doc'))
-    print "huh?", datastore.find(dict(fulltext="kfdshaksjd"))
+    datastore.complete_indexing()
+
+    assert datastore.find(dict(fulltext="inside"))[0][0]['uid'] == uid
+    print "found in binary file :: WORD"
+
+    datastore.update(uid, dict(title="another updated title",
+                               mime_type="application/vnd.oasis.opendocument.text"),
+                     os.path.abspath('tests/test.odt'))
+    datastore.complete_indexing()
+
+    assert datastore.find(dict(fulltext="amazed"))[0][0]['uid'] == uid
+    print "found in binary file :: ODT"
 
-    # try the other mimetypes
-    datastore.update(uid, dict(title="updated title", mime_type="application/msword"), os.path.abspath('tests/test.doc'))
-    print datastore.find(dict(fulltext="inside"))
-    datastore.update(uid, dict(title="another updated title", mime_type="application/vnd.oasis.opendocument.text"), os.path.abspath('tests/test.odt'))
-    print datastore.find(dict(fulltext="amazed"))
     datastore.get_properties(uid)
-    print "title in fulltext", datastore.find(dict(fulltext="another"))
-
+    assert datastore.find(dict(title="another"))[0][0]['uid'] == uid
+    print "found title using dict params",
+
+    assert datastore.find("another")[0][0]['uid'] == uid
+    print "found title in search of all fields (as text)"
+
+    assert datastore.find('title:"another"')[0][0]['uid'] == uid
+    print "field in query field:'value' "
+
+    datastore.delete(uid)
+    datastore.complete_indexing()
+
+    print "deleted", uid
+    try: datastore.get_properties(uid)
+    except: pass
+    else:
+        print "Found deleted value... oops"
+        raise KeyError(uid)
+
+    print "ALL GOOD"
 
 if __name__ == '__main__':
+    #from ore.main import Application
     #a = Application("client", main)
     #a.plugins.append('ore.main.profile_support.ProfileSupport')
     #a()
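The reworked client also documents the three query forms that find() now accepts; complete_indexing() is the new synchronization point that replaces the old time.sleep(1.2) guess when waiting on the asynchronous indexer. In condensed form (datastore as in the script above):

    results, count = datastore.find(dict(title="another"))  # property dict
    results, count = datastore.find("another")              # free text over all fields
    results, count = datastore.find('title:"another"')      # field-scoped query syntax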
diff --git a/etc/Makefile.am b/etc/Makefile.am
index 1d8a54c..a9b28b1 100644
--- a/etc/Makefile.am
+++ b/etc/Makefile.am
@@ -1,15 +1,12 @@
 servicedir = $(datadir)/dbus-1/services
 
 service_in_files = \
-	org.laptop.sugar.DataStore.service.in \
-	org.laptop.sugar.Indexer.service.in
+	org.laptop.sugar.DataStore.service.in
+
 service_DATA = $(service_in_files:.service.in=.service)
 
 org.laptop.sugar.DataStore.service: org.laptop.sugar.DataStore.service.in
 	@sed -e "s|\@bindir\@|$(bindir)|" $< > $@
 
-org.laptop.sugar.Indexer.service: org.laptop.sugar.Indexer.service.in
-	@sed -e "s|\@bindir\@|$(bindir)|" $< > $@
-
 DISTCLEANFILES = $(service_DATA)
 EXTRA_DIST = $(service_in_files)

diff --git a/etc/org.laptop.sugar.Indexer.service.in b/etc/org.laptop.sugar.Indexer.service.in
deleted file mode 100644
index fb0a7ec..0000000
--- a/etc/org.laptop.sugar.Indexer.service.in
+++ /dev/null
@@ -1,3 +0,0 @@
-[D-BUS Service]
-Name = org.laptop.sugar.Indexer
-Exec = @bindir@/index-service

diff --git a/src/olpc/datastore/__init__.py b/src/olpc/datastore/__init__.py
index d38dcff..fd38d75 100644
--- a/src/olpc/datastore/__init__.py
+++ b/src/olpc/datastore/__init__.py
@@ -1,7 +1,5 @@
 # datastore package
+from olpc.datastore.datastore import DataStore, DS_LOG_CHANNEL
-from olpc.datastore.datastore import DataStore, DS_LOG_CHANNEL
-from olpc.datastore.backingstore import FileBackingStore
-from olpc.datastore.query import DefaultQueryManager

diff --git a/src/olpc/datastore/backingstore.py b/src/olpc/datastore/backingstore.py
index b0a05ad..354426e 100644
--- a/src/olpc/datastore/backingstore.py
+++ b/src/olpc/datastore/backingstore.py
@@ -17,7 +17,7 @@ import re
 import subprocess
 import time
 
-from olpc.datastore import query
+from olpc.datastore.xapianindex import IndexManager
 from olpc.datastore import utils
 
 # changing this pattern impacts _targetFile
@@ -75,7 +75,7 @@ class BackingStore(object):
     def load(self):
         """load the index for a given mount-point, then initialize
         its fulltext subsystem. This is the routine that will bootstrap
-        the querymanager (though create() may have just created it)
+        the indexmanager (though create() may have just created it)
         """
         pass
 
@@ -121,11 +121,11 @@ class FileBackingStore(BackingStore):
         """ FileSystemStore(path=<root of managed storage>)
         """
         self.options = kwargs
-        self.local_querymanager = self.options.get('local_querymanager', True)
+        self.local_indexmanager = self.options.get('local_indexmanager', True)
 
         self.uri = uri
         self.base = os.path.join(uri, self.STORE_NAME)
-        self.querymanager = None
+        self.indexmanager = None
 
     # Informational
     def descriptor(self):
@@ -190,47 +190,40 @@ class FileBackingStore(BackingStore):
         if not os.path.exists(self.base):
             os.makedirs(self.base)
 
-        # examine options and see what the querymanager plan is
-        if self.local_querymanager:
-            # create a local storage using the querymanager
+        # examine options and see what the indexmanager plan is
+        if self.local_indexmanager:
+            # create a local storage using the indexmanager
             # otherwise we will connect the global manager
             # in load
             index_name = os.path.join(self.base, self.INDEX_NAME)
-            options = utils.options_for(self.options, 'querymanager_')
-            if 'fulltext_repo' not in options:
-                options['fulltext_repo'] = os.path.join(self.base,
-                                                        query.DefaultQueryManager.FULLTEXT_NAME)
-
-            qm = query.DefaultQueryManager(index_name, **options)
+            options = utils.options_for(self.options, 'indexmanager.')
+            im = IndexManager()
 
             # This will ensure the fulltext and so on are all assigned
-            qm.bind_to(self)
-            qm.prepare()
+            im.bind_to(self)
+            im.connect(index_name, **options)
 
             self.create_descriptor(**options)
-            self.querymanager = qm
+            self.indexmanager = im
 
     def load(self):
-        if not self.querymanager and self.local_querymanager:
-            # create a local storage using the querymanager
+        if not self.indexmanager and self.local_indexmanager:
+            # create a local storage using the indexmanager
            # otherwise we will connect the global manager
            # in load
            index_name = os.path.join(self.base, self.INDEX_NAME)
-           options = utils.options_for(self.options, 'querymanager_')
-           if 'fulltext_repo' not in self.options:
-               options['fulltext_repo'] = os.path.join(self.base,
-                                                       query.DefaultQueryManager.FULLTEXT_NAME)
-
-           qm = query.DefaultQueryManager(index_name, **options)
+           options = utils.options_for(self.options, 'indexmanager.')
+           im = IndexManager()
 
            desc = utils.options_for(self.options,
-                                    'querymanager_', invert=True)
+                                    'indexmanager.',
+                                    invert=True)
            if desc: self.create_descriptor(**desc)
 
            # This will ensure the fulltext and so on are all assigned
-           qm.bind_to(self)
-           qm.prepare()
+           im.bind_to(self)
+           im.connect(index_name)
 
-           self.querymanager = qm
+           self.indexmanager = im
 
     def bind_to(self, datastore):
         ## signal from datastore that we are being bound to it
@@ -283,7 +276,7 @@ class FileBackingStore(BackingStore):
         # env would contain things like cwd if we wanted to map to a
         # known space
 
-        content = self.querymanager.get(uid)
+        content = self.indexmanager.get(uid)
         # we need to map a copy of the content from the backingstore into the
         # activities addressable space.
         # map this to a rw file
@@ -316,7 +309,7 @@ class FileBackingStore(BackingStore):
             fp.write(line)
         fp.close()
         if verify:
-            content = self.querymanager.get(uid)
+            content = self.indexmanager.get(uid)
             content.checksum = c.hexdigest()
 
     def _checksum(self, filename):
@@ -329,18 +322,18 @@ class FileBackingStore(BackingStore):
     # File Management API
     def create(self, props, filelike):
-        content = self.querymanager.create(props, filelike)
+        uid = self.indexmanager.index(props, filelike)
         filename = filelike
         if filelike:
             if isinstance(filelike, basestring):
                 # lets treat it as a filename
                 filelike = open(filelike, "r")
             filelike.seek(0)
-            self._writeContent(content.id, filelike, replace=False)
-        return content
+            self._writeContent(uid, filelike, replace=False)
+        return uid
 
     def get(self, uid, env=None, allowMissing=False):
-        content = self.querymanager.get(uid)
+        content = self.indexmanager.get(uid)
         if not content: raise KeyError(uid)
         path = self._translatePath(uid)
         fp = None
@@ -352,7 +345,9 @@ class FileBackingStore(BackingStore):
         return self._mapContent(uid, fp, path, env)
 
     def update(self, uid, props, filelike=None):
-        self.querymanager.update(uid, props, filelike)
+        if 'uid' not in props: props['uid'] = uid
+
+        self.indexmanager.index(props, filelike)
         filename = filelike
         if filelike:
             if isinstance(filelike, basestring):
@@ -365,7 +360,7 @@ class FileBackingStore(BackingStore):
             self._writeContent(uid, filelike)
 
     def delete(self, uid, allowMissing=True):
-        self.querymanager.delete(uid)
+        self.indexmanager.delete(uid)
         path = self._translatePath(uid)
         if os.path.exists(path):
             os.unlink(path)
@@ -374,21 +369,23 @@ class FileBackingStore(BackingStore):
             raise KeyError("object for uid:%s missing" % uid)
 
     def get_uniquevaluesfor(self, propertyname):
-        return self.querymanager.get_uniquevaluesfor(propertyname)
+        return self.indexmanager.get_uniquevaluesfor(propertyname)
 
     def find(self, query):
-        return self.querymanager.find(query)
+        return self.indexmanager.search(query)
 
     def stop(self):
-        self.querymanager.stop()
-
+        self.indexmanager.stop()
+
+    def complete_indexing(self):
+        self.indexmanager.complete_indexing()
 
 class InplaceFileBackingStore(FileBackingStore):
     """Like the normal FileBackingStore this Backingstore manages the
     storage of files, but doesn't move files into a repository.
     There are no working copies. It simply adds index data through its
-    querymanager and provides fulltext ontop of a regular
+    indexmanager and provides fulltext ontop of a regular
     filesystem.
     It does record its metadata relative to this mount point.
@@ -434,45 +431,48 @@ class InplaceFileBackingStore(FileBackingStore):
             for fn in filenames:
                 source = os.path.join(dirpath, fn)
                 relative = source[len(self.uri)+1:]
-                result, count = self.querymanager.find(dict(filename=relative))
+
+                result, count = self.indexmanager.search(dict(filename=relative))
                 if not count:
                     # create a new record
                     self.create(dict(filename=relative), source)
                 else:
                     # update the object with the new content iif the
                     # checksum is different
-                    # XXX: what if there is more than one? (shouldn't happen)
-                    content = result[0]
-                    uid = content
+                    # XXX: what if there is more than one? (shouldn't
+                    # happen)
+                    content = result.next()
+                    uid = content.id
                    # only if the checksum is different
-                   checksum = self._checksum(source)
-                   if checksum != content.checksum:
-                       self.update(uid, dict(filename=relative), source)
-
-        #self.querymanager.index.flush()
+                   #checksum = self._checksum(source)
+                   #if checksum != content.checksum:
+                   self.update(uid, dict(filename=relative), source)
 
-
+        if self.options.get('sync_mount', False):
+            self.complete_indexing()
+
     # File Management API
     def create(self, props, filelike):
         # the file would have already been changed inplace
         # don't touch it
-        return self.querymanager.create(props, filelike)
+        return self.indexmanager.index(props, filelike)
 
     def get(self, uid, env=None, allowMissing=False):
-        content = self.querymanager.get(uid)
+        content = self.indexmanager.get(uid)
         if not content: raise KeyError(uid)
         return content.get_property('filename')
 
     def update(self, uid, props, filelike=None):
         # the file would have already been changed inplace
         # don't touch it
-        self.querymanager.update(uid, props, filelike)
+        props['uid'] = uid
+        self.indexmanager.index(props, filelike)
 
-    def delete(self, uid, allowMissing=True):
-        c = self.querymanager.get(uid)
-        path = c.get_property('filename')
-        self.querymanager.delete(uid)
-        if os.path.exists(path):
+    def delete(self, uid):
+        c = self.indexmanager.get(uid)
+        path = c.get_property('filename', None)
+        self.indexmanager.delete(uid)
+        if path and os.path.exists(path):
             os.unlink(path)

diff --git a/src/olpc/datastore/converter.py b/src/olpc/datastore/converter.py
index 1250dbb..6f0ede6 100644
--- a/src/olpc/datastore/converter.py
+++ b/src/olpc/datastore/converter.py
@@ -95,11 +95,13 @@ class Converter(object):
         # maps both extension -> plugin
         # and mimetype -> plugin
         self._converters = {}
+        self._default = None
         self.logger = logging.getLogger('org.laptop.sugar.Indexer')
 
     def registerConverter(self, ext_or_mime, plugin):
         if plugin.verify():
             self._converters[ext_or_mime] = plugin
+            if self._default is None: self._default = plugin
 
     def __call__(self, filename, encoding=None, mimetype=None):
         """Convert filename's content to utf-8 encoded text."""
@@ -119,6 +121,8 @@ class Converter(object):
         converter = self._converters.get(mt)
         if not converter:
             converter = self._converters.get(ext)
+        if not converter:
+            converter = self._default
         if converter:
             try:
                 return converter(filename)
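With this change the first plugin that passes verify() doubles as the fallback, and lookup order becomes: exact mimetype, then file extension, then the default. A minimal sketch of that resolution chain (the function name here is illustrative, not from the module):

    def pick_converter(converters, default, mimetype, ext):
        # mimetype match wins, extension is second, the registered
        # default catches everything else
        return converters.get(mimetype) or converters.get(ext) or default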
We provide a facility for tracking ## co-authors of content ## there are associated changes to 'find' to resolve buddies - def addBuddy(self, id, name, fg_color, bg_color): - pass + def addBuddy(self, id, name, fg_color, bg_color, mountpoint=None): + mp = None + if mountpoint is None: mp = self.root + else: mp = self.mountpoints.get(mountpoint) + if mp is None: raise ValueError("Invalid mountpoint") + mp.addBuddy(id, name, fg_color, bg_color) + + def getBuddy(self, bid): + """Get a buddy by its id""" + b = None + for mp in self.mountpoints.itervalues(): + b = mp.getBuddy(bid) + if b: break + return b - def getBuddy(self, id): - pass def buddies(self): - pass + buddies = set() + for mp in self.mountpoints.itervalues(): + buddies = buddies.union(mp.getBuddies()) + return buddies + ## end buddy api @@ -146,7 +155,7 @@ class DataStore(dbus.service.Object): def _resolveMountpoint(self, mountpoint=None): if isinstance(mountpoint, dict): - mountpoint = mountpoint.get('mountpoint') + mountpoint = mountpoint.pop('mountpoint', None) if mountpoint is not None: # this should be the id of a mount point @@ -173,26 +182,15 @@ class DataStore(dbus.service.Object): over this process can come at a later time. """ mp = self._resolveMountpoint(props) - content = mp.create(props, filelike) - self.Created(content.id) - logging.debug("created %s" % content.id) + uid = mp.create(props, filelike) + self.Created(uid) + logging.debug("created %s" % uid) - return content.id + return uid @dbus.service.signal(DS_DBUS_INTERFACE, signature="s") def Created(self, uid): pass - - @dbus.service.method(DS_DBUS_INTERFACE, - in_signature='', - out_signature='as') - def all(self): - # workaround for not having optional args or None in - # DBus .. blah - results = self.querymanager.find() - return [r.id for r in results] - - def _multiway_search(self, query): mountpoints = query.pop('mountpoints', self.mountpoints) mountpoints = [self.mountpoints[str(m)] for m in mountpoints] @@ -306,9 +304,8 @@ class DataStore(dbus.service.Object): d = [] for r in results: props = {} - for prop in r.get_properties(): - props[prop.key] = prop.marshall() - + props.update(r.properties) + if 'uid' not in props: props['uid'] = r.id @@ -317,7 +314,7 @@ class DataStore(dbus.service.Object): filename = '' if include_files : - try: filename = self.backingstore.get(r.id).filename + try: filename = r.filename except KeyError: pass props['filename'] = filename d.append(props) @@ -344,25 +341,13 @@ class DataStore(dbus.service.Object): except AttributeError: pass return '' - def get_data(self, uid): - content = self.get(uid) - if content: - return content.get_data() - - def put_data(self, uid, data): - self.update(uid, None, StringIO(data)) - #@utils.sanitize_dbus @dbus.service.method(DS_DBUS_INTERFACE, - in_signature='sa{sv}', + in_signature='s', out_signature='a{sv}') - def get_properties(self, uid, query=None): + def get_properties(self, uid): content = self.get(uid) - dictionary = {} - if not query: query = {} - for prop in content.get_properties(**query): - dictionary[prop.key] = prop.marshall() - return dictionary + return content.properties @dbus.service.method(DS_DBUS_INTERFACE, in_signature='sa{sv}', @@ -372,7 +357,7 @@ class DataStore(dbus.service.Object): mountpoints = query.pop('mountpoints', self.mountpoints) mountpoints = [self.mountpoints[str(m)] for m in mountpoints] results = set() - + for mp in mountpoints: result = mp.get_uniquevaluesfor(propertyname) results = results.union(result) @@ -405,8 +390,8 @@ class DataStore(dbus.service.Object): 
content = self.get(uid) if content: content.backingstore.delete(uid) - self.Deleted(content.id) - logger.debug("deleted %s" % content.id) + self.Deleted(uid) + logger.debug("deleted %s" % uid) @dbus.service.signal(DS_DBUS_INTERFACE, signature="s") def Deleted(self, uid): pass @@ -421,4 +406,12 @@ class DataStore(dbus.service.Object): @dbus.service.signal(DS_DBUS_INTERFACE) def Stopped(self): pass - + @dbus.service.method(DS_DBUS_INTERFACE, + in_signature='', + out_signature='') + def complete_indexing(self): + """Block waiting for all queued indexing operations to + complete. Used mostly in testing""" + for mp in self.mountpoints.itervalues(): + mp.complete_indexing() + diff --git a/src/olpc/datastore/indexer.py b/src/olpc/datastore/indexer.py deleted file mode 100644 index de7ef33..0000000 --- a/src/olpc/datastore/indexer.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -indexer -~~~~~~~~~~~~~~~~~~~~ -fulltext index module - -""" - -__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>' -__docformat__ = 'restructuredtext' -__copyright__ = 'Copyright ObjectRealms, LLC, 2007' -__license__ = 'The GNU Public License V2+' - - -# the name used by the logger -import logging -import dbus.service -import dbus.mainloop.glib -from olpc.datastore.query import XapianFulltext - -INDEX_LOG_CHANNEL = 'org.laptop.sugar.Indexer' - -INDEX_SERVICE = "org.laptop.sugar.Indexer" -INDEX_DBUS_INTERFACE = "org.laptop.sugar.Indexer" -INDEX_OBJECT_PATH = "/org/laptop/sugar/Indexer" - -logger = logging.getLogger(INDEX_LOG_CHANNEL) - -class Indexer(dbus.service.Object, XapianFulltext): - # This object doesn't really publish an interface right now - # Its a bus object so that dbus can start it automatically - # when the datastore requests a binding to it - def __init__(self, repo='fulltext'): - # global handle to the main look - dbus.mainloop.glib.DBusGMainLoop(set_as_default=True) - session_bus = dbus.SessionBus() - - self.bus_name = dbus.service.BusName(INDEX_SERVICE, - bus=session_bus, - replace_existing=True, - allow_replacement=True) - dbus.service.Object.__init__(self, self.bus_name, INDEX_OBJECT_PATH) - - - self.connect_fulltext(repo, read_only=False) - - - diff --git a/src/olpc/datastore/model.py b/src/olpc/datastore/model.py index 8c8ab05..18d409f 100644 --- a/src/olpc/datastore/model.py +++ b/src/olpc/datastore/model.py @@ -10,17 +10,11 @@ __docformat__ = 'restructuredtext' __copyright__ = 'Copyright ObjectRealms, LLC, 2007' __license__ = 'The GNU Public License V2+' -from sqlalchemy import Table, Column, UniqueConstraint -from sqlalchemy import String, Integer, Unicode -from sqlalchemy import ForeignKey, Sequence, Index -from sqlalchemy import mapper, relation -from sqlalchemy import create_session -from sqlalchemy import MapperExtension, EXT_PASS, clear_mappers - import datetime import mimetypes import os import time +import warnings # XXX: Open issues # list properties - Contributors (a, b, c) @@ -28,51 +22,139 @@ import time # content state - searches don't include content deletion flag # - not recording if content is on other storage yet - -# we have a global thread local session factory -context = {} propertyTypes = {} _marker = object() -def get_session(backingstore): - return context[backingstore] +def registerPropertyType(kind, get, set, xapian_sort_type=None, defaults=None): + propertyTypes[kind] = PropertyImpl(get, set, xapian_sort_type, defaults) -def registerPropertyType(kind, class_): propertyTypes[kind] = class_ def propertyByKind(kind): return propertyTypes[kind] +class PropertyImpl(object): + 
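One subtle fix above: _resolveMountpoint now pops 'mountpoint' rather than reading it, because the same dict is subsequently reused as the property query, and popping keeps the routing key from leaking into the index search. For example:

    query = {'mountpoint': 'local', 'title': 'another'}
    mountpoint = query.pop('mountpoint', None)   # -> 'local'
    # query is now {'title': 'another'} and safe to pass on to find()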
diff --git a/src/olpc/datastore/indexer.py b/src/olpc/datastore/indexer.py
deleted file mode 100644
index de7ef33..0000000
--- a/src/olpc/datastore/indexer.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-indexer
-~~~~~~~~~~~~~~~~~~~~
-fulltext index module
-
-"""
-
-__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>'
-__docformat__ = 'restructuredtext'
-__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
-__license__ = 'The GNU Public License V2+'
-
-
-# the name used by the logger
-import logging
-import dbus.service
-import dbus.mainloop.glib
-from olpc.datastore.query import XapianFulltext
-
-INDEX_LOG_CHANNEL = 'org.laptop.sugar.Indexer'
-
-INDEX_SERVICE = "org.laptop.sugar.Indexer"
-INDEX_DBUS_INTERFACE = "org.laptop.sugar.Indexer"
-INDEX_OBJECT_PATH = "/org/laptop/sugar/Indexer"
-
-logger = logging.getLogger(INDEX_LOG_CHANNEL)
-
-class Indexer(dbus.service.Object, XapianFulltext):
-    # This object doesn't really publish an interface right now
-    # Its a bus object so that dbus can start it automatically
-    # when the datastore requests a binding to it
-    def __init__(self, repo='fulltext'):
-        # global handle to the main look
-        dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
-        session_bus = dbus.SessionBus()
-
-        self.bus_name = dbus.service.BusName(INDEX_SERVICE,
-                                             bus=session_bus,
-                                             replace_existing=True,
-                                             allow_replacement=True)
-        dbus.service.Object.__init__(self, self.bus_name, INDEX_OBJECT_PATH)
-
-        self.connect_fulltext(repo, read_only=False)

diff --git a/src/olpc/datastore/model.py b/src/olpc/datastore/model.py
index 8c8ab05..18d409f 100644
--- a/src/olpc/datastore/model.py
+++ b/src/olpc/datastore/model.py
@@ -10,17 +10,11 @@ __docformat__ = 'restructuredtext'
 __copyright__ = 'Copyright ObjectRealms, LLC, 2007'
 __license__ = 'The GNU Public License V2+'
 
-from sqlalchemy import Table, Column, UniqueConstraint
-from sqlalchemy import String, Integer, Unicode
-from sqlalchemy import ForeignKey, Sequence, Index
-from sqlalchemy import mapper, relation
-from sqlalchemy import create_session
-from sqlalchemy import MapperExtension, EXT_PASS, clear_mappers
-
 import datetime
 import mimetypes
 import os
 import time
+import warnings
 
 # XXX: Open issues
 # list properties - Contributors (a, b, c)
@@ -28,51 +22,139 @@ import time
 # content state - searches don't include content deletion flag
 #               - not recording if content is on other storage yet
 
-
-# we have a global thread local session factory
-context = {}
 propertyTypes = {}
 _marker = object()
 
-def get_session(backingstore):
-    return context[backingstore]
+def registerPropertyType(kind, get, set, xapian_sort_type=None, defaults=None):
+    propertyTypes[kind] = PropertyImpl(get, set, xapian_sort_type, defaults)
 
-def registerPropertyType(kind, class_): propertyTypes[kind] = class_
 def propertyByKind(kind): return propertyTypes[kind]
 
+class PropertyImpl(object):
+    __slots__ = ('_get', '_set', 'xapian_sort_type', 'defaults')
+
+    def __init__(self, get, set, xapian_sort_type=None, defaults=None):
+        self._get, self._set = get, set
+        self.xapian_sort_type = xapian_sort_type
+        self.defaults = defaults
+
+    def get(self, value): return self._get(value)
+    def set(self, value): return self._set(value)
+
+class Property(object):
+    """Light-weight property implementation.
+    Handles typed properties via a global registry of type->callbacks
+
+    >>> p = Property(key, value, 'string')
+    >>> b = Property(key, value, 'binary')
+    """
+    def __init__(self, key, value, kind=None):
+        self.key = key
+        self._value = value
+        self.kind = kind
+        if kind not in propertyTypes:
+            warnings.warn("Unknown property type: %s on key %s" % \
+                          (kind, key), RuntimeWarning)
+        else: self._impl = propertyTypes[kind]
+
+    @classmethod
+    def fromstring(cls, key, value=''):
+        kind = 'string'
+        if ':' in key:
+            key, kind = key.split(':', 1)
+        # now resolve the kind to a property class
+        return cls(key, value, kind)
 
-class Content(object):
     def __repr__(self):
-        return "<Content id:%s>" % (self.id, )
+        return "<%s(%s) %s:%r>" % (self.__class__.__name__,
+                                   self.kind,
+                                   self.key, self.value)
 
-    def get_property(self, key, default=_marker):
-        # mapped to property keys
-        session = get_session(self.backingstore)
-        query = session.query(Property)
-        p = query.get_by(content_id=self.id, key=key)
-        if not p:
-            if default is _marker: raise AttributeError(key)
-            return default
-        return p.value
-
-    def get_properties(self, **kwargs):
-        session = get_session(self.backingstore)
-        query = session.query(Property)
-        return query.select_by(content_id=self.id, **kwargs)
-
-    # Backingstore dependent bindings
-    def get_file(self):
-        if not hasattr(self, "_file") or self._file.closed is True:
-            self.backingstore.get(self.id)
-        return self._file
+    def get_value(self): return self._impl.get(self._value)
+    def set_value(self, value): self._value = self._impl.set(value)
+    value = property(get_value, set_value)
+
+    def __str__(self): return str(self.value)
+
+class Model(object):
+    """Object containing the field/property model used by the
+    system"""
 
-    def set_file(self, fileobj):
-        self._file = fileobj
-    file = property(get_file, set_file)
+    def __init__(self):
+        self.fields = {}
+        self.fieldnames = []
+
+    def copy(self):
+        m = Model()
+        m.fields = self.fields.copy()
+        m.fieldnames = self.fieldnames[:]
+        return m
+
+    def addField(self, key, kind, overrides=None):
+        """ Add a field to the model.
+        key -- field name
+        kind -- type by name (registered with registerPropertyType)
+        kwargs -- overrides and additional values to the default
+                  arguments supplied by kind
+        """
+        if key in self.fields:
+            raise KeyError("""Another source tried to add %s field to the model""" % key)
+
+        impl = propertyByKind(kind)
+        options = impl.defaults.copy()
+        if overrides: options.update(overrides)
+        if impl.xapian_sort_type:
+            if 'type' not in options:
+                options['type'] = impl.xapian_sort_type
+
+        self.fields[key] = (key, kind, options)
+        self.fieldnames.append(key)
+        return self
+
+    def addFields(self, *args):
+        """ List of arguments to addField """
+        for arg in args: self.addField(*arg)
+        return self
+
+    def apply(self, indexmanager):
+        addField = indexmanager.addField
+        for fn in self.fieldnames:
+            args = self.fields[fn]
+            addField(args[0], **args[2])
+
+class Content(object):
+    """A light weight proxy around Xapian Documents from secore.
+    This provides additional methods which are used in the
+    backingstore to assist in storage
+    """
+    __slots__ = ('_doc', '_backingstore', '_file')
+
+    def __init__(self, xapdoc, backingstore=None):
+        self._doc = xapdoc
+        self._backingstore = backingstore
+        self._file = None
+
+    def __repr__(self):
+        return "<%s %s>" % (self.__class__.__name__,
+                            self.properties)
+
+    def get_property(self, key, default=_marker):
+        result = self._doc.data.get(key, default)
+        if result is _marker: raise KeyError(key)
+        if isinstance(result, list) and len(result) == 1:
+            return result[0]
+        return result
 
     @property
-    def filename(self): return self.file.name
+    def properties(self):
+        d = {}
+        for k, v in self.data.iteritems():
+            if isinstance(v, list) and len(v) == 1:
+                v = v[0]
+            d[k] = v
+        return d
 
     def suggestName(self):
         # we look for certain known property names
@@ -89,8 +171,7 @@ class Content(object):
         f, e = os.path.splitext(filename)
         if e: return filename, None
         if ext: return "%s.%s" % (filename, ext), None
-        elif ext:
-            return None, ext
+        elif ext: return None, ext
         else:
             # try to get an extension from the mimetype if available
             mt = self.get_property('mime_type', None)
@@ -99,279 +180,110 @@ class Content(object):
             if ext: return None, ext
         return None, None
 
-    def get_data(self):
-        f = self.file
-        t = f.tell()
-        data = f.read()
-        f.seek(t)
-        return data
-
-    def set_data(self, filelike):
-        self.backingstore.set(self.id, filelike)
-
-    data = property(get_data, set_data)
-
-
-class BackingStoreContentMapping(MapperExtension):
-    """This mapper extension populates Content objects with the
-    binding to the backing store the files are kept on, this allow the
-    file-like methods to work as expected on content
-    """
-    def __init__(self, backingstore):
-        MapperExtension.__init__(self)
-        self.backingstore = backingstore
-
-    def populate_instance(self, mapper, selectcontext, row, instance, identitykey, isnew):
-        """called right before the mapper, after creating an instance
-        from a row, passes the row to its MapperProperty objects which
-        are responsible for populating the object's attributes. If
-        this method returns EXT_PASS, it is assumed that the mapper
-        should do the appending, else if this method returns any other
-        value or None, it is assumed that the append was handled by
-        this method.
-        """
-        instance.backingstore = self.backingstore
-        # allow normal population to happen
-        return EXT_PASS
-
-
-class Property(object):
-    """A typed key value pair associated with a content object.
-    This is the objects metadata. The value side of the kv pair is
-    typically encoded as a UTF-8 String. There are however cases where
-    richer metadata is required by the application using the
-    datastore.
-    In these cases the type field is overridden to encode a reference
-    to another object that must be used to satisfy this value. An
-    example of this would be storing a PNG thumbnail as the a
-    value. In a case such as that the value should be set to a path or
-    key used to find the image on stable storage or in a database and
-    the type field will be used to demarshall it through this object.
-    """
-    def __init__(self, key, value, type='string'):
-        self.key = key
-        self.value = value
-        self.type = type
-
-    def __repr__(self):
-        return "<%s %s:%r>" % (self.__class__.__name__,
-                               self.key, self.value)
-
-    def marshall(self):
-        """Return the value marshalled as a string"""
-        return str(self.value)
-
-class TextProperty(Property):
-    """A text property is one that will also get full automatic text
-    indexing when available. This is used for fields like title where
-    searching in the text is more important than doing a direct match
-    """
-    def __init__(self, key, value, type='text'):
-        Property.__init__(self, key, value, type)
-
-    def get_value(self): return self._value
-    def set_value(self, value): self._value = value
-    value = property(get_value, set_value)
-
+    def get_file(self):
+        if not hasattr(self, "_file") or self._file.closed is True:
+            self.backingstore.get(self.id)
+        return self._file
 
-class DateProperty(Property):
-    format = "%Y-%m-%dT%H:%M:%S"
-
-    def __init__(self, key, value, type="date"):
-        self._value = None
-        Property.__init__(self, key, value, type)
-
-    def get_value(self):
-        # parse the value back into a datetime
-        # XXX: strptime on datetime is a 2.5 thing :(
-        # XXX: we lose timezone in this conversion currently
-        if not self._value: return None
-        ti = time.strptime(self._value, self.format)
-        dt = datetime.datetime(*(ti[:-2]))
-        dt = dt.replace(microsecond=0)
-        return dt
-
-    def set_value(self, value):
-        if isinstance(value, basestring):
-            # XXX: there is an issue with microseconds not getting parsed
-            ti = time.strptime(value, self.format)
-            value = datetime.datetime(*(ti[:-2]))
-        value = value.replace(microsecond=0)
-
-        self._value = value.isoformat()
+    def set_file(self, fileobj):
+        self._file = fileobj
+    file = property(get_file, set_file)
 
-    value = property(get_value, set_value)
+    @property
+    def filename(self): return self.file.name
 
-    def marshall(self): return self.value.isoformat()
-
+    @property
+    def contents(self): return self.file.read()
 
-class NumberProperty(Property):
-    def __init__(self, key, value, type="number"):
-        Property.__init__(self, key, value, type)
-
-    def get_value(self): return float(self._value)
-    def set_value(self, value): self._value = value
-    value = property(get_value, set_value)
+    @property
+    def backingstore(self): return self._backingstore
 
+    @property
+    def id(self): return self._doc.id
 
-class BinaryProperty(Property):
-    # base64 encode binary data
-    def __init__(self, key, value, type="binary"):
-        Property.__init__(self, key, value, type)
-
-    def get_value(self): return self._value.decode('base64')
-    def set_value(self, value): self._value = value.encode('base64')
-    value = property(get_value, set_value)
-
-
-class Model(object):
-    """ Manages the global state of the metadata model index. This is
-    intended to only be consumed by an olpc.datastore.query.QueryManager
-    instance for the management of its metadata.
-
-    >>> m = Model()
-    >>> m.prepare(querymanager)
-
-    >>> m.content
-    ... # Content Table
-
-    >>> m['content']
-    ... # content Mapper
-
-    For details see the sqlalchemy documentation
-    """
-
-    def __init__(self):
-        self.tables = {}
-        self.mappers = {}
+    @property
+    def data(self): return self._doc.data
 
-    def __getattr__(self, key): return self.tables[key]
-    def __getitem__(self, key): return self.mappers[key]
-
-    def prepare(self, querymanager):
-        self.querymanager = querymanager
-
-        # a single session manages the exclusive access we keep to the
-        # db.
-        global context
-        self.session = create_session(bind_to=self.querymanager.db)
-        context[self.querymanager.backingstore] = self.session
-
-        # content object
-        content = Table('content',
-                        self.querymanager.metadata,
-                        Column('id', String, primary_key=True, nullable=False),
-                        Column('activity_id', Integer),
-                        Column('checksum', String,),
-                        UniqueConstraint('id', name='content_key')
-                        )
-        Index('content_activity_id_idx', content.c.activity_id)
-
-        # the properties of content objects
-        properties = Table('properties',
-                           self.querymanager.metadata,
-                           Column('id', Integer, Sequence('property_id_seq'), primary_key=True),
-                           Column('content_id', Integer, ForeignKey('content.id')),
-                           Column('key', Unicode, ),
-                           Column('value', Unicode, ),
-                           Column('type', Unicode, ),
-                           # unique key to content mapping
-                           UniqueConstraint('content_id', 'key',
-                                            name='property_content_key')
-                           )
-        Index('property_key_idx', properties.c.key)
-        Index('property_type_idx', properties.c.type)
-
-        # storage
-        storage = Table('storage',
-                        self.querymanager.metadata,
-                        Column('id', String, primary_key=True),
-                        Column('description', String, ),
-                        Column('uri', String, )
-                        )
-
-        # storage -> * content
-        # XXX: this could be a purely runtime in-memory construct
-        # removing the storage table as well. Would depend in part on
-        # the frequency of the garbage collection runs and the
-        # frequency of connection to stable storage
-        storage_content = Table('storage_content',
-                                self.querymanager.metadata,
-                                Column('storage_id', Integer, ForeignKey('storage.id')),
-                                Column('content_id', Integer, ForeignKey('content.id')),
-                                )
-        Index('idx_storage_content_content_id', storage_content.c.content_id)
-
-        # Object Mapping
-        # the query manager provides a mapping extension for
-        # Content <-> BackingStore binding
-
-        # XXX gross and not what we want, we can only define mappers
-        # once but we may have more than one datastore.
-        # this can impact all sqla in the runtime though
-        clear_mappers()
+## class Buddy(object):
+##     """A co-author on content. Information is collected and managed
+##     here"""
+##     pass
+
+
+def noop(value): return value
+
+import re
+base64hack = re.compile("(\S{212})")
+def base64enc(value): return ' '.join(base64hack.split(value.encode('base64')))
+def base64dec(value): return value.replace(' ', '').decode('base64')
+
+dateformat = "%Y-%m-%dT%H:%M:%S"
+def datedec(value, dateformat=dateformat):
+    ti = time.strptime(value, dateformat)
+    dt = datetime.datetime(*(ti[:-2]))
+    dt = dt.replace(microsecond=0)
+    return dt
+
+def dateenc(value, dateformat=dateformat):
+    if isinstance(value, basestring):
+        # XXX: there is an issue with microseconds not getting parsed
+        ti = time.strptime(value, dateformat)
+        value = datetime.datetime(*(ti[:-2]))
+    value = value.replace(microsecond=0)
+    # XXX: drop time for now, this is a xapian issue
+    value = value.date()
+    return value.isoformat()
+
+# type, get, set, xapian sort type [string|float|date], defaults
+# defaults are the default options to addField in IndexManager
+# these can be overridden on model assignment
+registerPropertyType('string', noop, noop, 'string', {'store' : True,
+                                                      'exact' : True,
+                                                      'sortable' : True})
+
+registerPropertyType('text', noop, noop, 'string', {'store' : True,
+                                                    'exact' : False,
+                                                    'sortable' : False})
+
+registerPropertyType('binary', noop, noop, None, {'store' : True,
+                                                  'exact' : False,
+                                                  'sortable' : False})
+
+registerPropertyType('int', str, int, 'float', {'store' : True,
+                                                'exact' : True,
+                                                'sortable' : True})
+
+registerPropertyType('number', str, float, 'float', {'store' : True,
+                                                     'exact' : True,
+                                                     'sortable' : True})
+
+registerPropertyType('date', dateenc, datedec, 'date', {'store' : True,
+                                                        'exact' : True,
+                                                        'sortable' : True
+                                                        })
+
+
+defaultModel = Model().addFields(
+    ('fulltext', 'text'),
+    # vid is version id
+    ('vid', 'number'),
+    ('checksum', 'string'),
+    ('filename', 'string'),
+    # Title has additional weight
+    ('title', 'text', {'weight' : 2 }),
+    ('url', 'string'),
+    ('mimetype', 'string'),
+    ('author', 'string'),
+    ('language', 'string'),
+    ('ctime', 'date'),
+    ('mtime', 'date'),
+    # this will just be a space delimited list of tags
+    # indexed with the content
+    # I give them high weight as they have user given semantic value.
+    ('tags', 'text', {'weight' : 3 } ),
+    )
 
-        content_mapper = mapper(Content, content,
-                                extension=self.querymanager.content_ext,
-                                properties = {
-                                    'properties' : relation(Property,
-                                                            cascade="all,delete-orphan",
-                                                            backref='content',
-                                                            lazy=True),
-                                    },
-                                )
-
-        # retain reference to these tables to use for queries
-        self.tables['content'] = content
-        self.tables['properties'] = properties
-        self.tables['storage'] = storage
-        self.tables['storage_content'] = storage_content
-
-        # and the mappers (though most likely not needed)
-        property_mapper = mapper(Property, properties, polymorphic_on=properties.c.type)
-        self.mappers['properties'] = property_mapper
-        self.mappers['content'] = content_mapper
-
-        # default Property types are mapped to classes here
-        self.addPropertyType(DateProperty, 'date')
-        self.addPropertyType(NumberProperty, 'number')
-        self.addPropertyType(TextProperty, 'text')
-        self.addPropertyType(BinaryProperty, 'binary')
-
-    def addPropertyType(self, PropertyClass, typename,
-                        map_value=True, **kwargs):
-        """Register a new type of Property. PropertyClass should be a
-        subclass of Property, typename is the textual
-        name of the new Property type.
-
-        The flag map_value indicates if Property.value should
-        automatically be diverted to _value so that you can more
-        easily manage the interfaces 'value' as a Python property
-        (descriptor)
-
-        Keyword args will be passed to the properties dictionary of
-        the sqlalchemy mapper call. See sqlalchemy docs for additional
-        details.
-        """
-        properties = {}
-        properties.update(kwargs)
-        if map_value is True:
-            properties['_value'] = self.properties.c.value
-
-        mapper(PropertyClass,
-               inherits=self.mappers['properties'],
-               polymorphic_identity=typename,
-               properties=properties
-               )
-
-        registerPropertyType(typename, PropertyClass)
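The new model replaces SQLAlchemy-mapped property classes with plain get/set codecs registered per kind. Assuming the registry above has been loaded, the pieces compose roughly like this (a sketch; the real addField consumer is the IndexManager in xapianindex.py, which is not shown in this excerpt, so a stub stands in for it):

    p = Property.fromstring('ctime:date', '2007-07-13T04:33:19')
    print p.kind     # 'date'
    print p.value    # '2007-07-13' -- dateenc drops the time of day,
                     # per the XXX comment in the diff above

    raw = ''.join(chr(i % 256) for i in range(300))
    assert base64dec(base64enc(raw)) == raw   # binary values round-trip

    class FakeIndex(object):                  # stand-in for IndexManager
        def addField(self, key, **kw): print 'addField', key, kw
    defaultModel.copy().apply(FakeIndex())    # registers every model field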
diff --git a/src/olpc/datastore/query.py b/src/olpc/datastore/query.py
deleted file mode 100644
index 2c5dd9f..0000000
--- a/src/olpc/datastore/query.py
+++ /dev/null
@@ -1,642 +0,0 @@
-"""
-olpc.datastore.query
-~~~~~~~~~~~~~~~~~~~~
-manage the metadata index and make it queryable. this in turn will
-depend on olpc.datastore.fulltext which indexes the actual content.
-
-"""
-
-__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>'
-__docformat__ = 'restructuredtext'
-__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
-__license__ = 'The GNU Public License V2+'
-
-
-from datetime import datetime
-from lemur.xapian.sei import DocumentStore, DocumentPiece, SortableValue
-from olpc.datastore.converter import converter
-from olpc.datastore.model import DateProperty, TextProperty
-from olpc.datastore.model import Model, Content, Property, propertyByKind
-from olpc.datastore.utils import create_uid
-
-from sqlalchemy import create_engine, BoundMetaData
-from sqlalchemy import select, intersect, and_
-import atexit
-import logging
-import os, sys
-
-_marker = object()
-
-
-class SugarDomain(object):
-    """The underlying property set used for metadata in the sugar
-    system"""
-    def kind_by_key(self, key):
-        """resolves property names to the factory type that supports
-        them in the model
-        """
-        # key may be a two part form directly indicating the property
-        # type
-        if ':' in key:
-            key, kind = key.split(':', 1)
-            # now resolve the kind to a property class
-            return key, propertyByKind(kind)
-
-        return key, {
-            'ctime' : DateProperty,
-            'mtime' : DateProperty,
-            'author' : Property,
-            'title' : TextProperty,
-            'mime_type' : Property,
-            'language' : Property,
-            }.get(key, Property)
-
-    def propertyFactory(self, key, value='', dict=None):
-        key, kind = self.kind_by_key(key)
-        p = kind(key, value)
-        if dict is not None: dict[key] = p
-        return kind
-
-    def _automaticProperties(self):
-        d = {}
-        now = datetime.now()
-        self.propertyFactory('mtime', now, dict=d)
-        return d
-
-    def _defaultProperties(self):
-        d = {}
-        now = datetime.now()
-        self.propertyFactory('ctime', now, dict=d)
-        self.propertyFactory('author', dict=d)
-        self.propertyFactory('title', dict=d)
-        self.propertyFactory('mime_type', dict=d)
-        self.propertyFactory('language', dict=d)
-
-        d.update(self._automaticProperties())
-        return d
-
-    def _normalizeProps(self, props, creating, include_defaults):
-        # return a dict of {name : property}
-        if isinstance(props, dict):
-            # convert it into a dict of Property objects
-            d = {}
-            for k,v in props.iteritems():
-                k, kind = self.kind_by_key(k)
-                p = kind(k, v)
-                d[k] = p
-            if creating and include_defaults:
-                defaults = self._defaultProperties()
-                for k, v in defaults.iteritems():
-                    if k not in d: d[k] = v
-            props = d
-        else:
-            d = {}
-            for p in props:
-                d[p.key] = p
-            props = d
-        return props
-
-
-class QueryManager(SugarDomain):
-    FULLTEXT_NAME = "fulltext"
-
-    def __init__(self, metadata_uri, **options):
-        """
-        The metadata_uri is a string used to find the database.
-
-        This will check keywords for:
-        'language' Language is the language code used in the fulltext
-                   engine. This helps improve stemming and
-                   so on. In the future additional control
-                   will be provided.
-
-        'sync_index' which determines if we use an internal
-                     sync index impl or an out of process one
-                     via DBus. If the async process is to be
-                     used it must be properly configured and
-                     available for DBus to spawn.
-
-        'fulltext_repo' the full filepath to which the fulltext
-                        index data will be stored
-
-        'use_fulltext' when true indexing will be performed
-
-        """
-        self.uri = metadata_uri
-        self.options = options
-
-        self.backingstore = None
-        self.content_ext = None
-
-    def _handle_option(self, options, key, default=_marker):
-        value = options.get(key, default)
-        if value is _marker: raise KeyError(key)
-        setattr(self, key, value)
-
-    def _handle_options(self, **kwargs):
-        self._handle_option(kwargs, 'fulltext_repo')
-        self._handle_option(kwargs, 'use_fulltext', True)
-        self._handle_option(kwargs, 'sync_index', True)
-        self._handle_option(kwargs, 'language', 'en')
-        self.sync_index = self.use_fulltext and self.sync_index
-
-    def bind_to(self, backingstore):
-        self.backingstore = backingstore
-
-    def prepare_index(self):
-        self.connect_db()
-        self.prepare_db()
-        self.connect_model()
-
-    def prepare_fulltext(self):
-        self.connect_fulltext(self.fulltext_repo, self.language,
-                              read_only=not self.sync_index)
-
-    def prepare(self):
-        """This is called by the datastore with its backingstore and
-        querymanager. Its assumed that querymanager is None and we are
-        the first in this release
-        """
-        self._handle_options(**self.options)
-        self.prepare_index()
-        self.prepare_fulltext()
-        return True
-
-    def stop(self):
-        pass
-
-    # Primary interface
-    def create(self, props, filelike=None, include_defaults=True):
-        """Props can either be a dict of k,v pairs or a sequence of
-        Property objects.
-
-        The advantage of using property objects is that the data can
-        by typed. When k/v pairs are used a default string type will
-        be chosen.
-
-        When include_defaults is True a default set of properties are
-        created on behalf of the Content if they were not provided.
-
-        These include:
-            author : ''
-            title : ''
-            mime_type : ''
-            language : '',
-            ctime : '',
-            mtime : '',
-        """
-        s = self.model.session
-        c = Content()
-        # its important the id be set before other operations
-        c.id = create_uid()
-        s.save(c)
-
-        self._bindProperties(c, props, creating=True, include_defaults=include_defaults)
-        s.flush()
-        c.backingstore = self.backingstore
-
-        if self.sync_index and filelike:
-            self.fulltext_index(c.id, filelike,
-                                mimetype=c.get_property('mime_type'),
-                                textprops=self.get_textprops(c))
-
-        return c
-
-    def update(self, content_or_uid, props=None, filelike=None):
-        content = self._resolve(content_or_uid)
-        content.backingstore = self.backingstore
-        if props is not None:
-            self._bindProperties(content, props, creating=False)
-            self.model.session.flush()
-
-        if self.sync_index and filelike:
-            self.fulltext_index(content.id, filelike, textprops=self.get_textprops(content))
-
-    def _bindProperties(self, content, props, creating=False, include_defaults=False):
-        """Handle either a dict of properties or a list of property
-        objects, binding them to the content instance.
-        """
-        # for information on include_defaults see create()
-        # default properties are only provided when creating is True
-        session = self.model.session
-
-        props = self._normalizeProps(props, creating,
-                                     include_defaults)
-
-        # we should have a dict of property objects
-        if creating:
-            content.properties.extend(props.values())
-        else:
-            # if the automatically maintained properties (like mtime)
-            # are not set, include them now
-            auto = self._automaticProperties()
-            auto.update(props)
-            props = auto
-            # we have to check for the update case
-            oldProps = dict([(p.key, p) for p in content.properties])
-            for k, p in props.iteritems():
-                if k in oldProps:
-                    oldProps[k].value = p.value
-                    oldProps[k].type = p.type
-                else:
-                    content.properties.append(p)
-
-    def get(self, uid):
-        return self.model.session.query(self.model.mappers['content']).get(uid)
-
-    def get_properties(self, content_or_uid, keys):
-        c = self._resolve(content_or_uid)
-        return self.model.session.query(Property).select_by(self.model.property.c.key.in_(keys),
-                                                            content_id=c.id)
-
-    def get_uniquevaluesfor(self, propertyname):
-        properties = self.model.tables['properties']
-        return [r[0] for r in select([properties.c.value],
-                                     properties.c.key==propertyname,
-                                     distinct=True).execute().fetchall()]
-
-    def delete(self, content_or_uid):
-        c = self._resolve(content_or_uid)
-        s = self.model.session
-        s.delete(c)
-        s.flush()
-        if self.sync_index:
-            self.fulltext_unindex(c.id)
-
-    def find(self, query=None, **kwargs):
-        """
-        dates can be search in one of two ways.
-        date='YYYY-MM-DD HH:MM:SS'
-        date={'start' : 'YYYY-MM-DD HH:MM:SS',
-              'end' : 'YYYY-MM-DD HH:MM:SS'
-              }
-        where date is either ctime or mtime.
-        if start or end is omitted its becomes a simple before/after
-        style query. If both are provided its a between query.
-
-        providing the key 'fulltext' will include a full text search
-        of content matching its parameters. see fulltext_search for
-        additional details.
-
-        If 'limit' is passed it will be the maximum number of results
-        to return and 'offset' will be the offset from 0 into the
-        result set to return.
-
-        """
-
-        # XXX: this will have to be expanded, but in its simplest form
-        if not self.sync_index: self.index.reopen()
-
-        s = self.model.session
-        properties = self.model.tables['properties']
-        if not query: query = {}
-        query.update(kwargs)
-        q = s.query(Content)
-        # rewrite the query to reference properties
-        # XXX: if there is a 'content' key will will have to search
-        # the content using the full text index which will result in a
-        # list of id's which must be mapped into the query
-        # fulltext_threshold is the minimum acceptable relevance score
-        limit = query.pop('limit', None)
-        offset = query.pop('offset', None)
-
-        if offset: q = q.offset(offset)
-        if limit: q = q.limit(limit)
-
-        if query:
-            where = []
-            fulltext = query.pop('fulltext', None)
-            threshold = query.pop('fulltext_threshold', 60)
-
-            statement = None
-            ft_select = None
-
-            if query:
-                # daterange support
-                # XXX: this is sort of a hack because
-                # - it relies on Manifest typing in sqlite
-                # - value's type is not normalized
-                # - we make special exception based on property name
-                # if we need special db handling of dates ctime/mtime
-                # will become columns of Content and not properties
-                ctime = query.pop('ctime', None)
-                mtime = query.pop('mtime', None)
-                if ctime or mtime:
-                    self._query_dates(ctime, mtime, where)
-                for k,v in query.iteritems():
-                    if isinstance(v, list):
-                        v = properties.c.value.in_(*v)
-                    else:
-                        v = properties.c.value==v
-
-                    where.append(select([properties.c.content_id],
-                                        and_( properties.c.key==k,
-                                              v)))
-
-                statement = intersect(*where)
-                statement.distinct=True
-
-            if fulltext and self.use_fulltext:
-                # perform the full text search and map the id's into
-                # the statement for inclusion
-                ft_res = self.fulltext_search(fulltext)
-                if ft_res:
-                    ft_ids = [ft[0] for ft in ft_res if ft[1] >= threshold]
-
-                    if ft_ids:
-                        ft_select = select([properties.c.content_id],
-                                           properties.c.content_id.in_(*ft_ids))
-
-                if ft_select is None:
-                    # the full text query eliminated the possibility
-                    # of results by returning nothing under a logical
-                    # AND condition, bail now
-                    return ([], 0)
-                else:
-                    if statement is None:
-                        statement = ft_select
-                        statement.distinct = True
-                    else:
-                        statement = intersect(statement, ft_select)
-
-            result = statement.execute()
-            r = [q.get(i[0]) for i in result]
-            r = (r, len(r))
-        else:
-            r = (q.select(), q.count())
-
-        # XXX: make sure the proper backingstore is mapped
-        # this currently forbids the use case of keeping index data
-        # for a read-only store.
-        for item in r[0]:
-            item.backingstore = self.backingstore
-
-        return r
-
-    # sqla util
-    def _resolve(self, content_or_uid):
-        if isinstance(content_or_uid, basestring):
-            # we need to resolve the object
-            content_or_uid = self.model.session.query(Content).get(content_or_uid)
-        return content_or_uid
-
-    def _query_dates(self, ctime, mtime, selects):
-        if ctime: selects.append(self._query_date('ctime', ctime))
-        if mtime: selects.append(self._query_date('mtime', mtime))
-
-    def _query_date(self, key, date):
-        properties = self.model.properties
-
-        if isinstance(date, basestring):
-            s = select([properties.c.content_id],
-                       and_( properties.c.key==key,
-                             properties.c.value==date))
-        else:
-            # its a dict with start/end
-            start = date.get('start')
-            end = date.get('end')
-            if start and end:
-                s = select([properties.c.content_id],
-                           and_( properties.c.key==key,
-                                 properties.c.value.between(start,
-                                                            end)))
-            elif start:
-                s = select([properties.c.content_id],
-                           and_( properties.c.key==key,
-                                 properties.c.value >= start))
-            else:
-                s = select([properties.c.content_id],
-                           and_( properties.c.key==key,
-                                 properties.c.value < end))
-
-        return s
-
-    def get_textprops(self, uid_or_content):
-        # text properties also get full text indexing
-        # currently this is still searched with the 'fulltext'
-        # parameter of find()
-        content = self._resolve(uid_or_content)
-        textprops = {}
-        for p in content.get_properties(type='text'):
-            textprops[p.key] = p.value and p.value or ''
-        return textprops
-
-    # fulltext interface
-    def fulltext_index(self, uid, fileobj, mimetype=None, textprops=None):
-        """Index the fileobj relative to uid which should be a
-        olpc.datastore.model.Content object's uid. The fileobj can be
-        either a pathname or an object implementing the Python file
-        ('read') interface.
-        """
-        pass
-
-    def fulltext_unindex(self, content_id):
-        pass
-
-    def fulltext_search(self, *args, **kwargs):
-        return []
-
-    # lifecycle
-    def connect_db(self):
-        """Connect to the underlying database. Called implicitly by
-        __init__"""
-        pass
-
-    def prepare_db(self):
-        """After connecting to the metadata database take any
-        initialization steps needed for the environment.
-
-        This is called implicitly by __init__ before the model is
-        brought online.
-        """
-        pass
-
-    def connect_model(self, model):
-        """Connect the model. Called with the model passed into
-        __init__ after the database has been prepared.
-        """
-        pass
-
-    def connect_fulltext(self, repo, language, read_only):
-        """Connect the full text index"""
-        pass
-
-
-class SQLiteQueryManager(QueryManager):
-    """The default implementation of the query manager. This owns the
-    model object and the fulltext object
-    """
-
-    def __init__(self, uri, **kwargs):
-        super(SQLiteQueryManager, self).__init__(uri, **kwargs)
-        # now re-write the URI to be sqlite specific
-        # (we were initialized to a namepattern in the proper
-        # directory by the backingstore)
-        self.uri= "sqlite:///%s.db" % self.uri
-
-    def connect_db(self):
-        self.db = create_engine(self.uri)
-        self.metadata = BoundMetaData(self.db)
-
-    def prepare_db(self):
-        # Using the sqlite backend we can tune the performance to
-        # limit writes as much as possible
-        if self.db.name.startswith('sqlite'):
-            connection = self.db.connect()
-            # cut down of per-activity file locking writes
-            connection.execute("PRAGMA locking_mode=EXCLUSIVE")
-            # don't demand fsync -- if this is too dangerous
-            # we can change it to normal which is still less writey
-            # than the default FULL
-            connection.execute("PRAGMA synchronous=OFF")
-            # temporary tables and indices are kept in memory
-            connection.execute("PRAGMA temp_store=MEMORY")
-            # XXX: what is the ideal jffs2 page size
-            # connection.execute("PRAGMA page_size 4096")
-
-    def connect_model(self, model=None):
-        if model is None: model = Model()
-        # take the model and connect it to us
-        model.prepare(self)
-
-        # make sure all the tables and indexes exist
-        self.metadata.create_all()
-
-        self.model = model
-
-    def stop(self):
-        # clean up
-        self.db.dispose()
-
-# Full text support
-def flatten_unicode(value): return value.encode('utf-8')
-
-class XapianBinaryValue(SortableValue):
-    def __init__(self, value, field_name="content"):
-        SortableValue.__init__(self, value, field_name)
-
-class XapianFulltext(object):
-    def connect_fulltext(self, repo, language='en', read_only=True):
-        if not os.path.exists(repo) and read_only is True:
-            # create the store
-            index = DocumentStore(repo, language, read_only=False)
-            index.close()
-            # and abandon it
-        self.index = DocumentStore(repo, language, read_only=read_only)
-        self.index.registerFlattener(unicode, flatten_unicode)
-        atexit.register(self.index.close)
-
-    def fulltext_index(self, uid, fileobj, mimetype=None, textprops=None):
-        """Index the fileobj relative to uid which should be a
-        olpc.datastore.model.Content's uid. The fileobj can be either
-        a pathname or an object implementing the Python file ('read')
-        interface.
-        """
-        piece = DocumentPiece
-        if isinstance(fileobj, basestring):
-            # treat it as a pathname
-            # use the global converter to try to get text from the
-            # file
-            fp = converter(fileobj, mimetype=mimetype)
-            #piece = XapianBinaryValue
-        elif hasattr(fileobj, 'read'):
-            # this is an off case, we have to assume utf-8 data
-            logging.debug("Indexing from readable, not filename")
-            fp = fileobj
-        else:
-            raise ValueError("Not a valid file object")
-
-        if fp is None:
-            # for whatever reason we were unable to get the content
-            # into an indexable form.
- logging.debug("Unable to index %s %s" % (uid, fileobj)) - return False - return self._ft_index(uid, fp, piece, textprops) - - def _ft_index(self, content_id, fp, piece=DocumentPiece, fields=None): - try: - doc = [piece(fp.read())] - if fields: - # add in properties that need extra fulltext like - # management - for key, value in fields.iteritems(): - doc.append(DocumentPiece(value, key)) - - self.index.addDocument(doc, content_id) - self.index.flush() - return True - except: - logging.debug("fulltext index exception", exc_info=sys.exc_info()) - return False - - - - def fulltext_search(self, *args, **kwargs): - """ - perform search(search_string, ) -> [(content_id, relevance), ...] - - search_string is a string defining the serach in standard web search - syntax. - - ie: it contains a set of search terms. Each search term may be - preceded by a "+" sign to indicate that the term is required, or a "-" - to indicate that is is required to be absent. - - If field_name is not None, it is the prefix of a field, which the - search will be restricted to. - - If field_name is None, the search will search all fields by default, - but search terms may be preceded by a fieldname followed by a colon to - restrict part of the search to a given field. - - combiner is one of DocumentStore.OP_OR or DocumentStore.OP_AND, and is - used to indicate the default operator used to combine terms. - - partial is a flag, which should be set to True to enable partial search - matching, for use when doing interactive searches and we're not sure if - the user has finished typing the search yet. - - range_restrictions is a RangeRestrictions object, used to restrict the - search results. - - """ - if len(args) == 1: - # workaround for api change - args = (args[0], 0, 10) - - res = self.index.performSearch(*args, **kwargs) - est = max(1, res.estimatedResultCount()) - return res.getResults(0, est) - - def fulltext_similar(self, *content_ids): - return self.index.findSimilar(content_ids) - - def fulltext_unindex(self, content_id): - self.index.deleteDocument(content_id) - - def stop(self): - if self.use_fulltext: - self.index.close() - - -class DefaultQueryManager(XapianFulltext, SQLiteQueryManager): - - def stop(self): - XapianFulltext.stop(self) - SQLiteQueryManager.stop(self) diff --git a/src/olpc/datastore/xapianindex.py b/src/olpc/datastore/xapianindex.py new file mode 100644 index 0000000..ec7206d --- /dev/null +++ b/src/olpc/datastore/xapianindex.py @@ -0,0 +1,390 @@ +""" +xapianindex +~~~~~~~~~~~~~~~~~~~~ +maintain indexes on content + +""" + +__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>' +__docformat__ = 'restructuredtext' +__copyright__ = 'Copyright ObjectRealms, LLC, 2007' +__license__ = 'The GNU Public License V2+' + + +from Queue import Queue, Empty +import logging +import re + +import threading +import warnings + +import secore + +from olpc.datastore import model +from olpc.datastore.converter import converter +from olpc.datastore.utils import create_uid + + +# Setup Logger +logger = logging.getLogger('org.sugar.datastore.xapianindex') + +# Indexer Operations +CREATE = 1 +UPDATE = 2 +DELETE = 3 + + +class ContentMappingIter(object): + """An iterator over a set of results from a search. 
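+
+    Each call to next() wraps the underlying search result in a
+    model.Content bound to the owning backingstore, so hits are
+    materialized lazily as they are consumed.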
+ + """ + def __init__(self, results, backingstore): + self._results = results + self._backingstore = backingstore + self._iter = iter(results) + + def __iter__(self): return self + + def next(self): + searchresult = self._iter.next() + return model.Content(searchresult, self._backingstore) + + +class IndexManager(object): + DEFAULT_DATABASE_NAME = 'index' + + def __init__(self, default_language='en'): + # We will maintain two connections to the database + # we trigger automatic flushes to the read_index + # after any write operation + self.write_index = None + self.read_index = None + self.queue = Queue(0) + self.indexer_running = False + self.language = default_language + + self.backingstore = None + + self.fields = set() + + # + # Initialization + def connect(self, repo, **kwargs): + if self.write_index is not None: + warnings.warn('''Requested redundant connect to index''', + RuntimeWarning) + + self.repo = repo + self.write_index = secore.IndexerConnection(repo) + + # configure the database according to the model + datamodel = kwargs.get('model', model.defaultModel) + datamodel.apply(self) + + # store a reference + self.datamodel = datamodel + + self.read_index = secore.SearchConnection(repo) + + self.flush() + + # by default we start the indexer now + self.startIndexer() + + def bind_to(self, backingstore): + # signal from backingstore that its our parent + self.backingstore = backingstore + + # flow control + def flush(self): + """Called after any database mutation""" + self.write_index.flush() + self.read_index.reopen() + + def stop(self): + self.stopIndexer() + self.write_index.close() + self.read_index.close() + + # Index thread management + def startIndexer(self): + self.indexer_running = True + self.indexer = threading.Thread(target=self.indexThread) + self.indexer.setDaemon(True) + self.indexer.start() + + def stopIndexer(self, force=False): + if not self.indexer_running: return + if not force: self.queue.join() + self.indexer_running = False + self.indexer.join() + + def enque(self, uid, vid, doc, operation, filestuff=None): + self.queue.put((uid, vid, doc, operation, filestuff)) + + def indexThread(self): + # process the queue + # XXX: there is currently no way to remove items from the queue + # for example if a USB stick is added and quickly removed + # the mount should however get a stop() call which would + # request that the indexing finish + while self.indexer_running: + # include timeout here to ease shutdown of the thread + # if this is a non-issue we can simply allow it to block + try: + uid, vid, doc, operation, filestuff = self.queue.get(timeout=0.5) + if operation is DELETE: self.write_index.delete(uid) + elif operation in (CREATE, UPDATE): + # Here we handle the conversion of binary + # documents to plain text for indexing. This is + # done in the thread to keep things async and + # latency lower. 
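+                    # filestuff, when present, is the (filename,
+                    # mimetype) pair queued by index(); converter()
+                    # returns a file-like object of extracted plain
+                    # text, or None when no converter exists for that
+                    # mimetype, in which case the document is indexed
+                    # without a 'fulltext' field.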
+ if filestuff: + filename, mimetype = filestuff + fp = converter(filename, mimetype) + if fp: + doc.fields.append(secore.Field('fulltext', + fp.read())) + + if operation is CREATE: self.write_index.add(doc) + elif operation is UPDATE: self.write_index.replace(doc) + + else: + logger.warning("Unknown indexer operation ( %s: %s)" % \ + (uid, operation)) + continue + + # XXX: this isn't quite true, we haven't called flush + # yet so the document might not be on disk + logger.info("Indexed Content %s:%s" % (uid, vid)) + # but we still tell the queue its complete + self.queue.task_done() + + except Empty: + pass +## except: +## import traceback +## traceback.print_exc() +## try: self.write_index.close() +## except: pass +## try: +## self.write_index = secore.IndexerConnection(self.repo) +## self.read_index.reopen() +## except: +## # Shut down the indexer +## logger.critical("Indexer Failed, Shutting it down") +## self.indexer_running = False + + + + + + def complete_indexing(self): + """Intentionally block until the indexing is complete. Used + primarily in testing. + """ + self.queue.join() + self.flush() + + # + # Field management + def addField(self, key, store=True, exact=False, sortable=False, + type='string', collapse=False, + **kwargs): + language = kwargs.pop('language', self.language) + + xi = self.write_index.add_field_action + + if store: xi(key, secore.FieldActions.STORE_CONTENT) + if exact: xi(key, secore.FieldActions.INDEX_EXACT) + else: + # weight -- int 1 or more + # nopos -- don't include positional information + # noprefix -- boolean + xi(key, secore.FieldActions.INDEX_FREETEXT, language=language, **kwargs) + + if sortable: + xi(key, secore.FieldActions.SORTABLE, type=type) + if collapse: + xi(key, secore.FieldActions.COLLAPSE) + + # track this to find missing field configurations + self.fields.add(key) + + # + # Index Functions + def _mapProperties(self, props): + """data normalization function, maps dicts of key:kind->value + to Property objects + """ + d = {} + for k,v in props.iteritems(): + p = model.Property.fromstring(k, v) + d[p.key] = p + return d + + def index(self, props, filename=None): + """Index the content of an object. + Props must contain the following: + key -> Property() + """ + props = self._mapProperties(props) + doc = secore.UnprocessedDocument() + add = doc.fields.append + fp = None + operation = UPDATE + + filestuff = None + if filename: + # enque async file processing + # XXX: to make sure the file is kept around we could keep + # and open fp? 
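+            # props values are model.Property objects at this point
+            # (see _mapProperties); unwrap .value and assume
+            # text/plain when no mimetype property was supplied.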
+            mimetype = props.get("mimetype")
+            mimetype = mimetype and mimetype.value or 'text/plain'
+            filestuff = (filename, mimetype)
+
+        #
+        # Version handling
+        #
+        # we implicitly create new versions of documents; the version
+        # id should have been set by the higher level system
+        uid = props.pop('uid', None)
+        vid = props.pop('vid', None)
+
+        if uid: uid = uid.value
+        else:
+            uid = create_uid()
+            operation = CREATE
+
+        if vid: vid = str(float(vid.value) + 1.0)
+        else: vid = "1.0"
+
+        doc.id = uid
+        add(secore.Field('vid', vid))
+
+        #
+        # Property indexing
+        for k, prop in props.iteritems():
+            value = prop.value
+
+            if k not in self.fields:
+                warnings.warn("""Missing field configuration for %s""" % k,
+                              RuntimeWarning)
+                continue
+
+            add(secore.Field(k, value))
+
+        # queue the document for processing
+        self.enque(uid, vid, doc, operation, filestuff)
+
+        return uid
+
+    def get(self, uid):
+        doc = self.read_index.get_document(uid)
+        if not doc: raise KeyError(uid)
+        return model.Content(doc, self.backingstore)
+
+    def delete(self, uid):
+        # does this need queuing?
+        # the higher level abstractions have to handle interaction
+        # with versioning policy and so on
+        self.enque(uid, None, None, DELETE)
+
+    #
+    # Search
+    def search(self, query, start_index=0, end_index=50):
+        """search the xapian store.
+        query is a string defining the search in standard web search syntax.
+
+        ie: it contains a set of search terms.  Each search term may be
+        preceded by a "+" sign to indicate that the term is required, or a "-"
+        to indicate that it is required to be absent.
+        """
+        ri = self.read_index
+        if not query:
+            q = self.read_index.query_all()
+        elif isinstance(query, dict):
+            queries = []
+            # each term becomes part of the query join
+            for k, v in query.iteritems():
+                queries.append(ri.query_field(k, v))
+            q = ri.query_composite(ri.OP_AND, queries)
+        else:
+            q = self.parse_query(query)
+
+        results = ri.search(q, start_index, end_index)
+        count = results.matches_estimated
+
+        # map the result set to model.Content items
+        return ContentMappingIter(results, self.backingstore), count
+
+
+    def get_uniquevaluesfor(self, property):
+        # XXX: this is very sketchy code
+        # try to get the searchconnection to support this directly
+        # this should only apply to EXACT fields
+        r = set()
+        prefix = self.read_index._field_mappings.get_prefix(property)
+        plen = len(prefix)
+        termiter = self.read_index._index.allterms(prefix)
+        for t in termiter:
+            term = t.term
+            if len(term) > plen:
+                term = term[plen:]
+                if term.startswith(':'): term = term[1:]
+                r.add(term)
+
+        # r holds the textual representation of the fields value set
+        # if the type of field or property needs conversion to a
+        # different python type this has to happen now
+        descriptor = self.datamodel.fields.get(property)
+        if descriptor:
+            kind = descriptor[1]
+            impl = model.propertyByKind(kind)
+            r = set([impl.set(i) for i in r])
+
+        return r
+
+    def parse_query(self, query):
+        # accept standard web-query-like syntax
+        # 'this' -- match this
+        # 'this that' -- match this and that in document
+        # '"this that"' match the exact phrase 'this that'
+        # 'title:foo' match a document whose title contains 'foo'
+        # 'title:"A tale of two datastores"' exact title match
+        # '-this that' match that w/o this
+        ri = self.read_index
+        start = 0
+        end = len(query)
+        nextword = re.compile("(\S+)")
+        endquote = re.compile('(")')
+        queries = []
+        while start < end:
+            m = nextword.match(query, start)
+            if not m: break
+            orig = start
+            field = None
+            start = m.end() + 1
+            word =
m.group(1) + if ':' in word: + # see if its a field match + fieldname, w = word.split(':', 1) + if fieldname in self.fields: + field = fieldname + + word = w + + if word.startswith('"'): + qm = endquote.search(query, start) + if qm: + #XXX: strip quotes or not here + #word = query[orig+1:qm.end(1)-1] + word = query[orig:qm.end(1)] + start = qm.end(1) + 1 + + if field: + queries.append(ri.query_field(field, word)) + else: + queries.append(ri.query_parse(word)) + q = ri.query_composite(ri.OP_AND, queries) + return q diff --git a/tests/Makefile b/tests/Makefile index 7961b02..c2581cb 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -2,10 +2,9 @@ # its not an option to configure PYTHON=python -all: test +all: clean test test: - @rm -rf fulltext @${PYTHON} runalltests.py valgrind: @@ -17,6 +16,7 @@ profile: @${PYTHON} ./profilealltests.py clean: + @${PYTHON} ./cleaner.py @find . -name "*.pyc" -exec rm {} \; @find . -name "*~" -exec rm {} \; @find . -name "hotspot*" -exec rm {} \; diff --git a/tests/cleaner.py b/tests/cleaner.py new file mode 100755 index 0000000..cfa15bf --- /dev/null +++ b/tests/cleaner.py @@ -0,0 +1,40 @@ +#!/usr/bin/python +import os +import re +from ore.main import Application + +filepattern = re.compile("(\w{8})\-(\w{4})\-(\w{4})\-(\w{4})\-(\w{12})") +tmppattern = re.compile("tmp\S{6}") +onepattern = re.compile("one.*\.txt") + +staticdirs = re.compile('test_ds|store\d') + +filepatterns = [filepattern, tmppattern, onepattern] +dirpatterns = [staticdirs] + +class Cleaner(Application): + def manage_options(self): + self.parser.add_option("--base", dest="base_dir", + action="store", default='/tmp', + help="""Where to clean (/tmp)""") + + def main(self): + """clean up files left from testing in /tmp""" + # this is done using patterned names + for root, dirs, files in os.walk(self.options.base_dir): + for filename in files: + for pat in filepatterns: + if pat.match(filename): + fn = os.path.join(root, filename) + os.remove(fn) + break + for dirname in dirs: + for pat in dirpatterns: + if pat.match(dirname): + dn = os.path.join(root, dirname) + os.system('rm -rf %s' % dn) + +if __name__ == "__main__": + Cleaner("cleaner")() + + diff --git a/tests/milestone_1.txt b/tests/milestone_1.txt index bde3720..48d09bc 100644 --- a/tests/milestone_1.txt +++ b/tests/milestone_1.txt @@ -12,6 +12,10 @@ datastore. First, create and connect the store. +>>> from testutils import waitforindex +>>> import os +>>> assert os.system('rm -rf /tmp/test_ds') == 0 + >>> from olpc.datastore import DataStore >>> from olpc.datastore import backingstore @@ -35,11 +39,13 @@ Note that we retain no reference to the created documents. Now we should be able to test the first requirement. * Get the unique ids of all the objects in the store. +>>> waitforindex(ds) + >>> results, count = ds.find() A find command with out any parameters will return everything in the store. -* Get an object from the store given his uid. +* Get an object from the store given its uid. Here we manually cycle through the results looking for the title we want. @@ -51,30 +57,24 @@ want. * Get the object metadata. >>> c1.properties -[...] +{...} * Get the object file. >>> c1.filename '/tmp/...' 
->>> c1.data +>>> c1.contents 'this is the first document' >>> c1.file <open file ...> -Or if you prefer access through the datastore (which is how DBus would -use it) - ->>> fn = ds.get_filename(first_uid) ->>> ds.get_data(first_uid) -'this is the first document' - Now we can modify that file and then * Push the changes made to the file back to the store. * Update the metadata of an object. +>>> fn = c1.filename >>> fp = open(fn, 'a') >>> print >>fp, "more content" >>> fp.close() @@ -87,6 +87,4 @@ We can also remove the file from the repository. This is the basis of milestone 1. >>> ds.stop() ->>> del ds - - +>>> assert os.system('rm -rf /tmp/test_ds') == 0 diff --git a/tests/milestone_2.txt b/tests/milestone_2.txt index 516d497..73fd43a 100644 --- a/tests/milestone_2.txt +++ b/tests/milestone_2.txt @@ -7,17 +7,22 @@ First clean up from any other tests. >>> assert os.system('rm -rf /tmp/test_ds/') == 0 >>> from olpc.datastore import DataStore ->>> from olpc.datastore import backingstore +>>> from olpc.datastore import backingstore, model >>> ds = DataStore() >>> ds.registerBackend(backingstore.FileBackingStore) ->>> assert ds.mount("/tmp/test_ds") +>>> dm = model.defaultModel.copy().addField('year', 'int').addField('month', 'string') ->>> a = ds.create(dict(title="Content A", author="Bob", year=1999, month="Jan"), '') ->>> b = ds.create(dict(title="Content B", author="Alice", year=2000, month="Jan"), '') +>>> assert ds.mount("/tmp/test_ds", {'indexmanager.model' : dm}) + +>>> a = ds.create(dict(title="Content A", author="Bob", year="1999", month="Jan"), '') +>>> b = ds.create(dict(title="Content B", author="Alice", year="2000", month="Jan"), '') Find should return both >>> def find2uids(results): return [i['uid'] for i in results[0]] ->>> assert set(find2uids(ds.find({}))) == set([a,b]) + +>>> ds.complete_indexing() + +>>> assert set(find2uids(ds.find())) == set([a,b]) But what if we want the results ordered? @@ -35,3 +40,4 @@ and if we want to reverse order it? >>> ds.stop() >>> del ds +>>> assert os.system('rm -rf /tmp/test_ds/') == 0 diff --git a/tests/mountpoints.txt b/tests/mountpoints.txt index 9a821b5..eebad9b 100644 --- a/tests/mountpoints.txt +++ b/tests/mountpoints.txt @@ -20,7 +20,7 @@ Here we create a datastore, and mount a backingstore on tmp. By default this will create a new directory in /tmp which will then be used for storage. ->>> ds = DataStore(sync_index=True) +>>> ds = DataStore() >>> ds.registerBackend(backingstore.FileBackingStore) >>> mp1 = ds.mount("/tmp/store1", dict(title="Primary Storage")) @@ -36,11 +36,12 @@ can be used to control the storage target or to filter results. Now lets create some content >>> u1 = ds.create(dict(title="Document 1", filename="one.txt"), tmpData("""document one""")) ->>> u2 = ds.create(dict(title="Document 2", mime_type="text/plain"), tmpData("""document two""")) +>>> u2 = ds.create(dict(title="Document 2", mimetype="text/plain"), tmpData("""document two""")) We can now, if we wish verify which mount point this content came from. +>>> ds.complete_indexing() >>> c1 = ds.get(u1) >>> assert c1.backingstore.id == mountpoint @@ -61,6 +62,8 @@ Now lets add another mount point. Now lets create a new content item. >>> u3 = ds.create(dict(title="Document 3", mountpoint=mp2), tmpData("""document three""")) +>>> ds.complete_indexing() + We explictly passed a mount point here. Lets examine the properties of the object and verify this. 
 >>> c3 = ds.find(dict(title="Document 3"))[0][0]
@@ -102,6 +105,8 @@ Register the filesystem type
 
 If that worked it should have imported content on load().
 
+>>> ds.complete_indexing()
+
 >>> result, count = ds.find(dict(fulltext="four"))
 >>> assert count == 1
 >>> assert result[0]['mountpoint'] == mp3
@@ -112,7 +117,11 @@ as DBus data.
 
 >>> ds.unmount(mp3)
 
->>> mp3 = ds.mount("inplace:/tmp/store3", dict(title=dbus.String("Fake USB again")))
+>>> mp3 = ds.mount("inplace:/tmp/store3", dict(title=dbus.String("Fake USB again"),
+...                                            sync_mount=True))
+
+>>> ds.complete_indexing()
+
 >>> result, count = ds.find(dict(fulltext="four"))
 >>> assert count == 1
diff --git a/tests/properties.txt b/tests/properties.txt
index 689414f..fe34782 100644
--- a/tests/properties.txt
+++ b/tests/properties.txt
@@ -8,16 +8,23 @@ properties to content and managing them.
 
 >>> from olpc.datastore import DataStore
->>> from olpc.datastore import backingstore
+>>> from olpc.datastore import backingstore, model
 >>> from testutils import tmpData
 
 >>> import dbus
 
 Set up two mount points.
 
->>> ds = DataStore(sync_index=True)
+>>> ds = DataStore()
 >>> ds.registerBackend(backingstore.FileBackingStore)
->>> mp1 = ds.mount("/tmp/store1", dict(title="Primary Storage"))
->>> mp2 = ds.mount("/tmp/store2", dict(title="Secondary Storage"))
+
+Extend the model to retain a 'year' property used below.
+
+>>> dm = model.defaultModel.copy().addField('year', "int")
+
+Mount a couple of stores.
+
+>>> mp1 = ds.mount("/tmp/store1", {'title' : "Primary Storage", 'indexmanager.model' : dm})
+>>> mp2 = ds.mount("/tmp/store2", {'title' : "Secondary Storage", 'indexmanager.model' : dm})
 
 Create some content on each.
 
@@ -28,15 +35,14 @@ Create some content on each.
 >>> u4 = ds.create({'title' : "Gamma doc", 'author' : "HAL", 'year:number' : 2001, 'mountpoint' : mp2}, tmpData("""Document 4"""))
 
 Now we should be able to discover things about the system properties.
+>>> ds.complete_indexing()
 
 Here we test that we can extract the unique values for certain
 properties.
 
 >>> assert set(ds.get_uniquevaluesfor('author')) == set(['Ben', 'HAL'])
 
-Here we try to gather the values for the property year. We'd expect
-these values to come back as numbers, however in the current
-implementation they are stored as unicode values.
+Here we try to gather the values for the property year.
 
->>> assert set(ds.get_uniquevaluesfor('year')) == set([u'2000', u'2001'])
+>>> assert set(ds.get_uniquevaluesfor('year')) == set([2000, 2001])
diff --git a/tests/query.txt b/tests/query.txt
deleted file mode 100644
index 2c58851..0000000
--- a/tests/query.txt
+++ /dev/null
@@ -1,277 +0,0 @@
-This document outlines the basic usage of the olpc.datastore.query and
-olpc.datastore.model modules. Note that these are not used independently
-of the olpc.datastore.backend which in turn is only accessed through the
-olpc.datastore module. This is intended only to document the innards
-of those modules.
-
->>> import os
->>> assert os.system('rm -rf /tmp/_test_index') == 0
->>> assert os.system('rm -rf /tmp/_test_fulltext') == 0
-
-
-First let's create a query manager
-
->>> from olpc.datastore.query import DefaultQueryManager
->>> from olpc.datastore.model import Property
->>> qm = DefaultQueryManager("/tmp/_test_index", fulltext_repo='/tmp/_test_fulltext')
->>> qm.prepare()
-True
-
-That will create the memory-backed database which will be used in this
-documentation. The call to prepare() is normally invoked by the higher
-level datastore for you.
-
-Because this is a new database there should be nothing in it. We can
-verify this with the call to the find() method.
-
->>> qm.find()
-([], 0)
-
-
-The simplest way to add an entry to the datastore is by passing a
-dictionary of properties to the create method.
-
->>> a = qm.create(dict(title="New Content"))
-
-Find will now return this object.
-
->>> qm.find()
-([<Content id:...>], 1)
-
-We can examine the Properties of this object.
-
->>> a.properties
-[... <TextProperty title:'New Content'>, ...]
-
-This returned a list of all properties on the Content object in which
-case we can find the property by enumeration. The other option is
-using the get_properties call on Content.
-
->>> a.get_properties(key='title')
-[<TextProperty title:'New Content'>]
-
-Using the query manager API we are able to update the
-properties. Using this form automatically synchronizes with the
-database and the property is immediately available. To demonstrate
-that this works let's attach another property.
->>> qm.update(a, dict(author='Benjamin'))
-
-A request for title still returns only the title property.
->>> a.get_properties(key='title')
-[<TextProperty title:'New Content'>]
-
-And a request for author works as expected.
->>> a.get_properties(key='author')
-[<Property author:'Benjamin'>]
-
->>> qm.update(a, dict(foo='bar'))
->>> set([p.key for p in a.properties]) == set(['title', 'mtime', 'ctime', 'language', 'mime_type', 'author', 'foo'])
-True
-
-We could have also passed an id for the content object rather than the
-object itself. A list of properties would have been acceptable in
-place of a dictionary. One thing that is shown here is that a number
-of default properties were added when the Content object was
-created. This is done by default and can be controlled by the
-include_defaults flag to the create() method.
-
-Some of the default Property objects have values which are not of the
-'string' type. 'ctime' and 'mtime' are examples of these.
-
->>> a.get_property('ctime')
-datetime.datetime(...)
-
-We can see that ctime has been mapped to a standard Python
-datetime.datetime instance. olpc.datastore.model includes support for
-'number' and 'datetime' Property types by default. To add support for
-new property types see the olpc.datastore.model.Model.addPropertyType
-method.
-
-Here we want to show that certain types of Properties map to
-specialized implementations automatically based on their type. 'ctime'
-is a DateTime Property and we can verify that it is returned properly
-from the mapping layer with the following.
->>> ctimeProp = a.get_properties(key='ctime')[0]
->>> ctimeProp.type == "date"
-True
->>> type(ctimeProp)
-<class 'olpc.datastore.model.DateProperty'>
-
-Special support is needed to make dates easily addressable within the
-datastore. The properties 'ctime' (creation time) and 'mtime'
-(modification time) are supported. To query on these properties two
-methods are available.
-
->>> qm.find(ctime="2007-01-01 00:00")
-([], 0)
-
-That matches nothing. The other form is to pass a dict with
-'start' and 'end' range boundaries.
->>> import datetime
->>> now = datetime.datetime.now().isoformat()
-
->>> qm.find(ctime=dict(end=now))
-([<Content id:...>], 1)
-
-
-Property keys are unique per Content item. This means that adding a
-Property with an existing key should update the value rather than add
-another Property with the same key.
-
->>> qm.update(a.id, [Property('another', 'property')])
->>> a.get_property('another')
-'property'
-
->>> l1 = len(a.properties)
->>> qm.update(a, dict(another="test"))
->>> a.get_property('another')
-'test'
->>> len(a.properties) == l1
-True
-
-Both forms of passing properties work.
-
->>> l1 = len(a.properties)
->>> qm.update(a, [Property('another', 'value')])
->>> a.get_property('another')
-'value'
->>> len(a.properties) == l1
-True
-
-
-We can also navigate from a Property object to the content to which it
-refers. This is available through the 'content' attribute of
-properties. Only properties bound to content and synchronized with the
-database have this attribute.
-
->>> p = a.get_properties(key='author')[0]
->>> p.content
-<Content id:...>
-
-Let's create additional content.
-
->>> b = qm.create(dict(title="My Picture", author="Sarah", mime_type="image/png"))
->>> c = qm.create(dict(title="My Song", author="Sarah", mime_type="audio/mp3"))
-
-At this point the find() method should be able to provide us with more
-interesting results. Note that find() has multiple
-responsibilities. First it must search the Properties of objects;
-second, it provides access to the full-text index associated with
-content in the datastore.
-
-For now we are only interested in the properties.
-
->>> qm.find(mime_type="image/png") == ([b], 1)
-True
-
->>> qm.find(author="Benjamin") == ([a],1)
-True
-
->>> qm.find(author="Sarah") == ([b, c],2)
-True
-
->>> qm.find(author="Sarah", mime_type="audio/mp3") == ([c], 1)
-True
-
-Passing the special key 'fulltext' to find will pass an expression
-to the full text index engine. The query manager maintains a full-text
-index in parallel to the normal metadata storage. The full text index
-is updated on every create, update and delete call to the query
-manager provided the mime_type Property of the Content is one
-understood by the index.
-
->>> from StringIO import StringIO
->>> qm.update(a, {}, StringIO("this is my content, hear it roar"))
->>> qm.find(fulltext="roar") == ([a], 1)
-True
-
-Combining this with properties also works.
->>> qm.find(fulltext="roar", author="Benjamin") == ([a], 1)
-True
-
-And we can verify the negative as well.
->>> qm.find(fulltext="roar", author="Sarah")
-([], 0)
-
-Calls to update() and create() both take an optional filelike argument
-which will update the full-text indexed content with the new value of
-the file.
-
->>> qm.update(a, filelike=StringIO("different text"))
-
-The new content will be found.
->>> qm.find(fulltext="different", author="Benjamin") == ([a], 1)
-True
-
-And the old content is not.
->>> qm.find(fulltext="roar", author="Benjamin")
-([], 0)
-
-
-Passing a filename for the file works as well. Files can be in a variety
-of binary formats, including PDF.
->>> qm.update(a, filelike="test.doc")
->>> qm.find(fulltext="roar", author="Benjamin")
-([], 0)
->>> qm.find(fulltext="amazed", author="Benjamin") == ([a], 1)
-True
-
-We have converters for DOC, PDF and ODT by default.
-
->>> qm.update(a, filelike="test.pdf")
->>> qm.find(fulltext="peek", author="Benjamin") == ([a], 1)
-True
-
-
->>> qm.update(a, filelike="test.odt")
->>> qm.find(fulltext="amazed", author="Benjamin") == ([a], 1)
-True
-
->>> qm.update(a, dict(title="titled indeed"), filelike="test.doc")
->>> qm.find(fulltext="amazed", author="Benjamin") == ([a], 1)
-True
-
-For the last example we can see that we also updated the title.
Here -we show that because title is a 'text' property, rather than a simple -string its contents will be available to the text indexing engine as -well. - -Searching for a direct match on the property works. ->>> qm.find(title="titled indeed") == ([a], 1) -True - -Doing a search for text internal to the title doesn't however. - ->>> qm.find(title="indeed") == ([a], 1) -False - -Searching for it in the fulltext index does return a result. ->>> qm.find(fulltext="indeed") == ([a], 1) -True - -Searching for only title in fulltext index does return a result as well. ->>> qm.find(fulltext="title:indeed") == ([a], 1) -True - - -Here we show off the get_uniquevaluesfor call examining all the values -used in the 'author' field. - ->>> assert set(qm.get_uniquevaluesfor('author')) == set(['Benjamin', 'Sarah']) - -Now that we can see a set of possible values it might be nice to -select any content with properties from a known set. For example - ->>> r, c = qm.find(author=['Benjamin', 'Sarah']) ->>> assert c == 3 - -By putting the request value in a list we can ask that the value be -'IN' this collection. All participating values are included in this -way. - - -Now for politeness we shut everything down ->>> qm.stop() ->>> import shutil, os ->>> shutil.rmtree('/tmp/_test_fulltext') ->>> os.unlink('/tmp/_test_index.db') diff --git a/tests/runalltests.py b/tests/runalltests.py index bbf0f97..02034b9 100644 --- a/tests/runalltests.py +++ b/tests/runalltests.py @@ -14,15 +14,14 @@ import unittest import doctest from pkg_resources import resource_filename -from sqlalchemy import clear_mappers doctests = [ - resource_filename(__name__, "query.txt"), + resource_filename(__name__, "xapianindex.txt"), resource_filename(__name__, "milestone_1.txt"), resource_filename(__name__, "sugar_demo_may17.txt"), resource_filename(__name__, "milestone_2.txt"), resource_filename(__name__, "mountpoints.txt"), - resource_filename(__name__, "properties.txt") + resource_filename(__name__, "properties.txt"), ] @@ -30,43 +29,30 @@ doctest_options = doctest.ELLIPSIS doctest_options |= doctest.REPORT_ONLY_FIRST_FAILURE -# IF YOU ARE NOT GETTING THE RESULTS YOU EXPECT WHILE TESTING -# THIS IS THE LIKELY CAUSE -# :: Use distutils to modify the pythonpath for inplace testing -# using the build directory -from distutils.util import get_platform -plat_specifier = ".%s-%s" % (get_platform(), sys.version[0:3]) -build_platlib = os.path.join("build", 'lib' + plat_specifier) -test_lib = os.path.join(os.path.abspath(".."), build_platlib) -sys.path.insert(0, test_lib) -# END PATH ADJUSTMENT CODE - - - -def tearDownDS(test): - # reset the module global mappers used in SQLAlchemy between tests - clear_mappers() - # and remove the test repository used in some tests - os.system('rm -rf /tmp/test_ds') - def test_suite(): + global doctests suite = unittest.TestSuite() + if len(sys.argv) > 1: + doctests = sys.argv[1:] + for dt in doctests: suite.addTest(doctest.DocFileSuite(dt, - optionflags=doctest_options, tearDown=tearDownDS)) + optionflags=doctest_options)) - tests = os.listdir(os.curdir) - tests = [n[:-3] for n in tests if n.startswith('test') and - n.endswith('.py')] + if len(sys.argv) <= 1: + tests = os.listdir(os.curdir) + tests = [n[:-3] for n in tests if n.startswith('test') and + n.endswith('.py')] - for test in tests: - m = __import__(test) - if hasattr(m, 'test_suite'): - suite.addTest(m.test_suite()) + for test in tests: + m = __import__(test) + if hasattr(m, 'test_suite'): + suite.addTest(m.test_suite()) return suite if __name__ == 
"__main__": runner = unittest.TextTestRunner(verbosity=1) - runner.run(test_suite()) + suite = test_suite() + runner.run(suite) diff --git a/tests/sugar_demo_may17.txt b/tests/sugar_demo_may17.txt index c899799..f242140 100644 --- a/tests/sugar_demo_may17.txt +++ b/tests/sugar_demo_may17.txt @@ -2,6 +2,7 @@ How Sugar will interact with the DS for the May 17th demo in Argentina: >>> from olpc.datastore import DataStore >>> from olpc.datastore import backingstore +>>> from testutils import waitforindex >>> ds = DataStore() >>> ds.registerBackend(backingstore.FileBackingStore) >>> assert ds.mount("/tmp/test_ds") @@ -9,11 +10,14 @@ How Sugar will interact with the DS for the May 17th demo in Argentina: Create an entry without data: >>> uid = ds.create(dict(title="New entry"), '') +>>> waitforindex(ds) + >>> ds.get_filename(uid) '' Update an entry without data: >>> ds.update(uid, dict(title="New entry still without content"), '') +>>> waitforindex(ds) >>> ds.get_filename(uid) '' @@ -23,6 +27,7 @@ Add some data to the same entry: >>> print >>fp, "some content" >>> fp.close() >>> ds.update(uid, dict(title="Same entry now with some content"), fp.name) +>>> waitforindex(ds) Retrieve that data: >>> fn = ds.get_filename(uid) @@ -36,6 +41,7 @@ Update again: >>> print >>fp, "some other content" >>> fp.close() >>> ds.update(uid, dict(title="Same entry with some other content"), fp.name) +>>> waitforindex(ds) And retrieve again: >>> fn = ds.get_filename(uid) @@ -60,6 +66,7 @@ Set content as pdf: >>> ds.update(uid, dict(title="Same entry with some content in pdf"), 'test.pdf') >>> ds.update(uid, dict(title="Same entry with some content in doc"), 'test.doc') >>> ds.update(uid, dict(title="Same entry with some content in odt"), 'test.odt') +>>> waitforindex(ds) >>> ds.stop() >>> del ds diff --git a/tests/test_backingstore.py b/tests/test_backingstore.py index 28fdeba..8aab9e6 100644 --- a/tests/test_backingstore.py +++ b/tests/test_backingstore.py @@ -1,21 +1,21 @@ import unittest -from StringIO import StringIO +from testutils import tmpData, waitforindex from olpc.datastore import backingstore -from sqlalchemy import clear_mappers import os DEFAULT_STORE = '/tmp/_bs_test' class Test(unittest.TestCase): - def tearDown(self): + def setUp(self): if os.path.exists(DEFAULT_STORE): os.system("rm -rf %s" % DEFAULT_STORE) - clear_mappers() + def tearDown(self): + if os.path.exists(DEFAULT_STORE): + os.system("rm -rf %s" % DEFAULT_STORE) def test_fsstore(self): - clear_mappers() bs = backingstore.FileBackingStore(DEFAULT_STORE) bs.initialize_and_load() bs.create_descriptor() @@ -28,20 +28,28 @@ class Test(unittest.TestCase): d = """This is a test""" d2 = "Different" - c = bs.create(dict(title="A"), StringIO(d)) - obj = bs.get(c.id) + uid = bs.create(dict(title="A"), tmpData(d)) + + waitforindex(bs) + + obj = bs.get(uid) + assert obj.get_property('title') == "A" got = obj.file.read() assert got == d - bs.update(c.id, dict(title="B"), StringIO(d2)) - obj = bs.get(c.id) + bs.update(uid, dict(title="B"), tmpData(d2)) + + waitforindex(bs) + + obj = bs.get(uid) assert obj.get_property('title') == "B" got = obj.file.read() assert got == d2 - bs.delete(c.id) - self.failUnlessRaises(KeyError, bs.get, c.id) + bs.delete(uid) + bs.complete_indexing() + self.failUnlessRaises(KeyError, bs.get, uid) def test_suite(): suite = unittest.TestSuite() diff --git a/tests/test_model.py b/tests/test_model.py index 6e8c896..2ac2fb2 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,35 +1,55 @@ import unittest -from testutils 
import tmpData +from testutils import tmpData, waitforindex from olpc.datastore import DataStore from olpc.datastore import model, backingstore import datetime import os + +DEFAULT_STORE = '/tmp/test_ds' + class Test(unittest.TestCase): + def setUp(self): os.system('rm -rf %s' % DEFAULT_STORE) + def tearDown(self): os.system('rm -rf %s' % DEFAULT_STORE) + def test_dateproperty(self): n = datetime.datetime.now() # we have to kill the microseconds as # time.strptime which we must use in 2.4 doesn't parse it n = n.replace(microsecond=0) - p = model.DateProperty('ctime', n) + p = model.Property('ctime', n, 'date') assert p.key == "ctime" - assert p.value.isoformat() == n.isoformat() + # XXX: the 'date()' is a work around for a missing secore + # feature right now + assert p.value == n.date().isoformat() + def test_binaryproperty(self): ds = DataStore() ds.registerBackend(backingstore.FileBackingStore) - ds.mount('/tmp/test_ds') + + #add a custom field to the model + dm = model.defaultModel.copy().addField('thumbnail', 'binary') + + ds.mount(DEFAULT_STORE, {'indexmanager.model' : dm}) + + data = open('test.jpg', 'r').read() # binary data with \0's in it can cause dbus errors here - uid = ds.create({'title' : "Document 1", 'thumbnail:binary' : data}, - tmpData("with image\0\0 prop")) + fn = tmpData("with image\0\0 prop") + # XXX: We should be able to remove:binary now + uid = ds.create({'title' : "Document 1", 'thumbnail:binary' : data}, fn) + + waitforindex(ds) + c = ds.get(uid) assert c.get_property('thumbnail') == data + ds.stop() - os.system('rm -rf /tmp/test_ds') + def test_suite(): suite = unittest.TestSuite() diff --git a/tests/test_xapianindex.py b/tests/test_xapianindex.py new file mode 100644 index 0000000..db6afef --- /dev/null +++ b/tests/test_xapianindex.py @@ -0,0 +1,90 @@ +from olpc.datastore.xapianindex import IndexManager +import os +from datetime import datetime + +import time +import unittest +import gnomevfs + +DEFAULT_STORE = '/tmp/_xi_test' + + +def index_file(iconn, filepath): + """Index a file.""" + + mimetype = gnomevfs.get_mime_type(filepath) + main, subtype = mimetype.split('/',1) + + stat = os.stat(filepath) + ctime = datetime.fromtimestamp(stat.st_ctime) + mtime = datetime.fromtimestamp(stat.st_mtime) + + if main in ['image']: filepath = None + if subtype in ['x-trash', 'x-python-bytecode']: filepath = None + + + + props = {'mimetype' : mimetype, 'mtime:date' : mtime, + 'ctime:date' : ctime,} + + if filepath: + fn = os.path.split(filepath)[1] + props['filename'] = fn + + iconn.index(props, filepath) + + return 1 + +def index_path(iconn, docpath): + """Index a path.""" + count = 0 + for dirpath, dirnames, filenames in os.walk(docpath): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + index_file(iconn, filepath) + count += 1 + return count + +class Test(unittest.TestCase): + def setUp(self): + if os.path.exists(DEFAULT_STORE): + os.system("rm -rf %s" % DEFAULT_STORE) + + def tearDown(self): + if os.path.exists(DEFAULT_STORE): + os.system("rm -rf %s" % DEFAULT_STORE) + + def test_index(self): + # import a bunch of documents into the store + im = IndexManager() + im.connect(DEFAULT_STORE) + + # test basic index performance + start = time.time() + count = index_path(im, os.getcwd()) + end = time.time() + delta = end - start + + #print "%s in %s %s/sec" % (count, delta, count/delta) + + # wait for indexing to finish + im.complete_indexing() + + # test basic search performance + results = list(im.search('peek')[0]) + + # this indicates that we 
found text inside binary content that
+        # we expected
+        assert 'test.pdf' in set(r.get_property('filename') for r in results)
+
+        assert im.search('mimetype:application/pdf filename:test.pdf peek')[1] == 1
+
+
+def test_suite():
+    suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(Test))
+    return suite
+
+if __name__ == "__main__":
+    unittest.main()
+
diff --git a/tests/testutils.py b/tests/testutils.py
index 243747a..a4efc0a 100644
--- a/tests/testutils.py
+++ b/tests/testutils.py
@@ -7,3 +7,8 @@ def tmpData(data):
     os.write(fd, data)
     os.close(fd)
     return fn
+
+def waitforindex(obj):
+    # wait for any/all index managers associated with object to finish
+    # indexing so that tests can do their thing
+    obj.complete_indexing()
diff --git a/tests/xapianindex.txt b/tests/xapianindex.txt
new file mode 100644
index 0000000..354d7a8
--- /dev/null
+++ b/tests/xapianindex.txt
@@ -0,0 +1,73 @@
+The xapian index module can be used directly as follows.
+
+First clean up any old test data.
+
+>>> index_home = "/tmp/xi"
+>>> import os, sys, time, logging
+>>> assert os.system('rm -rf %s' % index_home) == 0
+
+# >>> logging.basicConfig(level=logging.DEBUG,
+# ...                     format="%(asctime)-15s %(name)s %(levelname)s: %(message)s",
+# ...                     stream=sys.stderr)
+
+
+>>> from olpc.datastore.xapianindex import IndexManager
+>>> from olpc.datastore import model
+>>> im = IndexManager()
+>>> im.connect(index_home)
+
+
+Now add the file to the index.
+
+>>> props = dict(title="PDF Document",
+...              mimetype="application/pdf")
+
+
+>>> uid = im.index(props, "test.pdf")
+
+Let the async indexer do its thing. We ask the indexer if it has work
+left; when it has none, we expect our content to be indexed and searchable.
+
+>>> im.complete_indexing()
+
+
+Searching on a property of the content works.
+>>> def expect(r, count=None):
+...     if count: assert r[1] == count
+...     return list(r[0])
+>>> def expect_single(r):
+...     assert r[1] == 1
+...     return r[0].next()
+>>> def expect_none(r):
+...     assert r[1] == 0
+...     assert list(r[0]) == []
+
+
+>>> assert expect_single(im.search("PDF")).id == uid
+
+Searching into the binary content of the object works as well.
+>>> assert expect_single(im.search("peek")).id == uid
+
+Specifying a search that demands a document term be found only in the
+title works as well.
+
+>>> assert expect_single(im.search('title:PDF')).id == uid
+>>> expect_none(im.search('title:peek'))
+
+Searching for documents that are PDFs works as expected. Here we use
+the dictionary form of the query, where each field name is given and
+creates a search.
+>>> assert expect_single(im.search(dict(mimetype='application/pdf'))).id == uid
+
+Punctuation is fine.
+
+>>> assert expect_single(im.search("Don't peek")).id == uid
+
+As well as quoted strings.
+
+>>> assert expect_single(im.search(r'''"Don't peek"''')).id == uid
+
+Cleanly shut down.
+>>> im.stop()
+
+>>> assert os.system('rm -rf %s' % index_home) == 0
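
Taken together, the pieces above replace the old SQLAlchemy-based query
manager with a single asynchronous Xapian index. The following is a minimal
usage sketch, not part of the patch, based only on the calls exercised in
tests/xapianindex.txt and tests/test_xapianindex.py; the repository path is
illustrative, and test.pdf is assumed to exist in the working directory as
it does in the test suite.

from olpc.datastore.xapianindex import IndexManager

im = IndexManager()
im.connect('/tmp/xi_demo')

# index() queues the work and returns the new uid immediately; the
# daemon indexer thread converts and indexes the file asynchronously.
uid = im.index(dict(title="PDF Document",
                    mimetype="application/pdf"), "test.pdf")

im.complete_indexing()    # block until the queue drains

# free-text and field-prefixed queries both go through search()
results, count = im.search('title:PDF')
for content in results:
    assert content.id == uid

im.stop()

Because indexing is asynchronous, complete_indexing() (or the
waitforindex() helper added to tests/testutils.py) must run before newly
queued documents are expected to be searchable.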