author    Benjamin Saller <bcsaller@objectrealms.net>  2007-07-13 04:33:19 (GMT)
committer Benjamin Saller <bcsaller@objectrealms.net>  2007-07-13 04:33:19 (GMT)
commit    ea06d07882613332afa18e1e0fe2323ba2580b06 (patch)
tree      7f50caccf827f209bebb05171cfb3e816f82d4f8
parent    a9ef157802cd7c3884814bc55b6cff1a5886c66e (diff)
parent    a41d2d5ca21115f3b76a789f5874f35dca089a3d (diff)
Merge branch 'dualxap'
-rwxr-xr-x  bin/datastore-service                        8
-rwxr-xr-x  bin/index-service                          173
-rwxr-xr-x  bin/sample-client.py                        70
-rw-r--r--  etc/Makefile.am                              7
-rw-r--r--  etc/org.laptop.sugar.Indexer.service.in      3
-rw-r--r--  src/olpc/datastore/__init__.py               4
-rw-r--r--  src/olpc/datastore/backingstore.py         118
-rw-r--r--  src/olpc/datastore/converter.py              4
-rw-r--r--  src/olpc/datastore/datastore.py             97
-rw-r--r--  src/olpc/datastore/indexer.py               47
-rw-r--r--  src/olpc/datastore/model.py                528
-rw-r--r--  src/olpc/datastore/query.py                642
-rw-r--r--  src/olpc/datastore/xapianindex.py          390
-rw-r--r--  tests/Makefile                               4
-rwxr-xr-x  tests/cleaner.py                            40
-rw-r--r--  tests/milestone_1.txt                       24
-rw-r--r--  tests/milestone_2.txt                       16
-rw-r--r--  tests/mountpoints.txt                       15
-rw-r--r--  tests/properties.txt                        22
-rw-r--r--  tests/query.txt                            277
-rw-r--r--  tests/runalltests.py                        48
-rw-r--r--  tests/sugar_demo_may17.txt                   7
-rw-r--r--  tests/test_backingstore.py                  30
-rw-r--r--  tests/test_model.py                         34
-rw-r--r--  tests/test_xapianindex.py                   90
-rw-r--r--  tests/testutils.py                           5
-rw-r--r--  tests/xapianindex.txt                       73
27 files changed, 1097 insertions(+), 1679 deletions(-)
diff --git a/bin/datastore-service b/bin/datastore-service
index 4300619..532516b 100755
--- a/bin/datastore-service
+++ b/bin/datastore-service
@@ -4,7 +4,6 @@ import gobject
import dbus.service
import dbus.mainloop.glib
from olpc.datastore import DataStore, DS_LOG_CHANNEL, backingstore
-from olpc.datastore.indexer import INDEX_SERVICE, INDEX_OBJECT_PATH
import logging
SYNC_INDEX = True
@@ -32,8 +31,6 @@ logging.basicConfig(level=logging.DEBUG,
filename = filename,
)
# disable subsystem logging except where critical
-logging.getLogger('sqlalchemy').setLevel(logging.CRITICAL)
-logging.getLogger('lemur').setLevel(logging.CRITICAL)
logger = logging.getLogger(DS_LOG_CHANNEL)
# check for old lockfiles, the rules here are that we can't be
@@ -53,7 +50,7 @@ bus = dbus.SessionBus()
ds = DataStore()
ds.registerBackend(backingstore.FileBackingStore)
ds.registerBackend(backingstore.InplaceFileBackingStore)
-ds.mount(repo_dir, {'querymanager_sync_index': SYNC_INDEX})
+ds.mount(repo_dir, {'indexmanager.sync_index': SYNC_INDEX})
# and run it
logger.info("Starting Datastore %s" % (repo_dir))
@@ -68,9 +65,6 @@ signal.signal(signal.SIGHUP, handle_shutdown)
signal.signal(signal.SIGTERM, handle_shutdown)
def main():
- if SYNC_INDEX is False:
- indexer = bus.get_object(INDEX_SERVICE, INDEX_OBJECT_PATH)
-
try: mainloop.run()
except KeyboardInterrupt:
ds.stop()
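
This hunk moves the mount options from the old querymanager_* prefix to the new indexmanager.* namespace. A minimal sketch of standing up a store with the new key, using only the objects the service script itself imports (the exact option set ultimately accepted by the IndexManager is an assumption):

    from olpc.datastore import DataStore, backingstore

    ds = DataStore()
    ds.registerBackend(backingstore.FileBackingStore)
    # keys under the 'indexmanager.' prefix are split off and handed to
    # the mount point's IndexManager when its index is connected
    ds.mount('/tmp/test-repo', {'indexmanager.sync_index': True})
    ds.complete_indexing()  # block until queued index work has settled
    ds.stop()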
diff --git a/bin/index-service b/bin/index-service
deleted file mode 100755
index a2ff83c..0000000
--- a/bin/index-service
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env python
-
-""" Async index service for the Datastore.
-
-Subscribes to the create/update/delete messages of the Datastore and
-performs the indexing. When this service is enabled the Datastore
-access the Xapian repository in read only mode.
-"""
-
-
-try: from ore.main import Application
-except ImportError: Application = object
-
-from olpc.datastore.datastore import DS_SERVICE, DS_OBJECT_PATH
-from olpc.datastore.datastore import DS_DBUS_INTERFACE
-from olpc.datastore.indexer import Indexer
-import dbus
-import dbus.mainloop.glib
-import logging
-import sys
-import os
-import signal
-
-profile = os.environ.get('SUGAR_PROFILE', 'default')
-base_dir = os.path.join(os.path.expanduser('~'), '.sugar', profile)
-repo_dir = os.path.join(base_dir, 'datastore')
-fulltext_dir = os.path.join(repo_dir, 'fulltext')
-
-log_dir = os.path.join(base_dir, "logs")
-if not os.path.exists(log_dir): os.makedirs(log_dir)
-
-os.chdir(repo_dir)
-
-# setup logger
-filename = None
-if not sys.stdin.isatty():
- filename = os.path.join(log_dir, "indexer.log")
-logging.basicConfig(level=logging.DEBUG,
- format="%(asctime)-15s %(levelname)s: %(message)s",
- filename = filename,
- )
-
-logger = logging.getLogger('org.laptop.sugar.Indexer')
-logger.setLevel(logging.DEBUG)
-
-class IndexService(Application):
- def manage_options(self):
- self.parser.add_option("--olpc.fulltext.repo",
- dest="fulltext_dir",
- action="store", default='fulltext',
- help="""Location of the FullText Repository""")
-
-
- def main(self):
- logging.debug('Starting the index service at %s' % self.options.fulltext_dir)
- dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
- bus = dbus.SessionBus()
- self.fulltext = Indexer(self.options.fulltext_dir)
- self.fulltext.use_fulltext = True
-
- ds = bus.get_object(DS_SERVICE, DS_OBJECT_PATH)
- self.ds = dbus.Interface(ds, dbus_interface=DS_DBUS_INTERFACE)
-
- self.ds.connect_to_signal("Created", self.created,
- dbus_interface=DS_DBUS_INTERFACE)
-
- self.ds.connect_to_signal("Updated", self.updated,
- dbus_interface=DS_DBUS_INTERFACE)
-
- self.ds.connect_to_signal("Deleted", self.deleted,
- dbus_interface=DS_DBUS_INTERFACE)
-
-
- self.ds.connect_to_signal("Stopped", self.stopped,
- dbus_interface=DS_DBUS_INTERFACE)
-
- self.eventloop.run()
-
- def get_textprops(self, uid):
- # text properties also get full text indexing
- # currently this is still searched with the 'fulltext'
- # parameter of find()
- textprops = {}
- for k,v in self.ds.get_properties(uid, dict(type='text')).items():
- textprops[str(k)] = v and str(v) or ''
- return textprops
-
- def created(self, uid):
- """An object was created on the bus and we want to index it"""
- # because the file isn't encoded anywhere accessible in the
- # create call we must actually get the filename and trigger
- # the indexing on that
- filename = self.ds.get_filename(uid)
- r = None
- if filename:
- mime_type = self.ds.get_properties(uid, {}).get('mime_type', None)
- r = self.fulltext.fulltext_index(uid, filename, mime_type,
- self.get_textprops(uid))
- if r is True:
- logger.debug("index creation of %s" % uid)
- elif r is False:
- logger.debug("unable to index creation of %s" % uid)
- else:
- logger.debug("nothing to index on creation of %s" % uid)
-
- def updated(self, uid):
- """An object was updated on the bus and we want to index it"""
- # because the file isn't encoded anywhere accessible in the
- # create call we must actually get the filename and trigger
- # the indexing on that
- filename = self.ds.get_filename(uid)
- r = None
- if filename:
- mime_type = self.ds.get_properties(uid, {}).get('mime_type',
- None)
- r = self.fulltext.fulltext_index(uid, filename, mime_type,
- self.get_textprops(uid))
- if r is True:
- logger.debug("index update of %s" % uid)
- elif r is False:
- logger.debug("unable to index update of %s" % uid)
- else:
- logger.debug("nothing to index on update of %s" % uid)
-
-
- def deleted(self, uid):
- """An object was updated on the bus and we want to index it"""
- # because the file isn't encoded anywhere accessible in the
- # create call we must actually get the filename and trigger
- # the indexing on that
- try:
- self.fulltext.fulltext_unindex(uid)
- logger.debug("unindex deletion of %s" % uid);
- except KeyError: pass
-
-
- def stopped(self):
- """Respond to the datastore being stopped by shutting down
- ourselves"""
- self.fulltext.stop()
- self.eventloop.quit()
-
-
-if __name__ == "__main__":
- def handle_shutdown(signum, frame):
- idx.stopped()
- print "shutdown cleanly"
- raise SystemExit("Shutting down on signal %s" % signum)
-
- signal.signal(signal.SIGHUP, handle_shutdown)
- signal.signal(signal.SIGTERM, handle_shutdown)
-
- idx = IndexService()
- #idx()
- # w/o ore.main
-
- import gobject
- idx.eventloop = gobject.MainLoop()
- class options(object): pass
- o = options()
- o.fulltext_dir = 'fulltext'
- idx.options = o
- try:
- idx.main()
- except:
- # force logging this one
- logger.setLevel(logging.DEBUG)
- logger.debug("Problem in index service",
- exc_info=sys.exc_info())
- idx.stopped()
-
-
-
diff --git a/bin/sample-client.py b/bin/sample-client.py
index bd609a7..2a2b19c 100755
--- a/bin/sample-client.py
+++ b/bin/sample-client.py
@@ -1,8 +1,6 @@
#!/usr/bin/env python
-from ore.main import Application
import dbus
import os
-import time
def main():
bus = dbus.SessionBus()
@@ -10,36 +8,62 @@ def main():
"/org/laptop/sugar/DataStore")
uid = datastore.create(dict(title="from dbus", author="Benjamin"), os.path.abspath('tests/test.pdf'))
- print "created uid", uid
+ print "created uid", uid, "with binary content"
-
- #for u in datastore.find()[0]:
- # datastore.delete(u['uid'])
- #return
- # let the async indexer run
- time.sleep(1.2)
- #import pdb;pdb.set_trace()
- print "find", datastore.find(dict(author="Benjamin", title="from"))
+ datastore.complete_indexing()
+
res, count = datastore.find(dict(fulltext="peek"))
- if not res:
- print "unable to index content"
- #return
- print "bcsaller", [item['uid'] for item in res]
+ assert count == 1, "failed to index content"
+ assert res[0]['uid'] == uid, "returned incorrect results"
+ print "found inside binary file :: PDF"
+
+ assert datastore.find(dict(fulltext="kfdshaksjd"))[1] == 0
+ print "successfully ignored bad searches"
+
+ # try the other mimetypes
+ datastore.update(uid, dict(title="updated title",
+ mime_type="application/msword"),
+ os.path.abspath('tests/test.doc'))
- print "huh?", datastore.find(dict(fulltext="kfdshaksjd"))
+ datastore.complete_indexing()
+
+ assert datastore.find(dict(fulltext="inside"))[0][0]['uid'] == uid
+ print "found in binary file :: WORD"
+
+ datastore.update(uid, dict(title="another updated title",
+ mime_type="application/vnd.oasis.opendocument.text"),
+ os.path.abspath('tests/test.odt'))
+ datastore.complete_indexing()
+
+ assert datastore.find(dict(fulltext="amazed"))[0][0]['uid'] == uid
+ print "found in binary file :: ODT"
- # try the other mimetypes
- datastore.update(uid, dict(title="updated title", mime_type="application/msword"), os.path.abspath('tests/test.doc'))
- print datastore.find(dict(fulltext="inside"))
- datastore.update(uid, dict(title="another updated title", mime_type="application/vnd.oasis.opendocument.text"), os.path.abspath('tests/test.odt'))
- print datastore.find(dict(fulltext="amazed"))
datastore.get_properties(uid)
- print "title in fulltext", datastore.find(dict(fulltext="another"))
-
+ assert datastore.find(dict(title="another"))[0][0]['uid'] == uid
+ print "found title using dict params",
+
+ assert datastore.find("another")[0][0]['uid'] == uid
+ print "found title in search of all fields (as text)"
+
+
+ assert datastore.find('title:"another"')[0][0]['uid'] == uid
+ print "field in query field:'value' "
+
datastore.delete(uid)
+ datastore.complete_indexing()
+
+ print "deleted", uid
+ try: datastore.get_properties(uid)
+ except: pass
+ else:
+ print "Found deleted value... oops"
+ raise KeyError(uid)
+
+ print "ALL GOOD"
if __name__ == '__main__':
+ #from ore.main import Application
#a = Application("client", main)
#a.plugins.append('ore.main.profile_support.ProfileSupport')
#a()
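
The reworked client exercises three interchangeable query forms against the new index; condensed, they are (datastore and uid as created above):

    # dict form: exact match on a named property
    res, count = datastore.find(dict(title="another"))
    # plain string: free-text search across all indexed fields
    res, count = datastore.find("another")
    # query-string form: field-scoped search as field:"value"
    res, count = datastore.find('title:"another"')

Each call returns a (results, count) pair; calling datastore.complete_indexing() first ensures the query sees any writes still sitting in the index queue.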
diff --git a/etc/Makefile.am b/etc/Makefile.am
index 1d8a54c..a9b28b1 100644
--- a/etc/Makefile.am
+++ b/etc/Makefile.am
@@ -1,15 +1,12 @@
servicedir = $(datadir)/dbus-1/services
service_in_files = \
- org.laptop.sugar.DataStore.service.in \
- org.laptop.sugar.Indexer.service.in
+ org.laptop.sugar.DataStore.service.in
+
service_DATA = $(service_in_files:.service.in=.service)
org.laptop.sugar.DataStore.service: org.laptop.sugar.DataStore.service.in
@sed -e "s|\@bindir\@|$(bindir)|" $< > $@
-org.laptop.sugar.Indexer.service: org.laptop.sugar.Indexer.service.in
- @sed -e "s|\@bindir\@|$(bindir)|" $< > $@
-
DISTCLEANFILES = $(service_DATA)
EXTRA_DIST = $(service_in_files)
diff --git a/etc/org.laptop.sugar.Indexer.service.in b/etc/org.laptop.sugar.Indexer.service.in
deleted file mode 100644
index fb0a7ec..0000000
--- a/etc/org.laptop.sugar.Indexer.service.in
+++ /dev/null
@@ -1,3 +0,0 @@
-[D-BUS Service]
-Name = org.laptop.sugar.Indexer
-Exec = @bindir@/index-service
diff --git a/src/olpc/datastore/__init__.py b/src/olpc/datastore/__init__.py
index d38dcff..fd38d75 100644
--- a/src/olpc/datastore/__init__.py
+++ b/src/olpc/datastore/__init__.py
@@ -1,7 +1,5 @@
# datastore package
+from olpc.datastore.datastore import DataStore, DS_LOG_CHANNEL
-from olpc.datastore.datastore import DataStore, DS_LOG_CHANNEL
-from olpc.datastore.backingstore import FileBackingStore
-from olpc.datastore.query import DefaultQueryManager
diff --git a/src/olpc/datastore/backingstore.py b/src/olpc/datastore/backingstore.py
index b0a05ad..354426e 100644
--- a/src/olpc/datastore/backingstore.py
+++ b/src/olpc/datastore/backingstore.py
@@ -17,7 +17,7 @@ import re
import subprocess
import time
-from olpc.datastore import query
+from olpc.datastore.xapianindex import IndexManager
from olpc.datastore import utils
# changing this pattern impacts _targetFile
@@ -75,7 +75,7 @@ class BackingStore(object):
def load(self):
"""load the index for a given mount-point, then initialize its
fulltext subsystem. This is the routine that will bootstrap
- the querymanager (though create() may have just created it)
+ the indexmanager (though create() may have just created it)
"""
pass
@@ -121,11 +121,11 @@ class FileBackingStore(BackingStore):
""" FileSystemStore(path=<root of managed storage>)
"""
self.options = kwargs
- self.local_querymanager = self.options.get('local_querymanager', True)
+ self.local_indexmanager = self.options.get('local_indexmanager', True)
self.uri = uri
self.base = os.path.join(uri, self.STORE_NAME)
- self.querymanager = None
+ self.indexmanager = None
# Informational
def descriptor(self):
@@ -190,47 +190,40 @@ class FileBackingStore(BackingStore):
if not os.path.exists(self.base):
os.makedirs(self.base)
- # examine options and see what the querymanager plan is
- if self.local_querymanager:
- # create a local storage using the querymanager
+ # examine options and see what the indexmanager plan is
+ if self.local_indexmanager:
+ # create a local storage using the indexmanager
# otherwise we will connect the global manager
# in load
index_name = os.path.join(self.base, self.INDEX_NAME)
- options = utils.options_for(self.options, 'querymanager_')
- if 'fulltext_repo' not in options:
- options['fulltext_repo'] = os.path.join(self.base,
- query.DefaultQueryManager.FULLTEXT_NAME)
-
- qm = query.DefaultQueryManager(index_name, **options)
+ options = utils.options_for(self.options, 'indexmanager.')
+ im = IndexManager()
# This will ensure the fulltext and so on are all assigned
- qm.bind_to(self)
- qm.prepare()
+ im.bind_to(self)
+ im.connect(index_name, **options)
self.create_descriptor(**options)
- self.querymanager = qm
+ self.indexmanager = im
def load(self):
- if not self.querymanager and self.local_querymanager:
- # create a local storage using the querymanager
+ if not self.indexmanager and self.local_indexmanager:
+ # create a local storage using the indexmanager
# otherwise we will connect the global manager
# in load
index_name = os.path.join(self.base, self.INDEX_NAME)
- options = utils.options_for(self.options, 'querymanager_')
- if 'fulltext_repo' not in self.options:
- options['fulltext_repo'] = os.path.join(self.base,
- query.DefaultQueryManager.FULLTEXT_NAME)
-
- qm = query.DefaultQueryManager(index_name, **options)
+ options = utils.options_for(self.options, 'indexmanager.')
+ im = IndexManager()
desc = utils.options_for(self.options,
- 'querymanager_', invert=True)
+ 'indexmanager.',
+ invert=True)
if desc: self.create_descriptor(**desc)
# This will ensure the fulltext and so on are all assigned
- qm.bind_to(self)
- qm.prepare()
+ im.bind_to(self)
+ im.connect(index_name)
- self.querymanager = qm
+ self.indexmanager = im
def bind_to(self, datastore):
## signal from datastore that we are being bound to it
@@ -283,7 +276,7 @@ class FileBackingStore(BackingStore):
# env would contain things like cwd if we wanted to map to a
# known space
- content = self.querymanager.get(uid)
+ content = self.indexmanager.get(uid)
# we need to map a copy of the content from the backingstore into the
# activities addressable space.
# map this to a rw file
@@ -316,7 +309,7 @@ class FileBackingStore(BackingStore):
fp.write(line)
fp.close()
if verify:
- content = self.querymanager.get(uid)
+ content = self.indexmanager.get(uid)
content.checksum = c.hexdigest()
def _checksum(self, filename):
@@ -329,18 +322,18 @@ class FileBackingStore(BackingStore):
# File Management API
def create(self, props, filelike):
- content = self.querymanager.create(props, filelike)
+ uid = self.indexmanager.index(props, filelike)
filename = filelike
if filelike:
if isinstance(filelike, basestring):
# lets treat it as a filename
filelike = open(filelike, "r")
filelike.seek(0)
- self._writeContent(content.id, filelike, replace=False)
- return content
+ self._writeContent(uid, filelike, replace=False)
+ return uid
def get(self, uid, env=None, allowMissing=False):
- content = self.querymanager.get(uid)
+ content = self.indexmanager.get(uid)
if not content: raise KeyError(uid)
path = self._translatePath(uid)
fp = None
@@ -352,7 +345,9 @@ class FileBackingStore(BackingStore):
return self._mapContent(uid, fp, path, env)
def update(self, uid, props, filelike=None):
- self.querymanager.update(uid, props, filelike)
+ if 'uid' not in props: props['uid'] = uid
+
+ self.indexmanager.index(props, filelike)
filename = filelike
if filelike:
if isinstance(filelike, basestring):
@@ -365,7 +360,7 @@ class FileBackingStore(BackingStore):
self._writeContent(uid, filelike)
def delete(self, uid, allowMissing=True):
- self.querymanager.delete(uid)
+ self.indexmanager.delete(uid)
path = self._translatePath(uid)
if os.path.exists(path):
os.unlink(path)
@@ -374,21 +369,23 @@ class FileBackingStore(BackingStore):
raise KeyError("object for uid:%s missing" % uid)
def get_uniquevaluesfor(self, propertyname):
- return self.querymanager.get_uniquevaluesfor(propertyname)
+ return self.indexmanager.get_uniquevaluesfor(propertyname)
def find(self, query):
- return self.querymanager.find(query)
+ return self.indexmanager.search(query)
def stop(self):
- self.querymanager.stop()
-
+ self.indexmanager.stop()
+
+ def complete_indexing(self):
+ self.indexmanager.complete_indexing()
class InplaceFileBackingStore(FileBackingStore):
"""Like the normal FileBackingStore this Backingstore manages the
storage of files, but doesn't move files into a repository. There
are no working copies. It simply adds index data through its
- querymanager and provides fulltext ontop of a regular
+ indexmanager and provides fulltext on top of a regular
filesystem. It does record its metadata relative to this mount
point.
@@ -434,45 +431,48 @@ class InplaceFileBackingStore(FileBackingStore):
for fn in filenames:
source = os.path.join(dirpath, fn)
relative = source[len(self.uri)+1:]
- result, count = self.querymanager.find(dict(filename=relative))
+
+ result, count = self.indexmanager.search(dict(filename=relative))
if not count:
# create a new record
self.create(dict(filename=relative), source)
else:
# update the object with the new content iff the
# checksum is different
- # XXX: what if there is more than one? (shouldn't happen)
- content = result[0]
- uid = content
+ # XXX: what if there is more than one? (shouldn't
+ # happen)
+ content = result.next()
+ uid = content.id
# only if the checksum is different
- checksum = self._checksum(source)
- if checksum != content.checksum:
- self.update(uid, dict(filename=relative), source)
-
- #self.querymanager.index.flush()
+ #checksum = self._checksum(source)
+ #if checksum != content.checksum:
+ self.update(uid, dict(filename=relative), source)
-
+ if self.options.get('sync_mount', False):
+ self.complete_indexing()
+
# File Management API
def create(self, props, filelike):
# the file would have already been changed inplace
# don't touch it
- return self.querymanager.create(props, filelike)
+ return self.indexmanager.index(props, filelike)
def get(self, uid, env=None, allowMissing=False):
- content = self.querymanager.get(uid)
+ content = self.indexmanager.get(uid)
if not content: raise KeyError(uid)
return content.get_property('filename')
def update(self, uid, props, filelike=None):
# the file would have already been changed inplace
# don't touch it
- self.querymanager.update(uid, props, filelike)
+ props['uid'] = uid
+ self.indexmanager.index(props, filelike)
- def delete(self, uid, allowMissing=True):
- c = self.querymanager.get(uid)
- path = c.get_property('filename')
- self.querymanager.delete(uid)
- if os.path.exists(path):
+ def delete(self, uid):
+ c = self.indexmanager.get(uid)
+ path = c.get_property('filename', None)
+ self.indexmanager.delete(uid)
+ if path and os.path.exists(path):
os.unlink(path)
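
With the QueryManager gone, create() hands back the uid string rather than a Content object, and get()/find()/delete() all route through the mount point's IndexManager. A rough sketch of the new round trip against an already-initialized FileBackingStore bs (setup elided, paths illustrative):

    uid = bs.create(dict(title='hello'), '/tmp/hello.txt')
    bs.complete_indexing()   # drain the async index queue
    content = bs.get(uid)    # Content proxy built from the xapian document
    results, count = bs.find(dict(title='hello'))
    bs.delete(uid)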
diff --git a/src/olpc/datastore/converter.py b/src/olpc/datastore/converter.py
index 1250dbb..6f0ede6 100644
--- a/src/olpc/datastore/converter.py
+++ b/src/olpc/datastore/converter.py
@@ -95,11 +95,13 @@ class Converter(object):
# maps both extension -> plugin
# and mimetype -> plugin
self._converters = {}
+ self._default = None
self.logger = logging.getLogger('org.laptop.sugar.Indexer')
def registerConverter(self, ext_or_mime, plugin):
if plugin.verify():
self._converters[ext_or_mime] = plugin
+ if self._default is None: self._default = plugin
def __call__(self, filename, encoding=None, mimetype=None):
"""Convert filename's content to utf-8 encoded text."""
@@ -119,6 +121,8 @@ class Converter(object):
converter = self._converters.get(mt)
if not converter:
converter = self._converters.get(ext)
+ if not converter:
+ converter = self._default
if converter:
try:
return converter(filename)
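
The converter change adds a last-resort fallback: when neither the mimetype nor the extension resolves to a plugin, the first plugin ever registered is used. A hypothetical registration, assuming the module-level converter instance and the plugin protocol visible above (a verify() capability check plus a call taking a filename and returning a utf-8 text stream):

    import codecs
    from olpc.datastore.converter import converter

    class PlainText(object):
        """Hypothetical pass-through plugin for plain text files."""
        def verify(self):
            # report whether this plugin's dependencies are present
            return True
        def __call__(self, filename):
            return codecs.open(filename, 'r', 'utf-8')

    converter.registerConverter('.txt', PlainText())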
diff --git a/src/olpc/datastore/datastore.py b/src/olpc/datastore/datastore.py
index 142d801..9121be9 100644
--- a/src/olpc/datastore/datastore.py
+++ b/src/olpc/datastore/datastore.py
@@ -18,8 +18,6 @@ import dbus.mainloop.glib
from olpc.datastore import utils
-from StringIO import StringIO
-
# the name used by the logger
DS_LOG_CHANNEL = 'org.laptop.sugar.DataStore'
@@ -68,14 +66,11 @@ class DataStore(dbus.service.Object):
# medium (maybe an SD card for example) and we'd want to keep
# that on the XO itself. In these cases their might be very
# little identifying information on the media itself.
-
uri = str(uri)
- _options = {}
- if options:
- for key, value in options.iteritems():
- _options[str(key)] = str(value)
-
+ _options = utils._convert(options)
+ if _options is None: _options = {}
+
mp = self.connect_backingstore(uri, **_options)
if not mp: return ''
if mp.id in self.mountpoints:
@@ -116,14 +111,28 @@ class DataStore(dbus.service.Object):
## sticks and so on. We provide a facility for tracking
## co-authors of content
## there are associated changes to 'find' to resolve buddies
- def addBuddy(self, id, name, fg_color, bg_color):
- pass
+ def addBuddy(self, id, name, fg_color, bg_color, mountpoint=None):
+ mp = None
+ if mountpoint is None: mp = self.root
+ else: mp = self.mountpoints.get(mountpoint)
+ if mp is None: raise ValueError("Invalid mountpoint")
+ mp.addBuddy(id, name, fg_color, bg_color)
+
+ def getBuddy(self, bid):
+ """Get a buddy by its id"""
+ b = None
+ for mp in self.mountpoints.itervalues():
+ b = mp.getBuddy(bid)
+ if b: break
+ return b
- def getBuddy(self, id):
- pass
def buddies(self):
- pass
+ buddies = set()
+ for mp in self.mountpoints.itervalues():
+ buddies = buddies.union(mp.getBuddies())
+ return buddies
+
## end buddy api
@@ -146,7 +155,7 @@ class DataStore(dbus.service.Object):
def _resolveMountpoint(self, mountpoint=None):
if isinstance(mountpoint, dict):
- mountpoint = mountpoint.get('mountpoint')
+ mountpoint = mountpoint.pop('mountpoint', None)
if mountpoint is not None:
# this should be the id of a mount point
@@ -173,26 +182,15 @@ class DataStore(dbus.service.Object):
over this process can come at a later time.
"""
mp = self._resolveMountpoint(props)
- content = mp.create(props, filelike)
- self.Created(content.id)
- logging.debug("created %s" % content.id)
+ uid = mp.create(props, filelike)
+ self.Created(uid)
+ logging.debug("created %s" % uid)
- return content.id
+ return uid
@dbus.service.signal(DS_DBUS_INTERFACE, signature="s")
def Created(self, uid): pass
-
- @dbus.service.method(DS_DBUS_INTERFACE,
- in_signature='',
- out_signature='as')
- def all(self):
- # workaround for not having optional args or None in
- # DBus .. blah
- results = self.querymanager.find()
- return [r.id for r in results]
-
-
def _multiway_search(self, query):
mountpoints = query.pop('mountpoints', self.mountpoints)
mountpoints = [self.mountpoints[str(m)] for m in mountpoints]
@@ -306,9 +304,8 @@ class DataStore(dbus.service.Object):
d = []
for r in results:
props = {}
- for prop in r.get_properties():
- props[prop.key] = prop.marshall()
-
+ props.update(r.properties)
+
if 'uid' not in props:
props['uid'] = r.id
@@ -317,7 +314,7 @@ class DataStore(dbus.service.Object):
filename = ''
if include_files :
- try: filename = self.backingstore.get(r.id).filename
+ try: filename = r.filename
except KeyError: pass
props['filename'] = filename
d.append(props)
@@ -344,25 +341,13 @@ class DataStore(dbus.service.Object):
except AttributeError: pass
return ''
- def get_data(self, uid):
- content = self.get(uid)
- if content:
- return content.get_data()
-
- def put_data(self, uid, data):
- self.update(uid, None, StringIO(data))
-
#@utils.sanitize_dbus
@dbus.service.method(DS_DBUS_INTERFACE,
- in_signature='sa{sv}',
+ in_signature='s',
out_signature='a{sv}')
- def get_properties(self, uid, query=None):
+ def get_properties(self, uid):
content = self.get(uid)
- dictionary = {}
- if not query: query = {}
- for prop in content.get_properties(**query):
- dictionary[prop.key] = prop.marshall()
- return dictionary
+ return content.properties
@dbus.service.method(DS_DBUS_INTERFACE,
in_signature='sa{sv}',
@@ -372,7 +357,7 @@ class DataStore(dbus.service.Object):
mountpoints = query.pop('mountpoints', self.mountpoints)
mountpoints = [self.mountpoints[str(m)] for m in mountpoints]
results = set()
-
+
for mp in mountpoints:
result = mp.get_uniquevaluesfor(propertyname)
results = results.union(result)
@@ -405,8 +390,8 @@ class DataStore(dbus.service.Object):
content = self.get(uid)
if content:
content.backingstore.delete(uid)
- self.Deleted(content.id)
- logger.debug("deleted %s" % content.id)
+ self.Deleted(uid)
+ logger.debug("deleted %s" % uid)
@dbus.service.signal(DS_DBUS_INTERFACE, signature="s")
def Deleted(self, uid): pass
@@ -421,4 +406,12 @@ class DataStore(dbus.service.Object):
@dbus.service.signal(DS_DBUS_INTERFACE)
def Stopped(self): pass
-
+ @dbus.service.method(DS_DBUS_INTERFACE,
+ in_signature='',
+ out_signature='')
+ def complete_indexing(self):
+ """Block waiting for all queued indexing operations to
+ complete. Used mostly in testing"""
+ for mp in self.mountpoints.itervalues():
+ mp.complete_indexing()
+
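
complete_indexing() is exported over DBus precisely so tests can block until the indexer settles instead of sleeping. A rough client-side sketch (service name, object path, and interface as used elsewhere in this commit; the empty string stands in for "no file"):

    import dbus

    bus = dbus.SessionBus()
    obj = bus.get_object("org.laptop.sugar.DataStore",
                         "/org/laptop/sugar/DataStore")
    ds = dbus.Interface(obj, "org.laptop.sugar.DataStore")
    uid = ds.create(dict(title="sample"), "")
    ds.complete_indexing()  # returns once every mount point's queue drains
    print ds.find(dict(title="sample"))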
diff --git a/src/olpc/datastore/indexer.py b/src/olpc/datastore/indexer.py
deleted file mode 100644
index de7ef33..0000000
--- a/src/olpc/datastore/indexer.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-indexer
-~~~~~~~~~~~~~~~~~~~~
-fulltext index module
-
-"""
-
-__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>'
-__docformat__ = 'restructuredtext'
-__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
-__license__ = 'The GNU Public License V2+'
-
-
-# the name used by the logger
-import logging
-import dbus.service
-import dbus.mainloop.glib
-from olpc.datastore.query import XapianFulltext
-
-INDEX_LOG_CHANNEL = 'org.laptop.sugar.Indexer'
-
-INDEX_SERVICE = "org.laptop.sugar.Indexer"
-INDEX_DBUS_INTERFACE = "org.laptop.sugar.Indexer"
-INDEX_OBJECT_PATH = "/org/laptop/sugar/Indexer"
-
-logger = logging.getLogger(INDEX_LOG_CHANNEL)
-
-class Indexer(dbus.service.Object, XapianFulltext):
- # This object doesn't really publish an interface right now
- # Its a bus object so that dbus can start it automatically
- # when the datastore requests a binding to it
- def __init__(self, repo='fulltext'):
- # global handle to the main look
- dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
- session_bus = dbus.SessionBus()
-
- self.bus_name = dbus.service.BusName(INDEX_SERVICE,
- bus=session_bus,
- replace_existing=True,
- allow_replacement=True)
- dbus.service.Object.__init__(self, self.bus_name, INDEX_OBJECT_PATH)
-
-
- self.connect_fulltext(repo, read_only=False)
-
-
-
diff --git a/src/olpc/datastore/model.py b/src/olpc/datastore/model.py
index 8c8ab05..18d409f 100644
--- a/src/olpc/datastore/model.py
+++ b/src/olpc/datastore/model.py
@@ -10,17 +10,11 @@ __docformat__ = 'restructuredtext'
__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
__license__ = 'The GNU Public License V2+'
-from sqlalchemy import Table, Column, UniqueConstraint
-from sqlalchemy import String, Integer, Unicode
-from sqlalchemy import ForeignKey, Sequence, Index
-from sqlalchemy import mapper, relation
-from sqlalchemy import create_session
-from sqlalchemy import MapperExtension, EXT_PASS, clear_mappers
-
import datetime
import mimetypes
import os
import time
+import warnings
# XXX: Open issues
# list properties - Contributors (a, b, c)
@@ -28,51 +22,139 @@ import time
# content state - searches don't include content deletion flag
# - not recording if content is on other storage yet
-
-# we have a global thread local session factory
-context = {}
propertyTypes = {}
_marker = object()
-def get_session(backingstore):
- return context[backingstore]
+def registerPropertyType(kind, get, set, xapian_sort_type=None, defaults=None):
+ propertyTypes[kind] = PropertyImpl(get, set, xapian_sort_type, defaults)
-def registerPropertyType(kind, class_): propertyTypes[kind] = class_
def propertyByKind(kind): return propertyTypes[kind]
+class PropertyImpl(object):
+ __slots__ = ('_get', '_set', 'xapian_sort_type', 'defaults')
+
+ def __init__(self, get, set, xapian_sort_type=None, defaults=None):
+ self._get, self._set = get, set
+ self.xapian_sort_type = xapian_sort_type
+ self.defaults = defaults
+
+ def get(self, value): return self._get(value)
+ def set(self, value): return self._set(value)
+
+class Property(object):
+ """Light-weight property implementation.
+ Handles typed properties via a global registry of type->callbacks
+
+ >>> p = Property(key, value, 'string')
+ >>> b = Property(key, value, 'binary')
+ """
+ def __init__(self, key, value, kind=None):
+ self.key = key
+ self._value = value
+ self.kind = kind
+ if kind not in propertyTypes:
+ warnings.warn("Unknown property type: %s on key %s" % \
+ (kind, key), RuntimeWarning)
+ else: self._impl = propertyTypes[kind]
+
+ @classmethod
+ def fromstring(cls, key, value=''):
+ kind = 'string'
+ if ':' in key:
+ key, kind = key.split(':', 1)
+ # now resolve the kind to a property class
+ return cls(key, value, kind)
+
-class Content(object):
def __repr__(self):
- return "<Content id:%s>" % (self.id, )
+ return "<%s(%s) %s:%r>" % (self.__class__.__name__,
+ self.kind,
+ self.key, self.value)
- def get_property(self, key, default=_marker):
- # mapped to property keys
- session = get_session(self.backingstore)
- query = session.query(Property)
- p = query.get_by(content_id=self.id, key=key)
- if not p:
- if default is _marker: raise AttributeError(key)
- return default
- return p.value
-
- def get_properties(self, **kwargs):
- session = get_session(self.backingstore)
- query = session.query(Property)
- return query.select_by(content_id=self.id, **kwargs)
-
-
- # Backingstore dependent bindings
- def get_file(self):
- if not hasattr(self, "_file") or self._file.closed is True:
- self.backingstore.get(self.id)
- return self._file
+ def get_value(self): return self._impl.get(self._value)
+ def set_value(self, value): self._value = self._impl.set(value)
+ value = property(get_value, set_value)
+
+ def __str__(self): return str(self.value)
+
+class Model(object):
+ """Object containing the field/property model used by the
+ system"""
- def set_file(self, fileobj):
- self._file = fileobj
- file = property(get_file, set_file)
+ def __init__(self):
+ self.fields = {}
+ self.fieldnames = []
+
+ def copy(self):
+ m = Model()
+ m.fields = self.fields.copy()
+ m.fieldnames = self.fieldnames[:]
+ return m
+
+ def addField(self, key, kind, overrides=None):
+ """ Add a field to the model.
+ key -- field name
+ kind -- type by name (registered with registerPropertyType)
+ kwargs -- overrides and additional values to the default
+ arguments supplied by kind
+ """
+ if key in self.fields:
+ raise KeyError("""Another source tried to add %s field to the model""" % key)
+
+ impl = propertyByKind(kind)
+ options = impl.defaults.copy()
+ if overrides: options.update(overrides)
+ if impl.xapian_sort_type:
+ if 'type' not in options:
+ options['type'] = impl.xapian_sort_type
+
+ self.fields[key] = (key, kind, options)
+ self.fieldnames.append(key)
+ return self
+
+ def addFields(self, *args):
+ """ List of arguments to addField """
+ for arg in args: self.addField(*arg)
+ return self
+
+ def apply(self, indexmanager):
+ addField = indexmanager.addField
+ for fn in self.fieldnames:
+ args = self.fields[fn]
+ addField(args[0], **args[2])
+
+class Content(object):
+ """A light weight proxy around Xapian Documents from secore.
+ This provides additional methods which are used in the
+ backingstore to assist in storage
+ """
+ __slots__ = ('_doc', '_backingstore', '_file')
+
+ def __init__(self, xapdoc, backingstore=None):
+ self._doc = xapdoc
+ self._backingstore = backingstore
+ self._file = None
+
+ def __repr__(self):
+ return "<%s %s>" %(self.__class__.__name__,
+ self.properties)
+
+ def get_property(self, key, default=_marker):
+ result = self._doc.data.get(key, default)
+ if result is _marker: raise KeyError(key)
+ if isinstance(result, list) and len(result) == 1:
+ return result[0]
+ return result
@property
- def filename(self): return self.file.name
+ def properties(self):
+ d = {}
+ for k, v in self.data.iteritems():
+ if isinstance(v, list) and len(v) == 1:
+ v = v[0]
+ d[k] = v
+ return d
+
def suggestName(self):
# we look for certain known property names
@@ -89,8 +171,7 @@ class Content(object):
f, e = os.path.splitext(filename)
if e: return filename, None
if ext: return "%s.%s" % (filename, ext), None
- elif ext:
- return None, ext
+ elif ext: return None, ext
else:
# try to get an extension from the mimetype if available
mt = self.get_property('mime_type', None)
@@ -99,279 +180,110 @@ class Content(object):
if ext: return None, ext
return None, None
- def get_data(self):
- f = self.file
- t = f.tell()
- data = f.read()
- f.seek(t)
- return data
-
- def set_data(self, filelike):
- self.backingstore.set(self.id, filelike)
-
- data = property(get_data, set_data)
-
-
-class BackingStoreContentMapping(MapperExtension):
- """This mapper extension populates Content objects with the
- binding to the backing store the files are kept on, this allow the
- file-like methods to work as expected on content
- """
- def __init__(self, backingstore):
- MapperExtension.__init__(self)
- self.backingstore = backingstore
-
- def populate_instance(self, mapper, selectcontext, row, instance, identitykey, isnew):
- """called right before the mapper, after creating an instance
- from a row, passes the row to its MapperProperty objects which
- are responsible for populating the object's attributes. If
- this method returns EXT_PASS, it is assumed that the mapper
- should do the appending, else if this method returns any other
- value or None, it is assumed that the append was handled by
- this method.
-
- """
- instance.backingstore = self.backingstore
- # allow normal population to happen
- return EXT_PASS
-
-
-class Property(object):
- """A typed key value pair associated with a content object.
- This is the objects metadata. The value side of the kv pair is
- typically encoded as a UTF-8 String. There are however cases where
- richer metadata is required by the application using the
- datastore.
- In these cases the type field is overridden to encode a reference
- to another object that must be used to satisfy this value. An
- example of this would be storing a PNG thumbnail as the a
- value. In a case such as that the value should be set to a path or
- key used to find the image on stable storage or in a database and
- the type field will be used to demarshall it through this object.
- """
- def __init__(self, key, value, type='string'):
- self.key = key
- self.value = value
- self.type = type
-
- def __repr__(self):
- return "<%s %s:%r>" % (self.__class__.__name__,
- self.key, self.value)
- def marshall(self):
- """Return the value marshalled as a string"""
- return str(self.value)
-
-class TextProperty(Property):
- """A text property is one that will also get full automatic text
- indexing when available. This is used for fields like title where
- searching in the text is more important than doing a direct match
- """
- def __init__(self, key, value, type='text'):
- Property.__init__(self, key, value, type)
-
- def get_value(self): return self._value
- def set_value(self, value): self._value = value
- value = property(get_value, set_value)
-
+ def get_file(self):
+ if not hasattr(self, "_file") or self._file.closed is True:
+ self.backingstore.get(self.id)
+ return self._file
-class DateProperty(Property):
- format = "%Y-%m-%dT%H:%M:%S"
-
- def __init__(self, key, value, type="date"):
- self._value = None
- Property.__init__(self, key, value, type)
-
- def get_value(self):
- # parse the value back into a datetime
- # XXX: strptime on datetime is a 2.5 thing :(
- # XXX: we lose timezone in this conversion currently
- if not self._value: return None
- ti = time.strptime(self._value, self.format)
- dt = datetime.datetime(*(ti[:-2]))
- dt = dt.replace(microsecond=0)
- return dt
-
- def set_value(self, value):
- if isinstance(value, basestring):
- # XXX: there is an issue with microseconds not getting parsed
- ti = time.strptime(value, self.format)
- value = datetime.datetime(*(ti[:-2]))
- value = value.replace(microsecond=0)
-
- self._value = value.isoformat()
+ def set_file(self, fileobj):
+ self._file = fileobj
+ file = property(get_file, set_file)
- value = property(get_value, set_value)
+ @property
+ def filename(self): return self.file.name
- def marshall(self): return self.value.isoformat()
-
+ @property
+ def contents(self): return self.file.read()
-class NumberProperty(Property):
- def __init__(self, key, value, type="number"):
- Property.__init__(self, key, value, type)
-
- def get_value(self): return float(self._value)
- def set_value(self, value): self._value = value
- value = property(get_value, set_value)
+ @property
+ def backingstore(self): return self._backingstore
+ @property
+ def id(self): return self._doc.id
-class BinaryProperty(Property):
- # base64 encode binary data
- def __init__(self, key, value, type="binary"):
- Property.__init__(self, key, value, type)
-
- def get_value(self): return self._value.decode('base64')
- def set_value(self, value): self._value = value.encode('base64')
- value = property(get_value, set_value)
-
-
-class Model(object):
- """ Manages the global state of the metadata model index. This is
- intended to only be consumed by an olpc.datastore.query.QueryManager
- instance for the management of its metadata.
-
- >>> m = Model()
- >>> m.prepare(querymanager)
-
- >>> m.content
- ... # Content Table
-
- >>> m['content']
- ... # content Mapper
-
- For details see the sqlalchemy documentation
-
- """
-
- def __init__(self):
- self.tables = {}
- self.mappers = {}
+ @property
+ def data(self): return self._doc.data
- def __getattr__(self, key): return self.tables[key]
- def __getitem__(self, key): return self.mappers[key]
-
-
- def prepare(self, querymanager):
- self.querymanager = querymanager
-
- # a single session manages the exclusive access we keep to the
- # db.
- global context
- self.session = create_session(bind_to=self.querymanager.db)
- context[self.querymanager.backingstore] = self.session
-
- # content object
- content = Table('content',
- self.querymanager.metadata,
- Column('id', String, primary_key=True, nullable=False),
- Column('activity_id', Integer),
- Column('checksum', String,),
- UniqueConstraint('id', name='content_key')
- )
- Index('content_activity_id_idx', content.c.activity_id)
-
- # the properties of content objects
- properties = Table('properties',
- self.querymanager.metadata,
- Column('id', Integer, Sequence('property_id_seq'), primary_key=True),
- Column('content_id', Integer, ForeignKey('content.id')),
- Column('key', Unicode, ),
- Column('value', Unicode, ),
- Column('type', Unicode, ),
- # unique key to content mapping
- UniqueConstraint('content_id', 'key',
- name='property_content_key')
- )
-
- Index('property_key_idx', properties.c.key)
- Index('property_type_idx', properties.c.type)
-
- # storage
- storage = Table('storage',
- self.querymanager.metadata,
- Column('id', String, primary_key=True),
- Column('description', String, ),
- Column('uri', String, )
- )
-
- # storage -> * content
- # XXX: this could be a purely runtime in-memory construct
- # removing the storage table as well. Would depend in part on
- # the frequency of the garbage collection runs and the
- # frequency of connection to stable storage
- storage_content = Table('storage_content',
- self.querymanager.metadata,
- Column('storage_id', Integer, ForeignKey('storage.id')),
- Column('content_id', Integer, ForeignKey('content.id')),
- )
- Index('idx_storage_content_content_id', storage_content.c.content_id)
-
- # Object Mapping
- # the query manager provides a mapping extension for
- # Content <-> BackingStore binding
-
- # XXX gross and not what we want, we can only define mappers
- # once but we may have more than one datastore.
- # this can impact all sqla in the runtime though
- clear_mappers()
+## class Buddy(object):
+## """A co-author on content. Information is collected and managed
+## here"""
+## pass
+
+
+
+def noop(value): return value
+
+import re
+base64hack = re.compile("(\S{212})")
+def base64enc(value): return ' '.join(base64hack.split(value.encode('base64')))
+def base64dec(value): return value.replace(' ', '').decode('base64')
+
+dateformat = "%Y-%m-%dT%H:%M:%S"
+def datedec(value, dateformat=dateformat):
+ ti = time.strptime(value, dateformat)
+ dt = datetime.datetime(*(ti[:-2]))
+ dt = dt.replace(microsecond=0)
+ return dt
+
+def dateenc(value, dateformat=dateformat):
+ if isinstance(value, basestring):
+ # XXX: there is an issue with microseconds not getting parsed
+ ti = time.strptime(value, dateformat)
+ value = datetime.datetime(*(ti[:-2]))
+ value = value.replace(microsecond=0)
+ # XXX: drop time for now, this is a xapian issue
+ value = value.date()
+ return value.isoformat()
+
+# type, get, set, xapian sort type [string|float|date], defaults
+# defaults are the default options to addField in IndexManager
+# these can be overridden on model assignment
+registerPropertyType('string', noop, noop, 'string', {'store' : True,
+ 'exact' : True,
+ 'sortable' : True})
+
+registerPropertyType('text', noop, noop, 'string', {'store' : True,
+ 'exact' : False,
+ 'sortable' : False})
+
+registerPropertyType('binary', noop, noop, None, {'store' : True,
+ 'exact' : False,
+ 'sortable' : False})
+
+registerPropertyType('int', str, int, 'float', {'store' : True,
+ 'exact' : True,
+ 'sortable' : True})
+
+registerPropertyType('number', str, float, 'float', {'store' : True,
+ 'exact' : True,
+ 'sortable' : True})
+
+registerPropertyType('date', dateenc, datedec, 'date', {'store' : True,
+ 'exact' : True,
+ 'sortable' : True
+ })
+
+
+
+defaultModel = Model().addFields(
+ ('fulltext', 'text'),
+ # vid is version id
+ ('vid', 'number'),
+ ('checksum', 'string'),
+ ('filename', 'string'),
+ # Title has additional weight
+ ('title', 'text', {'weight' : 2 }),
+ ('url', 'string'),
+ ('mimetype', 'string'),
+ ('author', 'string'),
+ ('language', 'string'),
+ ('ctime', 'date'),
+ ('mtime', 'date'),
+ # this will just be a space delimited list of tags
+ # indexed with the content
+ # I give them high weight as they have user given semantic value.
+ ('tags', 'text', {'weight' :3 } ),
+ )
- content_mapper = mapper(Content, content,
- extension=self.querymanager.content_ext,
- properties = {
- 'properties' : relation(Property,
- cascade="all,delete-orphan",
- backref='content',
- lazy=True),
- },
-
- )
-
- # retain reference to these tables to use for queries
- self.tables['content'] = content
- self.tables['properties'] = properties
- self.tables['storage'] = storage
- self.tables['storage_content'] = storage_content
-
- # and the mappers (though most likely not needed)
- property_mapper = mapper(Property, properties, polymorphic_on=properties.c.type)
- self.mappers['properties'] = property_mapper
- self.mappers['content'] = content_mapper
-
- # default Property types are mapped to classes here
- self.addPropertyType(DateProperty, 'date')
- self.addPropertyType(NumberProperty, 'number')
- self.addPropertyType(TextProperty, 'text')
- self.addPropertyType(BinaryProperty, 'binary')
-
-
-
-
- def addPropertyType(self, PropertyClass, typename,
- map_value=True, **kwargs):
- """Register a new type of Property. PropertyClass should be a
- subclass of Property, typename is the textual
- name of the new Property type.
-
- The flag map_value indicates if Property.value should
- automatically be diverted to _value so that you can more
- easily manage the interfaces 'value' as a Python property
- (descriptor)
-
- Keyword args will be passed to the properties dictionary of
- the sqlalchemy mapper call. See sqlalchemy docs for additional
- details.
- """
- properties = {}
- properties.update(kwargs)
- if map_value is True:
- properties['_value'] = self.properties.c.value
-
- mapper(PropertyClass,
- inherits=self.mappers['properties'],
- polymorphic_identity=typename,
- properties=properties
- )
-
- registerPropertyType(typename, PropertyClass)
-
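
The new model layer replaces the SQLAlchemy mappers with a flat registry: registerPropertyType() ties get/set converters and xapian indexing defaults to a kind name, and Model.addField() binds a field to one of those kinds. A sketch of extending the default model with a hypothetical field (the copy would need to be applied before the IndexManager connects):

    from olpc.datastore.model import Property, defaultModel

    m = defaultModel.copy()
    m.addField('rating', 'int')   # hypothetical numeric field

    # properties can also name their kind inline as "key:kind"
    p = Property.fromstring('rating:int', '5')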
diff --git a/src/olpc/datastore/query.py b/src/olpc/datastore/query.py
deleted file mode 100644
index 2c5dd9f..0000000
--- a/src/olpc/datastore/query.py
+++ /dev/null
@@ -1,642 +0,0 @@
-"""
-olpc.datastore.query
-~~~~~~~~~~~~~~~~~~~~
-manage the metadata index and make it queryable. this in turn will
-depend on olpc.datastore.fulltext which indexes the actual content.
-
-"""
-
-__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>'
-__docformat__ = 'restructuredtext'
-__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
-__license__ = 'The GNU Public License V2+'
-
-
-from datetime import datetime
-from lemur.xapian.sei import DocumentStore, DocumentPiece, SortableValue
-from olpc.datastore.converter import converter
-from olpc.datastore.model import DateProperty, TextProperty
-from olpc.datastore.model import Model, Content, Property, propertyByKind
-from olpc.datastore.utils import create_uid
-
-from sqlalchemy import create_engine, BoundMetaData
-from sqlalchemy import select, intersect, and_
-import atexit
-import logging
-import os, sys
-
-_marker = object()
-
-
-class SugarDomain(object):
- """The underlying property set used for metadata in the sugar
- system"""
- def kind_by_key(self, key):
- """resolves property names to the factory type that supports
- them in the model
- """
- # key may be a two part form directly indicating the property
- # type
- if ':' in key:
- key, kind = key.split(':', 1)
- # now resolve the kind to a property class
- return key, propertyByKind(kind)
-
- return key, {
- 'ctime' : DateProperty,
- 'mtime' : DateProperty,
- 'author' : Property,
- 'title' : TextProperty,
- 'mime_type' : Property,
- 'language' : Property,
- }.get(key, Property)
-
- def propertyFactory(self, key, value='', dict=None):
- key, kind = self.kind_by_key(key)
- p = kind(key, value)
- if dict is not None: dict[key] = p
- return kind
-
- def _automaticProperties(self):
- d = {}
- now = datetime.now()
- self.propertyFactory('mtime', now, dict=d)
- return d
-
- def _defaultProperties(self):
- d = {}
- now = datetime.now()
- self.propertyFactory('ctime', now, dict=d)
- self.propertyFactory('author', dict=d)
- self.propertyFactory('title', dict=d)
- self.propertyFactory('mime_type', dict=d)
- self.propertyFactory('language', dict=d)
-
- d.update(self._automaticProperties())
- return d
-
- def _normalizeProps(self, props, creating, include_defaults):
- # return a dict of {name : property}
- if isinstance(props, dict):
- # convert it into a dict of Property objects
- d = {}
- for k,v in props.iteritems():
- k, kind = self.kind_by_key(k)
- p = kind(k, v)
- d[k] = p
- if creating and include_defaults:
- defaults = self._defaultProperties()
- for k, v in defaults.iteritems():
- if k not in d: d[k] = v
- props = d
- else:
- d = {}
- for p in props:
- d[p.key] = p
- props = d
- return props
-
-
-
-class QueryManager(SugarDomain):
- FULLTEXT_NAME = "fulltext"
-
- def __init__(self, metadata_uri, **options):
- """
- The metadata_uri is a string used to find the database.
-
-
- This will check keywords for:
- 'language' Language is the language code used in the fulltext
- engine. This helps improve stemming and
- so on. In the future additional control
- will be provided.
-
- 'sync_index' which determines if we use an internal
- sync index impl or an out of process one
- via DBus. If the async process is to be
- used it must be properly configured and
- available for DBus to spawn.
-
- 'fulltext_repo' the full filepath to which the fulltext
- index data will be stored
-
- 'use_fulltext' when true indexing will be performed
-
- """
- self.uri = metadata_uri
- self.options = options
-
- self.backingstore = None
- self.content_ext = None
-
-
- def _handle_option(self, options, key, default=_marker):
- value = options.get(key, default)
- if value is _marker: raise KeyError(key)
- setattr(self, key, value)
-
- def _handle_options(self, **kwargs):
- self._handle_option(kwargs, 'fulltext_repo')
- self._handle_option(kwargs, 'use_fulltext', True)
- self._handle_option(kwargs, 'sync_index', True)
- self._handle_option(kwargs, 'language', 'en')
- self.sync_index = self.use_fulltext and self.sync_index
-
- def bind_to(self, backingstore):
- self.backingstore = backingstore
-
- def prepare_index(self):
- self.connect_db()
- self.prepare_db()
- self.connect_model()
-
- def prepare_fulltext(self):
- self.connect_fulltext(self.fulltext_repo, self.language,
- read_only=not self.sync_index)
-
- def prepare(self):
- """This is called by the datastore with its backingstore and
- querymanager. Its assumed that querymanager is None and we are
- the first in this release
- """
- self._handle_options(**self.options)
- self.prepare_index()
- self.prepare_fulltext()
- return True
-
- def stop(self):
- pass
-
- # Primary interface
- def create(self, props, filelike=None, include_defaults=True):
- """Props can either be a dict of k,v pairs or a sequence of
- Property objects.
-
- The advantage of using property objects is that the data can
- by typed. When k/v pairs are used a default string type will
- be chosen.
-
- When include_defaults is True a default set of properties are
- created on behalf of the Content if they were not provided.
-
- These include:
- author : ''
- title : ''
- mime_type : ''
- language : '',
- ctime : '',
- mtime : '',
- """
- s = self.model.session
- c = Content()
- # its important the id be set before other operations
- c.id = create_uid()
- s.save(c)
-
- self._bindProperties(c, props, creating=True, include_defaults=include_defaults)
- s.flush()
- c.backingstore = self.backingstore
-
- if self.sync_index and filelike:
- self.fulltext_index(c.id, filelike,
- mimetype=c.get_property('mime_type'),
- textprops=self.get_textprops(c))
-
- return c
-
- def update(self, content_or_uid, props=None, filelike=None):
- content = self._resolve(content_or_uid)
- content.backingstore = self.backingstore
- if props is not None:
- self._bindProperties(content, props, creating=False)
- self.model.session.flush()
-
- if self.sync_index and filelike:
- self.fulltext_index(content.id, filelike, textprops=self.get_textprops(content))
-
-
- def _bindProperties(self, content, props, creating=False, include_defaults=False):
- """Handle either a dict of properties or a list of property
- objects, binding them to the content instance.
- """
- # for information on include_defaults see create()
- # default properties are only provided when creating is True
- session = self.model.session
-
- props = self._normalizeProps(props, creating,
- include_defaults)
-
- # we should have a dict of property objects
- if creating:
- content.properties.extend(props.values())
- else:
- # if the automatically maintained properties (like mtime)
- # are not set, include them now
- auto = self._automaticProperties()
- auto.update(props)
- props = auto
- # we have to check for the update case
- oldProps = dict([(p.key, p) for p in content.properties])
- for k, p in props.iteritems():
- if k in oldProps:
- oldProps[k].value = p.value
- oldProps[k].type = p.type
- else:
- content.properties.append(p)
-
- def get(self, uid):
- return self.model.session.query(self.model.mappers['content']).get(uid)
-
- def get_properties(self, content_or_uid, keys):
- c = self._resolve(content_or_uid)
- return self.model.session.query(Property).select_by(self.model.property.c.key.in_(keys),
- content_id=c.id)
-
-
- def get_uniquevaluesfor(self, propertyname):
- properties = self.model.tables['properties']
- return [r[0] for r in select([properties.c.value],
- properties.c.key==propertyname,
- distinct=True).execute().fetchall()]
-
-
-
- def delete(self, content_or_uid):
- c = self._resolve(content_or_uid)
- s = self.model.session
- s.delete(c)
- s.flush()
- if self.sync_index:
- self.fulltext_unindex(c.id)
-
-
- def find(self, query=None, **kwargs):
- """
- dates can be search in one of two ways.
- date='YYYY-MM-DD HH:MM:SS'
- date={'start' : 'YYYY-MM-DD HH:MM:SS',
- 'end' : 'YYYY-MM-DD HH:MM:SS'
- }
- where date is either ctime or mtime.
- if start or end is omitted its becomes a simple before/after
- style query. If both are provided its a between query.
-
- providing the key 'fulltext' will include a full text search
- of content matching its parameters. see fulltext_search for
- additional details.
-
-
- If 'limit' is passed it will be the maximum number of results
- to return and 'offset' will be the offset from 0 into the
- result set to return.
-
- """
-
- # XXX: this will have to be expanded, but in its simplest form
- if not self.sync_index: self.index.reopen()
-
- s = self.model.session
- properties = self.model.tables['properties']
- if not query: query = {}
- query.update(kwargs)
- q = s.query(Content)
- # rewrite the query to reference properties
- # XXX: if there is a 'content' key will will have to search
- # the content using the full text index which will result in a
- # list of id's which must be mapped into the query
- # fulltext_threshold is the minimum acceptable relevance score
- limit = query.pop('limit', None)
- offset = query.pop('offset', None)
-
- if offset: q = q.offset(offset)
- if limit: q = q.limit(limit)
-
- if query:
- where = []
- fulltext = query.pop('fulltext', None)
- threshold = query.pop('fulltext_threshold', 60)
-
-
-
- statement = None
- ft_select = None
-
- if query:
- # daterange support
- # XXX: this is sort of a hack because
- # - it relies on Manifest typing in sqlite
- # - value's type is not normalized
- # - we make special exception based on property name
- # if we need special db handling of dates ctime/mtime
- # will become columns of Content and not properties
- ctime = query.pop('ctime', None)
- mtime = query.pop('mtime', None)
- if ctime or mtime:
- self._query_dates(ctime, mtime, where)
- for k,v in query.iteritems():
- if isinstance(v, list):
- v = properties.c.value.in_(*v)
- else:
- v = properties.c.value==v
-
- where.append(select([properties.c.content_id],
- and_( properties.c.key==k,
- v)))
-
- statement = intersect(*where)
- statement.distinct=True
-
- if fulltext and self.use_fulltext:
- # perform the full text search and map the id's into
- # the statement for inclusion
- ft_res = self.fulltext_search(fulltext)
- if ft_res:
- ft_ids = [ft[0] for ft in ft_res if ft[1] >=
- threshold]
-
- if ft_ids:
- ft_select = select([properties.c.content_id],
- properties.c.content_id.in_(*ft_ids))
-
- if ft_select is None:
- # the full text query eliminated the possibility
- # of results by returning nothing under a logical
- # AND condition, bail now
- return ([], 0)
- else:
- if statement is None:
- statement = ft_select
- statement.distinct = True
- else:
- statement = intersect(statement, ft_select)
-
- result = statement.execute()
- r = [q.get(i[0]) for i in result]
- r = (r, len(r))
- else:
- r = (q.select(), q.count())
-
- # XXX: make sure the proper backingstore is mapped
- # this currently forbids the use case of keeping index data
- # for a read-only store.
- for item in r[0]:
- item.backingstore = self.backingstore
-
- return r
-
- # sqla util
- def _resolve(self, content_or_uid):
- if isinstance(content_or_uid, basestring):
- # we need to resolve the object
- content_or_uid = self.model.session.query(Content).get(content_or_uid)
- return content_or_uid
-
- def _query_dates(self, ctime, mtime, selects):
- if ctime: selects.append(self._query_date('ctime', ctime))
- if mtime: selects.append(self._query_date('mtime', mtime))
-
- def _query_date(self, key, date):
- properties = self.model.properties
-
- if isinstance(date, basestring):
- s = select([properties.c.content_id],
- and_( properties.c.key==key,
- properties.c.value==date))
- else:
- # its a dict with start/end
- start = date.get('start')
- end = date.get('end')
- if start and end:
- s = select([properties.c.content_id],
- and_( properties.c.key==key,
- properties.c.value.between(start,
- end)))
- elif start:
- s = select([properties.c.content_id],
- and_( properties.c.key==key,
- properties.c.value >=start))
- else:
- s = select([properties.c.content_id],
- and_( properties.c.key==key,
- properties.c.value < end))
-
- return s
-
-
- def get_textprops(self, uid_or_content):
- # text properties also get full text indexing
- # currently this is still searched with the 'fulltext'
- # parameter of find()
- content = self._resolve(uid_or_content)
- textprops = {}
- for p in content.get_properties(type='text'):
- textprops[p.key] = p.value and p.value or ''
- return textprops
-
-
- # fulltext interface
- def fulltext_index(self, uid, fileobj, mimetype=None, textprops=None):
-        """Index the fileobj relative to uid which should be an
- olpc.datastore.model.Content object's uid. The fileobj can be
- either a pathname or an object implementing the Python file
- ('read') interface.
- """
- pass
-
- def fulltext_unindex(self, content_id):
- pass
-
- def fulltext_search(self, *args, **kwargs):
- return []
-
- # lifecycle
- def connect_db(self):
- """Connect to the underlying database. Called implicitly by
- __init__"""
- pass
-
-
- def prepare_db(self):
- """After connecting to the metadata database take any
- initialization steps needed for the environment.
-
- This is called implicitly by __init__ before the model is
- brought online.
- """
- pass
-
- def connect_model(self, model):
- """Connect the model. Called with the model passed into
- __init__ after the database has been prepared.
- """
- pass
-
- def connect_fulltext(self, repo, language, read_only):
- """Connect the full text index"""
- pass
-
-
-class SQLiteQueryManager(QueryManager):
- """The default implementation of the query manager. This owns the
- model object and the fulltext object
- """
-
- def __init__(self, uri, **kwargs):
- super(SQLiteQueryManager, self).__init__(uri, **kwargs)
- # now re-write the URI to be sqlite specific
- # (we were initialized to a namepattern in the proper
- # directory by the backingstore)
- self.uri= "sqlite:///%s.db" % self.uri
-
- def connect_db(self):
- self.db = create_engine(self.uri)
- self.metadata = BoundMetaData(self.db)
-
- def prepare_db(self):
- # Using the sqlite backend we can tune the performance to
- # limit writes as much as possible
- if self.db.name.startswith('sqlite'):
- connection = self.db.connect()
-            # cut down on per-activity file locking writes
- connection.execute("PRAGMA locking_mode=EXCLUSIVE")
- # don't demand fsync -- if this is too dangerous
-            # we can change it to NORMAL, which is still less write-heavy
- # than the default FULL
- connection.execute("PRAGMA synchronous=OFF")
- # temporary tables and indices are kept in memory
- connection.execute("PRAGMA temp_store=MEMORY")
- # XXX: what is the ideal jffs2 page size
- # connection.execute("PRAGMA page_size 4096")
-
- def connect_model(self, model=None):
- if model is None: model = Model()
- # take the model and connect it to us
- model.prepare(self)
-
- # make sure all the tables and indexes exist
- self.metadata.create_all()
-
- self.model = model
-
-
- def stop(self):
- # clean up
- self.db.dispose()
-
-# Full text support
-def flatten_unicode(value): return value.encode('utf-8')
-
-class XapianBinaryValue(SortableValue):
- def __init__(self, value, field_name="content"):
- SortableValue.__init__(self, value, field_name)
-
-class XapianFulltext(object):
- def connect_fulltext(self, repo, language='en', read_only=True):
- if not os.path.exists(repo) and read_only is True:
- # create the store
- index = DocumentStore(repo, language, read_only=False)
- index.close()
- # and abandon it
- self.index = DocumentStore(repo, language, read_only=read_only)
- self.index.registerFlattener(unicode, flatten_unicode)
- atexit.register(self.index.close)
-
- def fulltext_index(self, uid, fileobj, mimetype=None, textprops=None):
-        """Index the fileobj relative to uid which should be an
- olpc.datastore.model.Content's uid. The fileobj can be either
- a pathname or an object implementing the Python file ('read')
- interface.
- """
- piece = DocumentPiece
- if isinstance(fileobj, basestring):
- # treat it as a pathname
- # use the global converter to try to get text from the
- # file
- fp = converter(fileobj, mimetype=mimetype)
- #piece = XapianBinaryValue
- elif hasattr(fileobj, 'read'):
-            # this is an odd case; we have to assume utf-8 data
- logging.debug("Indexing from readable, not filename")
- fp = fileobj
- else:
- raise ValueError("Not a valid file object")
-
- if fp is None:
- # for whatever reason we were unable to get the content
- # into an indexable form.
- logging.debug("Unable to index %s %s" % (uid, fileobj))
- return False
- return self._ft_index(uid, fp, piece, textprops)
-
- def _ft_index(self, content_id, fp, piece=DocumentPiece, fields=None):
- try:
- doc = [piece(fp.read())]
- if fields:
- # add in properties that need extra fulltext like
- # management
- for key, value in fields.iteritems():
- doc.append(DocumentPiece(value, key))
-
- self.index.addDocument(doc, content_id)
- self.index.flush()
- return True
- except:
- logging.debug("fulltext index exception", exc_info=sys.exc_info())
- return False
-
-
-
- def fulltext_search(self, *args, **kwargs):
- """
-        perform search(search_string) -> [(content_id, relevance), ...]
-
-        search_string is a string defining the search in standard web search
- syntax.
-
- ie: it contains a set of search terms. Each search term may be
- preceded by a "+" sign to indicate that the term is required, or a "-"
-        to indicate that it is required to be absent.
-
- If field_name is not None, it is the prefix of a field, which the
- search will be restricted to.
-
- If field_name is None, the search will search all fields by default,
- but search terms may be preceded by a fieldname followed by a colon to
- restrict part of the search to a given field.
-
- combiner is one of DocumentStore.OP_OR or DocumentStore.OP_AND, and is
- used to indicate the default operator used to combine terms.
-
- partial is a flag, which should be set to True to enable partial search
- matching, for use when doing interactive searches and we're not sure if
- the user has finished typing the search yet.
-
- range_restrictions is a RangeRestrictions object, used to restrict the
- search results.
-
- """
- if len(args) == 1:
- # workaround for api change
- args = (args[0], 0, 10)
-
- res = self.index.performSearch(*args, **kwargs)
- est = max(1, res.estimatedResultCount())
- return res.getResults(0, est)
-
- def fulltext_similar(self, *content_ids):
- return self.index.findSimilar(content_ids)
-
- def fulltext_unindex(self, content_id):
- self.index.deleteDocument(content_id)
-
- def stop(self):
- if self.use_fulltext:
- self.index.close()
-
-
-class DefaultQueryManager(XapianFulltext, SQLiteQueryManager):
-
- def stop(self):
- XapianFulltext.stop(self)
- SQLiteQueryManager.stop(self)
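
The find() docstring above documents the two date-query forms the removed
query manager accepted. A minimal sketch of both, following the deleted
tests/query.txt further down (the /tmp paths are test fixtures, and this
API no longer exists after this commit):

    from olpc.datastore.query import DefaultQueryManager

    qm = DefaultQueryManager("/tmp/_test_index",
                             fulltext_repo='/tmp/_test_fulltext')
    qm.prepare()

    # exact-match form
    results, count = qm.find(ctime="2007-01-01 00:00:00")

    # range form; omitting 'start' or 'end' degrades to a simple
    # before/after query
    results, count = qm.find(mtime={'start': '2007-01-01 00:00:00',
                                    'end': '2007-06-30 00:00:00'})
    qm.stop()
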
diff --git a/src/olpc/datastore/xapianindex.py b/src/olpc/datastore/xapianindex.py
new file mode 100644
index 0000000..ec7206d
--- /dev/null
+++ b/src/olpc/datastore/xapianindex.py
@@ -0,0 +1,390 @@
+"""
+xapianindex
+~~~~~~~~~~~~~~~~~~~~
+maintain indexes on content
+
+"""
+
+__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>'
+__docformat__ = 'restructuredtext'
+__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
+__license__ = 'The GNU Public License V2+'
+
+
+from Queue import Queue, Empty
+import logging
+import re
+
+import threading
+import warnings
+
+import secore
+
+from olpc.datastore import model
+from olpc.datastore.converter import converter
+from olpc.datastore.utils import create_uid
+
+
+# Setup Logger
+logger = logging.getLogger('org.sugar.datastore.xapianindex')
+
+# Indexer Operations
+CREATE = 1
+UPDATE = 2
+DELETE = 3
+
+
+class ContentMappingIter(object):
+ """An iterator over a set of results from a search.
+
+ """
+ def __init__(self, results, backingstore):
+ self._results = results
+ self._backingstore = backingstore
+ self._iter = iter(results)
+
+ def __iter__(self): return self
+
+ def next(self):
+ searchresult = self._iter.next()
+ return model.Content(searchresult, self._backingstore)
+
+
+class IndexManager(object):
+ DEFAULT_DATABASE_NAME = 'index'
+
+ def __init__(self, default_language='en'):
+        # We will maintain two connections to the database;
+        # we trigger automatic flushes to the read_index
+        # after any write operation
+ self.write_index = None
+ self.read_index = None
+ self.queue = Queue(0)
+ self.indexer_running = False
+ self.language = default_language
+
+ self.backingstore = None
+
+ self.fields = set()
+
+ #
+ # Initialization
+ def connect(self, repo, **kwargs):
+ if self.write_index is not None:
+ warnings.warn('''Requested redundant connect to index''',
+ RuntimeWarning)
+
+ self.repo = repo
+ self.write_index = secore.IndexerConnection(repo)
+
+ # configure the database according to the model
+ datamodel = kwargs.get('model', model.defaultModel)
+ datamodel.apply(self)
+
+ # store a reference
+ self.datamodel = datamodel
+
+ self.read_index = secore.SearchConnection(repo)
+
+ self.flush()
+
+ # by default we start the indexer now
+ self.startIndexer()
+
+ def bind_to(self, backingstore):
+        # signal from the backingstore that it's our parent
+ self.backingstore = backingstore
+
+ # flow control
+ def flush(self):
+ """Called after any database mutation"""
+ self.write_index.flush()
+ self.read_index.reopen()
+
+ def stop(self):
+ self.stopIndexer()
+ self.write_index.close()
+ self.read_index.close()
+
+ # Index thread management
+ def startIndexer(self):
+ self.indexer_running = True
+ self.indexer = threading.Thread(target=self.indexThread)
+ self.indexer.setDaemon(True)
+ self.indexer.start()
+
+ def stopIndexer(self, force=False):
+ if not self.indexer_running: return
+ if not force: self.queue.join()
+ self.indexer_running = False
+ self.indexer.join()
+
+ def enque(self, uid, vid, doc, operation, filestuff=None):
+ self.queue.put((uid, vid, doc, operation, filestuff))
+
+ def indexThread(self):
+ # process the queue
+        # XXX: there is currently no way to remove items from the queue;
+        # for example, if a USB stick is added and quickly removed,
+        # the mount should however get a stop() call, which would
+        # request that the indexing finish
+ while self.indexer_running:
+ # include timeout here to ease shutdown of the thread
+ # if this is a non-issue we can simply allow it to block
+ try:
+ uid, vid, doc, operation, filestuff = self.queue.get(timeout=0.5)
+ if operation is DELETE: self.write_index.delete(uid)
+ elif operation in (CREATE, UPDATE):
+ # Here we handle the conversion of binary
+ # documents to plain text for indexing. This is
+ # done in the thread to keep things async and
+ # latency lower.
+ if filestuff:
+ filename, mimetype = filestuff
+ fp = converter(filename, mimetype)
+ if fp:
+ doc.fields.append(secore.Field('fulltext',
+ fp.read()))
+
+ if operation is CREATE: self.write_index.add(doc)
+ elif operation is UPDATE: self.write_index.replace(doc)
+
+ else:
+ logger.warning("Unknown indexer operation ( %s: %s)" % \
+ (uid, operation))
+ continue
+
+                # XXX: this isn't quite true: we haven't called flush
+                # yet, so the document might not be on disk
+ logger.info("Indexed Content %s:%s" % (uid, vid))
+                # but we still tell the queue it's complete
+ self.queue.task_done()
+
+ except Empty:
+ pass
+## except:
+## import traceback
+## traceback.print_exc()
+## try: self.write_index.close()
+## except: pass
+## try:
+## self.write_index = secore.IndexerConnection(self.repo)
+## self.read_index.reopen()
+## except:
+## # Shut down the indexer
+## logger.critical("Indexer Failed, Shutting it down")
+## self.indexer_running = False
+
+
+
+
+
+ def complete_indexing(self):
+ """Intentionally block until the indexing is complete. Used
+ primarily in testing.
+ """
+ self.queue.join()
+ self.flush()
+
+ #
+ # Field management
+ def addField(self, key, store=True, exact=False, sortable=False,
+ type='string', collapse=False,
+ **kwargs):
+ language = kwargs.pop('language', self.language)
+
+ xi = self.write_index.add_field_action
+
+ if store: xi(key, secore.FieldActions.STORE_CONTENT)
+ if exact: xi(key, secore.FieldActions.INDEX_EXACT)
+ else:
+ # weight -- int 1 or more
+ # nopos -- don't include positional information
+ # noprefix -- boolean
+ xi(key, secore.FieldActions.INDEX_FREETEXT, language=language, **kwargs)
+
+ if sortable:
+ xi(key, secore.FieldActions.SORTABLE, type=type)
+ if collapse:
+ xi(key, secore.FieldActions.COLLAPSE)
+
+ # track this to find missing field configurations
+ self.fields.add(key)
+
+ #
+ # Index Functions
+ def _mapProperties(self, props):
+ """data normalization function, maps dicts of key:kind->value
+ to Property objects
+ """
+ d = {}
+ for k,v in props.iteritems():
+ p = model.Property.fromstring(k, v)
+ d[p.key] = p
+ return d
+
+ def index(self, props, filename=None):
+ """Index the content of an object.
+ Props must contain the following:
+ key -> Property()
+ """
+ props = self._mapProperties(props)
+ doc = secore.UnprocessedDocument()
+ add = doc.fields.append
+ fp = None
+ operation = UPDATE
+
+ filestuff = None
+ if filename:
+            # enqueue async file processing
+            # XXX: to make sure the file is kept around we could keep
+            # an open fp?
+ mimetype = props.get("mimetype")
+ mimetype = mimetype and mimetype.value or 'text/plain'
+ filestuff = (filename, mimetype)
+
+ #
+ # Version handling
+ #
+        # we implicitly create new versions of documents; the version
+        # id should have been set by the higher level system
+ uid = props.pop('uid', None)
+ vid = props.pop('vid', None)
+
+ if uid: uid = uid.value
+ else:
+ uid = create_uid()
+ operation = CREATE
+
+ if vid: vid = str(float(vid.value) + 1.0)
+ else: vid = "1.0"
+
+ doc.id = uid
+ add(secore.Field('vid', vid))
+
+ #
+ # Property indexing
+ for k, prop in props.iteritems():
+ value = prop.value
+
+ if k not in self.fields:
+ warnings.warn("""Missing field configuration for %s""" % k,
+ RuntimeWarning)
+ continue
+
+ add(secore.Field(k, value))
+
+ # queue the document for processing
+ self.enque(uid, vid, doc, operation, filestuff)
+
+ return uid
+
+ def get(self, uid):
+ doc = self.read_index.get_document(uid)
+ if not doc: raise KeyError(uid)
+ return model.Content(doc, self.backingstore)
+
+ def delete(self, uid):
+ # does this need queuing?
+ # the higher level abstractions have to handle interaction
+ # with versioning policy and so on
+ self.enque(uid, None, None, DELETE)
+
+ #
+ # Search
+ def search(self, query, start_index=0, end_index=50):
+ """search the xapian store.
+        query is a string defining the search in standard web search syntax.
+
+ ie: it contains a set of search terms. Each search term may be
+ preceded by a "+" sign to indicate that the term is required, or a "-"
+        to indicate that it is required to be absent.
+ """
+ ri = self.read_index
+ if not query:
+ q = self.read_index.query_all()
+ elif isinstance(query, dict):
+ queries = []
+ # each term becomes part of the query join
+ for k, v in query.iteritems():
+ queries.append(ri.query_field(k, v))
+ q = ri.query_composite(ri.OP_AND, queries)
+ else:
+ q = self.parse_query(query)
+
+ results = ri.search(q, start_index, end_index)
+ count = results.matches_estimated
+
+ # map the result set to model.Content items
+ return ContentMappingIter(results, self.backingstore), count
+
+
+ def get_uniquevaluesfor(self, property):
+ # XXX: this is very sketchy code
+ # try to get the searchconnection to support this directly
+ # this should only apply to EXACT fields
+ r = set()
+ prefix = self.read_index._field_mappings.get_prefix(property)
+ plen = len(prefix)
+ termiter = self.read_index._index.allterms(prefix)
+ for t in termiter:
+ term = t.term
+ if len(term) > plen:
+ term = term[plen:]
+ if term.startswith(':'): term = term[1:]
+ r.add(term)
+
+        # r holds the textual representation of the field's value set
+ # if the type of field or property needs conversion to a
+ # different python type this has to happen now
+ descriptor = self.datamodel.fields.get(property)
+ if descriptor:
+ kind = descriptor[1]
+ impl = model.propertyByKind(kind)
+ r = set([impl.set(i) for i in r])
+
+ return r
+
+ def parse_query(self, query):
+ # accept standard web query like syntax
+ # 'this' -- match this
+ # 'this that' -- match this and that in document
+        # '"this that"' match the exact phrase 'this that'
+ # 'title:foo' match a document whose title contains 'foo'
+ # 'title:"A tale of two datastores"' exact title match
+ # '-this that' match that w/o this
+ ri = self.read_index
+ start = 0
+ end = len(query)
+ nextword = re.compile("(\S+)")
+ endquote = re.compile('(")')
+ queries = []
+ while start < end:
+ m = nextword.match(query, start)
+ if not m: break
+ orig = start
+ field = None
+ start = m.end() + 1
+ word = m.group(1)
+ if ':' in word:
+            # see if it's a field match
+ fieldname, w = word.split(':', 1)
+ if fieldname in self.fields:
+ field = fieldname
+
+ word = w
+
+ if word.startswith('"'):
+ qm = endquote.search(query, start)
+ if qm:
+ #XXX: strip quotes or not here
+ #word = query[orig+1:qm.end(1)-1]
+ word = query[orig:qm.end(1)]
+ start = qm.end(1) + 1
+
+ if field:
+ queries.append(ri.query_field(field, word))
+ else:
+ queries.append(ri.query_parse(word))
+ q = ri.query_composite(ri.OP_AND, queries)
+ return q
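
The IndexManager above performs all index mutations on a daemon thread,
so readers must wait for the queue to drain before new results become
visible. A minimal usage sketch, assuming the fixtures exercised by
tests/xapianindex.txt below:

    from olpc.datastore.xapianindex import IndexManager

    im = IndexManager()
    im.connect('/tmp/xi')    # opens the write and read connections

    # queue a document; the file is converted to text on the indexer thread
    uid = im.index({'title': 'PDF Document',
                    'mimetype': 'application/pdf'}, 'test.pdf')

    im.complete_indexing()   # block until the queue drains, then flush
    results, count = im.search('title:PDF')
    im.stop()
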
diff --git a/tests/Makefile b/tests/Makefile
index 7961b02..c2581cb 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -2,10 +2,9 @@
# it's not an option to configure
PYTHON=python
-all: test
+all: clean test
test:
- @rm -rf fulltext
@${PYTHON} runalltests.py
valgrind:
@@ -17,6 +16,7 @@ profile:
@${PYTHON} ./profilealltests.py
clean:
+ @${PYTHON} ./cleaner.py
@find . -name "*.pyc" -exec rm {} \;
@find . -name "*~" -exec rm {} \;
@find . -name "hotspot*" -exec rm {} \;
diff --git a/tests/cleaner.py b/tests/cleaner.py
new file mode 100755
index 0000000..cfa15bf
--- /dev/null
+++ b/tests/cleaner.py
@@ -0,0 +1,40 @@
+#!/usr/bin/python
+import os
+import re
+from ore.main import Application
+
+filepattern = re.compile("(\w{8})\-(\w{4})\-(\w{4})\-(\w{4})\-(\w{12})")
+tmppattern = re.compile("tmp\S{6}")
+onepattern = re.compile("one.*\.txt")
+
+staticdirs = re.compile('test_ds|store\d')
+
+filepatterns = [filepattern, tmppattern, onepattern]
+dirpatterns = [staticdirs]
+
+class Cleaner(Application):
+ def manage_options(self):
+ self.parser.add_option("--base", dest="base_dir",
+ action="store", default='/tmp',
+ help="""Where to clean (/tmp)""")
+
+ def main(self):
+ """clean up files left from testing in /tmp"""
+ # this is done using patterned names
+ for root, dirs, files in os.walk(self.options.base_dir):
+ for filename in files:
+ for pat in filepatterns:
+ if pat.match(filename):
+ fn = os.path.join(root, filename)
+ os.remove(fn)
+ break
+ for dirname in dirs:
+ for pat in dirpatterns:
+ if pat.match(dirname):
+ dn = os.path.join(root, dirname)
+ os.system('rm -rf %s' % dn)
+
+if __name__ == "__main__":
+ Cleaner("cleaner")()
+
+
diff --git a/tests/milestone_1.txt b/tests/milestone_1.txt
index bde3720..48d09bc 100644
--- a/tests/milestone_1.txt
+++ b/tests/milestone_1.txt
@@ -12,6 +12,10 @@ datastore.
First, create and connect the store.
+>>> from testutils import waitforindex
+>>> import os
+>>> assert os.system('rm -rf /tmp/test_ds') == 0
+
>>> from olpc.datastore import DataStore
>>> from olpc.datastore import backingstore
@@ -35,11 +39,13 @@ Note that we retain no reference to the created documents.
Now we should be able to test the first requirement.
* Get the unique ids of all the objects in the store.
+>>> waitforindex(ds)
+
>>> results, count = ds.find()
A find command without any parameters will return everything in the store.
-* Get an object from the store given his uid.
+* Get an object from the store given its uid.
Here we manually cycle through the results looking for the title we
want.
@@ -51,30 +57,24 @@ want.
* Get the object metadata.
>>> c1.properties
-[...]
+{...}
* Get the object file.
>>> c1.filename
'/tmp/...'
->>> c1.data
+>>> c1.contents
'this is the first document'
>>> c1.file
<open file ...>
-Or if you prefer access through the datastore (which is how DBus would
-use it)
-
->>> fn = ds.get_filename(first_uid)
->>> ds.get_data(first_uid)
-'this is the first document'
-
Now we can modify that file and then
* Push the changes made to the file back to the store.
* Update the metadata of an object.
+>>> fn = c1.filename
>>> fp = open(fn, 'a')
>>> print >>fp, "more content"
>>> fp.close()
@@ -87,6 +87,4 @@ We can also remove the file from the repository.
This is the basis of milestone 1.
>>> ds.stop()
->>> del ds
-
-
+>>> assert os.system('rm -rf /tmp/test_ds') == 0
diff --git a/tests/milestone_2.txt b/tests/milestone_2.txt
index 516d497..73fd43a 100644
--- a/tests/milestone_2.txt
+++ b/tests/milestone_2.txt
@@ -7,17 +7,22 @@ First clean up from any other tests.
>>> assert os.system('rm -rf /tmp/test_ds/') == 0
>>> from olpc.datastore import DataStore
->>> from olpc.datastore import backingstore
+>>> from olpc.datastore import backingstore, model
>>> ds = DataStore()
>>> ds.registerBackend(backingstore.FileBackingStore)
->>> assert ds.mount("/tmp/test_ds")
+>>> dm = model.defaultModel.copy().addField('year', 'int').addField('month', 'string')
->>> a = ds.create(dict(title="Content A", author="Bob", year=1999, month="Jan"), '')
->>> b = ds.create(dict(title="Content B", author="Alice", year=2000, month="Jan"), '')
+>>> assert ds.mount("/tmp/test_ds", {'indexmanager.model' : dm})
+
+>>> a = ds.create(dict(title="Content A", author="Bob", year="1999", month="Jan"), '')
+>>> b = ds.create(dict(title="Content B", author="Alice", year="2000", month="Jan"), '')
Find should return both
>>> def find2uids(results): return [i['uid'] for i in results[0]]
->>> assert set(find2uids(ds.find({}))) == set([a,b])
+
+>>> ds.complete_indexing()
+
+>>> assert set(find2uids(ds.find())) == set([a,b])
But what if we want the results ordered?
@@ -35,3 +40,4 @@ and if we want to reverse order it?
>>> ds.stop()
>>> del ds
+>>> assert os.system('rm -rf /tmp/test_ds/') == 0
diff --git a/tests/mountpoints.txt b/tests/mountpoints.txt
index 9a821b5..eebad9b 100644
--- a/tests/mountpoints.txt
+++ b/tests/mountpoints.txt
@@ -20,7 +20,7 @@ Here we create a datastore, and mount a backingstore on tmp. By
default this will create a new directory in /tmp which will then be
used for storage.
->>> ds = DataStore(sync_index=True)
+>>> ds = DataStore()
>>> ds.registerBackend(backingstore.FileBackingStore)
>>> mp1 = ds.mount("/tmp/store1", dict(title="Primary Storage"))
@@ -36,11 +36,12 @@ can be used to control the storage target or to filter results.
Now let's create some content
>>> u1 = ds.create(dict(title="Document 1", filename="one.txt"), tmpData("""document one"""))
->>> u2 = ds.create(dict(title="Document 2", mime_type="text/plain"), tmpData("""document two"""))
+>>> u2 = ds.create(dict(title="Document 2", mimetype="text/plain"), tmpData("""document two"""))
We can now, if we wish, verify which mount point this content came
from.
+>>> ds.complete_indexing()
>>> c1 = ds.get(u1)
>>> assert c1.backingstore.id == mountpoint
@@ -61,6 +62,8 @@ Now lets add another mount point.
Now let's create a new content item.
>>> u3 = ds.create(dict(title="Document 3", mountpoint=mp2), tmpData("""document three"""))
+>>> ds.complete_indexing()
+
We explicitly passed a mount point here. Let's examine the properties of
the object and verify this.
>>> c3 = ds.find(dict(title="Document 3"))[0][0]
@@ -102,6 +105,8 @@ Register the filesystem type
If that worked it should have imported content on load().
+>>> ds.complete_indexing()
+
>>> result, count = ds.find(dict(fulltext="four"))
>>> assert count == 1
>>> assert result[0]['mountpoint'] == mp3
@@ -112,7 +117,11 @@ as DBus data.
>>> ds.unmount(mp3)
->>> mp3 = ds.mount("inplace:/tmp/store3", dict(title=dbus.String("Fake USB again")))
+>>> mp3 = ds.mount("inplace:/tmp/store3", dict(title=dbus.String("Fake USB again"),
+... sync_mount=True))
+
+>>> ds.complete_indexing()
+
>>> result, count = ds.find(dict(fulltext="four"))
>>> assert count == 1
diff --git a/tests/properties.txt b/tests/properties.txt
index 689414f..fe34782 100644
--- a/tests/properties.txt
+++ b/tests/properties.txt
@@ -8,16 +8,23 @@ properties to content and managing them.
>>> from olpc.datastore import DataStore
->>> from olpc.datastore import backingstore
+>>> from olpc.datastore import backingstore, model
>>> from testutils import tmpData
>>> import dbus
Set up two mount points.
->>> ds = DataStore(sync_index=True)
+>>> ds = DataStore()
>>> ds.registerBackend(backingstore.FileBackingStore)
->>> mp1 = ds.mount("/tmp/store1", dict(title="Primary Storage"))
->>> mp2 = ds.mount("/tmp/store2", dict(title="Secondary Storage"))
+
+Extend the model to retain a 'year' property used below.
+
+>>> dm = model.defaultModel.copy().addField('year', "int")
+
+Mount a couple of stores.
+
+>>> mp1 = ds.mount("/tmp/store1", {'title' : "Primary Storage", 'indexmanager.model' : dm})
+>>> mp2 = ds.mount("/tmp/store2", {'title' : "Secondary Storage", 'indexmanager.model' : dm})
Create some content on each.
@@ -28,15 +35,14 @@ Create some content on each.
>>> u4 = ds.create({'title' : "Gamma doc", 'author' : "HAL", 'year:number' : 2001, 'mountpoint' : mp2}, tmpData("""Document 4"""))
Now we should be able to discover things about the system properties.
+>>> ds.complete_indexing()
Here we test that we can extract the unique values for certain properties.
>>> assert set(ds.get_uniquevaluesfor('author')) == set(['Ben', 'HAL'])
-Here we try to gather the values for the property year. We'd expect
-these values to come back as numbers, however in the current
-implementation they are stored as unicode values.
+Here we try to gather the values for the property year.
->>> assert set(ds.get_uniquevaluesfor('year')) == set([u'2000', u'2001'])
+>>> assert set(ds.get_uniquevaluesfor('year')) == set([2000, 2001])
diff --git a/tests/query.txt b/tests/query.txt
deleted file mode 100644
index 2c58851..0000000
--- a/tests/query.txt
+++ /dev/null
@@ -1,277 +0,0 @@
-This document outlines the basic usage of the olpc.datastore.query and
-olpc.datastore.model modules. Note that these are not used independently
-of the olpc.datastore.backend, which in turn is only accessed through the
-olpc.datastore module. This is intended only to document the innards
-of those modules.
-
->>> import os
->>> assert os.system('rm -rf /tmp/_test_index') == 0
->>> assert os.system('rm -rf /tmp/_test_fulltext') == 0
-
-
-First lets create a query manager
-
->>> from olpc.datastore.query import DefaultQueryManager
->>> from olpc.datastore.model import Property
->>> qm = DefaultQueryManager("/tmp/_test_index", fulltext_repo='/tmp/_test_fulltext')
->>> qm.prepare()
-True
-
-That will create the database which will be used in this
-documentation. The call to prepare() is normally invoked by the higher
-level datastore for you.
-
-Because this is a new database there should be nothing in it. We can
-verify this with the call to the find() method.
-
->>> qm.find()
-([], 0)
-
-
-The simplest way to add an entry to the datastore is by passing a
-dictionary of properties to the create method.
-
->>> a = qm.create(dict(title="New Content"))
-
-Find will now return this object.
-
->>> qm.find()
-([<Content id:...>], 1)
-
-We can examine the Properties of this object.
-
->>> a.properties
-[... <TextProperty title:'New Content'>, ...]
-
-This returned a list of all properties on the Content object in which
-case we can find the property by enumeration. The other option is
-using the get_properties call on Content
-
->>> a.get_properties(key='title')
-[<TextProperty title:'New Content'>]
-
-Using the query manager API we are able to update the
-properties. Using this form automatically synchronizes with the
-database and the property is immediately available. To demonstrate
-that this works let's attach another property.
->>> qm.update(a, dict(author='Benjamin'))
-
-A request for title still returns only the title property.
->>> a.get_properties(key='title')
-[<TextProperty title:'New Content'>]
-
-And a request for author works as expected.
->>> a.get_properties(key='author')
-[<Property author:'Benjamin'>]
-
->>> qm.update(a, dict(foo='bar'))
->>> set([p.key for p in a.properties]) == set(['title', 'mtime', 'ctime', 'language', 'mime_type', 'author', 'foo'])
-True
-
-We could have also passed an id for the content object rather than the
-object itself. A list of properties would have been acceptable in
-place of a dictionary. One thing that is shown here is that a number
-of default properties were added when the Content object was
-created. This is done by default and can be controlled by the
-include_defaults flag to the create() method.
-
-Some of the default Property objects have values which are not of the
-'string' type. 'ctime' and 'mtime' are examples of these
-
->>> a.get_property('ctime')
-datetime.datetime(...)
-
-We can see that ctime has been mapped to a standard Python
-datetime.datetime instance. olpc.datastore.model includes support for
-'number' and 'datetime' Property types by default. To add support for
-new property types see the oplc.datastore.model.Model.addPropertyType
-method.
-
-Here we want to show that certain types of Properties map to
-specialized implementations automatically based on their type. 'ctime'
-is a DateTime Property and we can verify that it is returned properly
-from the mapping layer with the following.
->>> ctimeProp = a.get_properties(key='ctime')[0]
->>> ctimeProp.type == "date"
-True
->>> type(ctimeProp)
-<class 'olpc.datastore.model.DateProperty'>
-
-Special support is needed to make dates easily addressable within the
-datastore. The properties 'ctime', creation time, and 'mtime',
-modification time, are supported. To query on these properties two
-methods are available.
-
->>> qm.find(ctime="2007-01-01 00:00")
-([], 0)
-
-Which matches nothing. And the other form is to pass a dict with
-'start' and 'end' range boundaries.
->>> import datetime
->>> now = datetime.datetime.now().isoformat()
-
->>> qm.find(ctime=dict(end=now))
-([<Content id:...>], 1)
-
-
-Property keys are unique per Content item. This means that adding a
-Property with an existing key should update the value rather than add
-another Property with the same key.
-
->>> qm.update(a.id, [Property('another', 'property')])
->>> a.get_property('another')
-'property'
-
->>> l1 = len(a.properties)
->>> qm.update(a, dict(another="test"))
->>> a.get_property('another')
-'test'
->>> len(a.properties) == l1
-True
-
-Both forms of passing properties work.
-
->>> l1 = len(a.properties)
->>> qm.update(a, [Property('another', 'value')])
->>> a.get_property('another')
-'value'
->>> len(a.properties) == l1
-True
-
-
-We can also navigate from a Property object to the content to which it
-refers. This is available through the 'content' attribute of
-properties. Only properties bound to content and synchronized with the
-database have this property.
-
->>> p = a.get_properties(key='author')[0]
->>> p.content
-<Content id:...>
-
-Let's create additional content.
-
->>> b = qm.create(dict(title="My Picture", author="Sarah", mime_type="image/png"))
->>> c = qm.create(dict(title="My Song", author="Sarah", mime_type="audio/mp3"))
-
-At this point the find() method should be able to provide us with more
-interesting results. Note that find() has multiple
-responsibilities. First it must search the Properties of objects,
-secondly it provides access to the full-text index associated with
-content in the datastore.
-
-For now we are only interested in the properties.
-
->>> qm.find(mime_type="image/png") == ([b], 1)
-True
-
->>> qm.find(author="Benjamin") == ([a],1)
-True
-
->>> qm.find(author="Sarah") == ([b, c],2)
-True
-
->>> qm.find(author="Sarah", mime_type="audio/mp3") == ([c], 1)
-True
-
-Passing the special key 'fulltext' to find will pass an expression
-to the full text index engine. The query manager maintains a full-text
-index in parallel to the normal metadata storage. The full text index
-is updated on every create, update and delete call to the query
-manager provided the mime_type Property of the Content is one
-understood by the index.
-
->>> from StringIO import StringIO
->>> qm.update(a, {}, StringIO("this is my content, hear it roar"))
->>> qm.find(fulltext="roar") == ([a], 1)
-True
-
-Combining this with properties also works.
->>> qm.find(fulltext="roar", author="Benjamin") == ([a], 1)
-True
-
-And we can verify the negative as well.
->>> qm.find(fulltext="roar", author="Sarah")
-([], 0)
-
-Calls to update() and create() both take an optional file argument
-which will update the fulltext indexed content with the new value of
-file.
-
->>> qm.update(a, filelike=StringIO("different text"))
-
-The new content will be found
->>> qm.find(fulltext="different", author="Benjamin") == ([a], 1)
-True
-
-And the old content is not.
->>> qm.find(fulltext="roar", author="Benjamin")
-([], 0)
-
-
-Passing a filename for file works as well. Files can be in a variety
-of binary formats, including PDF.
->>> qm.update(a, filelike="test.doc")
->>> qm.find(fulltext="roar", author="Benjamin")
-([], 0)
->>> qm.find(fulltext="amazed", author="Benjamin") == ([a], 1)
-True
-
-We have converters for DOC, PDF and ODT by default
-
->>> qm.update(a, filelike="test.pdf")
->>> qm.find(fulltext="peek", author="Benjamin") == ([a], 1)
-True
-
-
->>> qm.update(a, filelike="test.odt")
->>> qm.find(fulltext="amazed", author="Benjamin") == ([a], 1)
-True
-
->>> qm.update(a, dict(title="titled indeed"), filelike="test.doc")
->>> qm.find(fulltext="amazed", author="Benjamin") == ([a], 1)
-True
-
-For the last example we can see that we also updated the title. Here
-we show that because title is a 'text' property, rather than a simple
-string, its contents will be available to the text indexing engine as
-well.
-
-Searching for a direct match on the property works.
->>> qm.find(title="titled indeed") == ([a], 1)
-True
-
-Doing a search for text internal to the title doesn't, however.
-
->>> qm.find(title="indeed") == ([a], 1)
-False
-
-Searching for it in the fulltext index does return a result.
->>> qm.find(fulltext="indeed") == ([a], 1)
-True
-
-Searching for only title in fulltext index does return a result as well.
->>> qm.find(fulltext="title:indeed") == ([a], 1)
-True
-
-
-Here we show off the get_uniquevaluesfor call examining all the values
-used in the 'author' field.
-
->>> assert set(qm.get_uniquevaluesfor('author')) == set(['Benjamin', 'Sarah'])
-
-Now that we can see a set of possible values it might be nice to
-select any content with properties from a known set. For example
-
->>> r, c = qm.find(author=['Benjamin', 'Sarah'])
->>> assert c == 3
-
-By putting the requested value in a list we can ask that the value be
-'IN' this collection. All participating values are included in this
-way.
-
-
-Now for politeness we shut everything down
->>> qm.stop()
->>> import shutil, os
->>> shutil.rmtree('/tmp/_test_fulltext')
->>> os.unlink('/tmp/_test_index.db')
diff --git a/tests/runalltests.py b/tests/runalltests.py
index bbf0f97..02034b9 100644
--- a/tests/runalltests.py
+++ b/tests/runalltests.py
@@ -14,15 +14,14 @@ import unittest
import doctest
from pkg_resources import resource_filename
-from sqlalchemy import clear_mappers
doctests = [
- resource_filename(__name__, "query.txt"),
+ resource_filename(__name__, "xapianindex.txt"),
resource_filename(__name__, "milestone_1.txt"),
resource_filename(__name__, "sugar_demo_may17.txt"),
resource_filename(__name__, "milestone_2.txt"),
resource_filename(__name__, "mountpoints.txt"),
- resource_filename(__name__, "properties.txt")
+ resource_filename(__name__, "properties.txt"),
]
@@ -30,43 +29,30 @@ doctest_options = doctest.ELLIPSIS
doctest_options |= doctest.REPORT_ONLY_FIRST_FAILURE
-# IF YOU ARE NOT GETTING THE RESULTS YOU EXPECT WHILE TESTING
-# THIS IS THE LIKELY CAUSE
-# :: Use distutils to modify the pythonpath for inplace testing
-# using the build directory
-from distutils.util import get_platform
-plat_specifier = ".%s-%s" % (get_platform(), sys.version[0:3])
-build_platlib = os.path.join("build", 'lib' + plat_specifier)
-test_lib = os.path.join(os.path.abspath(".."), build_platlib)
-sys.path.insert(0, test_lib)
-# END PATH ADJUSTMENT CODE
-
-
-
-def tearDownDS(test):
- # reset the module global mappers used in SQLAlchemy between tests
- clear_mappers()
- # and remove the test repository used in some tests
- os.system('rm -rf /tmp/test_ds')
-
def test_suite():
+ global doctests
suite = unittest.TestSuite()
+ if len(sys.argv) > 1:
+ doctests = sys.argv[1:]
+
for dt in doctests:
suite.addTest(doctest.DocFileSuite(dt,
- optionflags=doctest_options, tearDown=tearDownDS))
+ optionflags=doctest_options))
- tests = os.listdir(os.curdir)
- tests = [n[:-3] for n in tests if n.startswith('test') and
- n.endswith('.py')]
+ if len(sys.argv) <= 1:
+ tests = os.listdir(os.curdir)
+ tests = [n[:-3] for n in tests if n.startswith('test') and
+ n.endswith('.py')]
- for test in tests:
- m = __import__(test)
- if hasattr(m, 'test_suite'):
- suite.addTest(m.test_suite())
+ for test in tests:
+ m = __import__(test)
+ if hasattr(m, 'test_suite'):
+ suite.addTest(m.test_suite())
return suite
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=1)
- runner.run(test_suite())
+ suite = test_suite()
+ runner.run(suite)
diff --git a/tests/sugar_demo_may17.txt b/tests/sugar_demo_may17.txt
index c899799..f242140 100644
--- a/tests/sugar_demo_may17.txt
+++ b/tests/sugar_demo_may17.txt
@@ -2,6 +2,7 @@ How Sugar will interact with the DS for the May 17th demo in Argentina:
>>> from olpc.datastore import DataStore
>>> from olpc.datastore import backingstore
+>>> from testutils import waitforindex
>>> ds = DataStore()
>>> ds.registerBackend(backingstore.FileBackingStore)
>>> assert ds.mount("/tmp/test_ds")
@@ -9,11 +10,14 @@ How Sugar will interact with the DS for the May 17th demo in Argentina:
Create an entry without data:
>>> uid = ds.create(dict(title="New entry"), '')
+>>> waitforindex(ds)
+
>>> ds.get_filename(uid)
''
Update an entry without data:
>>> ds.update(uid, dict(title="New entry still without content"), '')
+>>> waitforindex(ds)
>>> ds.get_filename(uid)
''
@@ -23,6 +27,7 @@ Add some data to the same entry:
>>> print >>fp, "some content"
>>> fp.close()
>>> ds.update(uid, dict(title="Same entry now with some content"), fp.name)
+>>> waitforindex(ds)
Retrieve that data:
>>> fn = ds.get_filename(uid)
@@ -36,6 +41,7 @@ Update again:
>>> print >>fp, "some other content"
>>> fp.close()
>>> ds.update(uid, dict(title="Same entry with some other content"), fp.name)
+>>> waitforindex(ds)
And retrieve again:
>>> fn = ds.get_filename(uid)
@@ -60,6 +66,7 @@ Set content as pdf:
>>> ds.update(uid, dict(title="Same entry with some content in pdf"), 'test.pdf')
>>> ds.update(uid, dict(title="Same entry with some content in doc"), 'test.doc')
>>> ds.update(uid, dict(title="Same entry with some content in odt"), 'test.odt')
+>>> waitforindex(ds)
>>> ds.stop()
>>> del ds
diff --git a/tests/test_backingstore.py b/tests/test_backingstore.py
index 28fdeba..8aab9e6 100644
--- a/tests/test_backingstore.py
+++ b/tests/test_backingstore.py
@@ -1,21 +1,21 @@
import unittest
-from StringIO import StringIO
+from testutils import tmpData, waitforindex
from olpc.datastore import backingstore
-from sqlalchemy import clear_mappers
import os
DEFAULT_STORE = '/tmp/_bs_test'
class Test(unittest.TestCase):
- def tearDown(self):
+ def setUp(self):
if os.path.exists(DEFAULT_STORE):
os.system("rm -rf %s" % DEFAULT_STORE)
- clear_mappers()
+ def tearDown(self):
+ if os.path.exists(DEFAULT_STORE):
+ os.system("rm -rf %s" % DEFAULT_STORE)
def test_fsstore(self):
- clear_mappers()
bs = backingstore.FileBackingStore(DEFAULT_STORE)
bs.initialize_and_load()
bs.create_descriptor()
@@ -28,20 +28,28 @@ class Test(unittest.TestCase):
d = """This is a test"""
d2 = "Different"
- c = bs.create(dict(title="A"), StringIO(d))
- obj = bs.get(c.id)
+ uid = bs.create(dict(title="A"), tmpData(d))
+
+ waitforindex(bs)
+
+ obj = bs.get(uid)
+
assert obj.get_property('title') == "A"
got = obj.file.read()
assert got == d
- bs.update(c.id, dict(title="B"), StringIO(d2))
- obj = bs.get(c.id)
+ bs.update(uid, dict(title="B"), tmpData(d2))
+
+ waitforindex(bs)
+
+ obj = bs.get(uid)
assert obj.get_property('title') == "B"
got = obj.file.read()
assert got == d2
- bs.delete(c.id)
- self.failUnlessRaises(KeyError, bs.get, c.id)
+ bs.delete(uid)
+ bs.complete_indexing()
+ self.failUnlessRaises(KeyError, bs.get, uid)
def test_suite():
suite = unittest.TestSuite()
diff --git a/tests/test_model.py b/tests/test_model.py
index 6e8c896..2ac2fb2 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -1,35 +1,55 @@
import unittest
-from testutils import tmpData
+from testutils import tmpData, waitforindex
from olpc.datastore import DataStore
from olpc.datastore import model, backingstore
import datetime
import os
+
+DEFAULT_STORE = '/tmp/test_ds'
+
class Test(unittest.TestCase):
+ def setUp(self): os.system('rm -rf %s' % DEFAULT_STORE)
+ def tearDown(self): os.system('rm -rf %s' % DEFAULT_STORE)
+
def test_dateproperty(self):
n = datetime.datetime.now()
# we have to kill the microseconds as
# time.strptime which we must use in 2.4 doesn't parse it
n = n.replace(microsecond=0)
- p = model.DateProperty('ctime', n)
+ p = model.Property('ctime', n, 'date')
assert p.key == "ctime"
- assert p.value.isoformat() == n.isoformat()
+        # XXX: the 'date()' is a workaround for a missing secore
+ # feature right now
+ assert p.value == n.date().isoformat()
+
def test_binaryproperty(self):
ds = DataStore()
ds.registerBackend(backingstore.FileBackingStore)
- ds.mount('/tmp/test_ds')
+
+ #add a custom field to the model
+ dm = model.defaultModel.copy().addField('thumbnail', 'binary')
+
+ ds.mount(DEFAULT_STORE, {'indexmanager.model' : dm})
+
+
data = open('test.jpg', 'r').read()
# binary data with \0's in it can cause dbus errors here
- uid = ds.create({'title' : "Document 1", 'thumbnail:binary' : data},
- tmpData("with image\0\0 prop"))
+ fn = tmpData("with image\0\0 prop")
+        # XXX: we should be able to remove ':binary' now
+ uid = ds.create({'title' : "Document 1", 'thumbnail:binary' : data}, fn)
+
+ waitforindex(ds)
+
c = ds.get(uid)
assert c.get_property('thumbnail') == data
+
ds.stop()
- os.system('rm -rf /tmp/test_ds')
+
def test_suite():
suite = unittest.TestSuite()
diff --git a/tests/test_xapianindex.py b/tests/test_xapianindex.py
new file mode 100644
index 0000000..db6afef
--- /dev/null
+++ b/tests/test_xapianindex.py
@@ -0,0 +1,90 @@
+from olpc.datastore.xapianindex import IndexManager
+import os
+from datetime import datetime
+
+import time
+import unittest
+import gnomevfs
+
+DEFAULT_STORE = '/tmp/_xi_test'
+
+
+def index_file(iconn, filepath):
+ """Index a file."""
+
+ mimetype = gnomevfs.get_mime_type(filepath)
+ main, subtype = mimetype.split('/',1)
+
+ stat = os.stat(filepath)
+ ctime = datetime.fromtimestamp(stat.st_ctime)
+ mtime = datetime.fromtimestamp(stat.st_mtime)
+
+ if main in ['image']: filepath = None
+ if subtype in ['x-trash', 'x-python-bytecode']: filepath = None
+
+
+
+ props = {'mimetype' : mimetype, 'mtime:date' : mtime,
+ 'ctime:date' : ctime,}
+
+ if filepath:
+ fn = os.path.split(filepath)[1]
+ props['filename'] = fn
+
+ iconn.index(props, filepath)
+
+ return 1
+
+def index_path(iconn, docpath):
+ """Index a path."""
+ count = 0
+ for dirpath, dirnames, filenames in os.walk(docpath):
+ for filename in filenames:
+ filepath = os.path.join(dirpath, filename)
+ index_file(iconn, filepath)
+ count += 1
+ return count
+
+class Test(unittest.TestCase):
+ def setUp(self):
+ if os.path.exists(DEFAULT_STORE):
+ os.system("rm -rf %s" % DEFAULT_STORE)
+
+ def tearDown(self):
+ if os.path.exists(DEFAULT_STORE):
+ os.system("rm -rf %s" % DEFAULT_STORE)
+
+ def test_index(self):
+ # import a bunch of documents into the store
+ im = IndexManager()
+ im.connect(DEFAULT_STORE)
+
+ # test basic index performance
+ start = time.time()
+ count = index_path(im, os.getcwd())
+ end = time.time()
+ delta = end - start
+
+ #print "%s in %s %s/sec" % (count, delta, count/delta)
+
+ # wait for indexing to finish
+ im.complete_indexing()
+
+ # test basic search performance
+ results = list(im.search('peek')[0])
+
+ # this indicates that we found text inside binary content that
+ # we expected
+ assert 'test.pdf' in set(r.get_property('filename') for r in results)
+
+ assert im.search('mimetype:application/pdf filename:test.pdf peek')[1] == 1
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTest(unittest.makeSuite(Test))
+ return suite
+
+if __name__ == "__main__":
+ unittest.main()
+
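
The test above feeds typed properties using the 'key:kind' spelling
('mtime:date', 'ctime:date'), which IndexManager._mapProperties converts
through model.Property.fromstring. A small sketch of that mapping; that
the parsed key drops its ':kind' suffix is an inference from index()
later looking up the plain 'mimetype' key:

    from olpc.datastore import model

    # a 'key:kind' string becomes a typed Property object
    p = model.Property.fromstring('ctime:date', '2007-07-13T04:33:19')
    assert p.key == 'ctime'    # the ':date' suffix selected the kind
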
diff --git a/tests/testutils.py b/tests/testutils.py
index 243747a..a4efc0a 100644
--- a/tests/testutils.py
+++ b/tests/testutils.py
@@ -7,3 +7,8 @@ def tmpData(data):
os.write(fd, data)
os.close(fd)
return fn
+
+def waitforindex(obj):
+    # wait for any/all index managers associated with the object to
+    # finish indexing so that tests can do their thing
+ obj.complete_indexing()
diff --git a/tests/xapianindex.txt b/tests/xapianindex.txt
new file mode 100644
index 0000000..354d7a8
--- /dev/null
+++ b/tests/xapianindex.txt
@@ -0,0 +1,73 @@
+The xapian index module can be used directly as follows
+
+First clean up any old test data.
+
+>>> index_home = "/tmp/xi"
+>>> import os, sys, time, logging
+>>> assert os.system('rm -rf %s' % index_home) == 0
+
+# >>> logging.basicConfig(level=logging.DEBUG,
+# ... format="%(asctime)-15s %(name)s %(levelname)s: %(message)s",
+# ... stream=sys.stderr)
+
+
+>>> from olpc.datastore.xapianindex import IndexManager
+>>> from olpc.datastore import model
+>>> im = IndexManager()
+>>> im.connect(index_home)
+
+
+Now add the file to the index.
+
+>>> props = dict(title="PDF Document",
+... mimetype="application/pdf")
+
+
+>>> uid = im.index(props, "test.pdf")
+
+Let the async indexer do its thing. We ask the indexer if it has work
+left; when it has none we expect our content to be indexed and searchable.
+
+>>> im.complete_indexing()
+
+
+Searching on a property of the content works.
+>>> def expect(r, count=None):
+... if count: assert r[1] == count
+... return list(r[0])
+>>> def expect_single(r):
+... assert r[1] == 1
+... return r[0].next()
+>>> def expect_none(r):
+... assert r[1] == 0
+... assert list(r[0]) == []
+
+
+>>> assert expect_single(im.search("PDF")).id == uid
+
+Searching into the binary content of the object works as well.
+>>> assert expect_single(im.search("peek")).id == uid
+
+Specifying a search that demands a document term be found only in the
+title works as well.
+
+>>> assert expect_single(im.search('title:PDF')).id == uid
+>>> expect_none(im.search('title:peek'))
+
+Searching for documents that are PDF works as expected. Here we use
+the dictionary form of the query, where each field name maps to a
+query against that field.
+>>> assert expect_single(im.search(dict(mimetype='application/pdf'))).id == uid
+
+Punctuation is fine.
+
+>>> assert expect_single(im.search("Don't peek")).id == uid
+
+Quoted strings are fine as well.
+
+>>> assert expect_single(im.search(r'''"Don't peek"''')).id == uid
+
+Cleanly shut down.
+>>> im.stop()
+
+>>> assert os.system('rm -rf %s' % index_home) == 0
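
One closing sketch: the dictionary form of IndexManager.search composes
one query_field per key with OP_AND, so every field must match. Assuming
an IndexManager populated as in the session above:

    # dict queries AND one query_field per key (see search() in
    # xapianindex.py above)
    results, count = im.search({'mimetype': 'application/pdf',
                                'title': 'PDF'})   # both fields match
    results, count = im.search({'mimetype': 'text/plain',
                                'title': 'PDF'})   # AND fails; expect no hits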