checkpoint new branch before the property type/xapian field merge

author: Benjamin Saller <bcsaller@objectrealms.net> 2007-07-12 21:14:06 (GMT)
committer: Benjamin Saller <bcsaller@objectrealms.net> 2007-07-12 21:14:06 (GMT)
commit: f577c2c142c7648a482e0eec7ecd736c1ca716d7 (patch)
tree: 259c5cf191116379e97d8aebc260f9664ad3a0e5 /tests
parent: d7092a126f230f22344b50d79b8bd362d659953b (diff)
3 files changed, 206 insertions, 0 deletions
diff --git a/tests/cleaner.py b/tests/cleaner.py
new file mode 100755
index 0000000..8cc795b
--- /dev/null
+++ b/tests/cleaner.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+import os
+import re
+from ore.main import Application
+
+# Applied with .match(), so each pattern only anchors at the start of a name.
+filepattern = re.compile(r"(\w{8})\-(\w{4})\-(\w{4})\-(\w{4})\-(\w{12})")
+tmppattern = re.compile(r"tmp\S{6}")
+staticdirs = re.compile(r'test_ds|store\d')
+
+filepatterns = [filepattern, tmppattern]
+dirpatterns = [staticdirs]
+
+class Cleaner(Application):
+    """Deletes leftovers of test runs found below a base directory."""
+
+    def manage_options(self):
+        # --base selects the directory tree to clean; defaults to /tmp
+        self.parser.add_option("--base", dest="base_dir",
+                               action="store", default='/tmp',
+                               help="""Where to clean (/tmp)""")
+
+    def main(self):
+        """clean up files left from testing in /tmp"""
+        import subprocess  # local: not among the file-level imports
+        for root, dirs, files in os.walk(self.options.base_dir):
+            for filename in files:
+                if any(pat.match(filename) for pat in filepatterns):
+                    os.remove(os.path.join(root, filename))
+            for dirname in list(dirs):  # copy: dirs is pruned below
+                if any(pat.match(dirname) for pat in dirpatterns):
+                    # argv list, no shell: odd characters in the path
+                    # cannot be reinterpreted as shell syntax
+                    subprocess.call(['rm', '-rf', os.path.join(root, dirname)])
+                    dirs.remove(dirname)  # keep os.walk out of it
+# Instantiate the application and invoke it when run as a script.
+if __name__ == "__main__":
+    Cleaner("cleaner")()
+
diff --git a/tests/test_xapianindex.py b/tests/test_xapianindex.py
new file mode 100644
index 0000000..cf39f01
--- /dev/null
+++ b/tests/test_xapianindex.py
@@ -0,0 +1,91 @@
+from testutils import waitforindex
+
+from olpc.datastore.xapianindex import IndexManager
+import os
+from datetime import datetime
+
+import time
+import unittest
+import gnomevfs
+
+DEFAULT_STORE = '/tmp/_xi_test'
+
+
+def index_file(iconn, filepath):
+    """Hand one file to the index connection.
+
+    The mimetype and the ctime/mtime stamps are always passed as
+    properties.  Images, trash files and python bytecode get no
+    'filename' property and are indexed with a path of None.
+    Always returns 1.
+    """
+    mimetype = gnomevfs.get_mime_type(filepath)
+    main, subtype = mimetype.split('/',1)
+
+    info = os.stat(filepath)
+    props = {
+        'mimetype' : mimetype,
+        'mtime:date' : datetime.fromtimestamp(info.st_mtime),
+        'ctime:date' : datetime.fromtimestamp(info.st_ctime),
+    }
+
+    if main == 'image' or subtype in ('x-trash', 'x-python-bytecode'):
+        filepath = None
+    if filepath:
+        props['filename'] = os.path.split(filepath)[1]
+
+    iconn.index(props, filepath)
+    return 1
+
+def index_path(iconn, docpath):
+    """Index every file below docpath and return the number of files."""
+    total = 0
+    for dirpath, dirnames, filenames in os.walk(docpath):
+        # hand each file to index_file() by its full path
+        for name in filenames:
+            index_file(iconn, os.path.join(dirpath, name))
+        total += len(filenames)
+    return total
+
+class Test(unittest.TestCase):
+    """Index the current directory into a fresh store and search it."""
+    def setUp(self):
+        if os.path.exists(DEFAULT_STORE):
+            os.system("rm -rf %s" % DEFAULT_STORE)
+
+    def tearDown(self):
+        if os.path.exists(DEFAULT_STORE):
+            os.system("rm -rf %s" % DEFAULT_STORE)
+
+    def test_index(self):
+        # import a bunch of documents into the store
+        im = IndexManager()
+        im.connect(DEFAULT_STORE)
+
+        # test basic index performance
+        start = time.time()
+        count = index_path(im, os.getcwd())
+        end = time.time()
+        delta = end - start
+        #print "%s in %s %s/sec" % (count, delta, count/delta)
+
+        # wait for indexing to finish
+        waitforindex(im)
+
+        # test basic search performance
+        results = list(im.search('peek')[0])
+
+        # this indicates that we found text inside binary content that
+        # we expected; unlike a bare assert, this check survives python -O
+        found = set(r.get_property('filename') for r in results)
+        self.assertTrue('test.pdf' in found)
+
+
+def test_suite():
+    """Bundle the tests of the Test case into one suite."""
+    tests = unittest.makeSuite(Test)
+    return unittest.TestSuite([tests])
+
+# Run the tests in this module when executed as a script.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/xapianindex.txt b/tests/xapianindex.txt
new file mode 100644
index 0000000..de495a6
--- /dev/null
+++ b/tests/xapianindex.txt
@@ -0,0 +1,76 @@
+The xapian index module can be used directly as follows
+
+First clean up any old test data.
+
+>>> index_home = "/tmp/xi"
+>>> import os, sys, time, logging
+>>> assert os.system('rm -rf %s' % index_home) == 0
+
+# >>> logging.basicConfig(level=logging.DEBUG,
+# ...                    format="%(asctime)-15s %(name)s %(levelname)s: %(message)s",
+# ...                    stream=sys.stderr)
+     
+
+>>> from olpc.datastore.xapianindex import IndexManager
+>>> from olpc.datastore import model
+>>> im = IndexManager()
+>>> im.connect(index_home)
+
+A small utility method for wrapping a normal dict into proper property
+objects.
+
+>>> def propsdict(**kwargs):
+...    d = {}
+...    for k,v in kwargs.iteritems(): 
+...        d[k] = model.Property(k, v)
+...    return d
+
+
+Now add the file to the index.
+
+>>> props = propsdict(title="PDF Document",
+...                   mimetype="application/pdf")
+
+
+>>> uid = im.index(props, "test.pdf")
+
+Let the async indexer do its thing. We ask the indexer if it has work
+left; when it has none, we expect our content to be indexed and searchable.
+
+>>> while im.working: time.sleep(0.5)
+
+
+Searching on a property of the content works.
+>>> assert im.search("PDF")[0] == uid
+
+Searching into the binary content of the object works as well.
+>>> assert im.search("peek")[0] == uid
+
+Specifying a search that demands a document term be found only in the
+title works as well.
+
+>>> assert im.search('title:PDF')[0] == uid
+>>> im.search('title:peek')
+[]
+
+Searching for documents that are PDFs works as expected. Here we use
+the dictionary form of the query, in which each field name is given
+together with the value to search for in that field.
+>>> assert im.search(dict(mimetype='application/pdf'))[0] == uid
+
+
+#Likewise excluding the match works as expected
+#>>> im.search('-title:PDF')
+#[]
+
+
+Punctuation is fine.
+
+>>> assert im.search("Don't peek")[0] == uid
+
+As well as quoted strings.
+
+>>> assert im.search(r'''"Don't peek"''')[0] == uid
+
+Cleanly shut down.
+>>> im.stop()
author	Benjamin Saller <bcsaller@objectrealms.net>	2007-07-12 21:14:06 (GMT)
committer	Benjamin Saller <bcsaller@objectrealms.net>	2007-07-12 21:14:06 (GMT)
commit	f577c2c142c7648a482e0eec7ecd736c1ca716d7 (patch)
tree	259c5cf191116379e97d8aebc260f9664ad3a0e5 /tests
parent	d7092a126f230f22344b50d79b8bd362d659953b (diff)