diff options
Diffstat (limited to 'websdk/hatta/search.py')
-rw-r--r-- | websdk/hatta/search.py | 317 |
1 file changed, 317 insertions, 0 deletions
diff --git a/websdk/hatta/search.py b/websdk/hatta/search.py new file mode 100644 index 0000000..2d8ae69 --- /dev/null +++ b/websdk/hatta/search.py @@ -0,0 +1,317 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sqlite3 +import re +import os +import thread + +import error + + +class WikiSearch(object): + """ + Responsible for indexing words and links, for fast searching and + backlinks. Uses a cache directory to store the index files. + """ + + word_pattern = re.compile(ur"""\w[-~&\w]+\w""", re.UNICODE) + jword_pattern = re.compile( +ur"""[ヲ-゚]+|[ぁ-ん~ー]+|[ァ-ヶ~ー]+|[0-9A-Za-z]+|""" +ur"""[0-9A-Za-zΑ-Ωα-ωА-я]+|""" +ur"""[^- !"#$%&'()*+,./:;<=>?@\[\\\]^_`{|}""" +ur"""‾。「」、・ 、。,.・:;?!゛゜´`¨""" +ur"""^ ̄_/〜‖|…‥‘’“”""" +ur"""()〔〕[]{}〈〉《》「」『』【】+−±×÷""" +ur"""=≠<>≦≧∞∴♂♀°′″℃¥$¢£""" +ur"""%#&*@§☆★○●◎◇◆□■△▲▽▼※〒""" +ur"""→←↑↓〓∈∋⊆⊇⊂⊃∪∩∧∨¬⇒⇔∠∃∠⊥""" +ur"""⌒∂∇≡≒≪≫√∽∝∵∫∬ʼn♯♭♪†‡¶◾""" +ur"""─│┌┐┘└├┬┤┴┼""" +ur"""━┃┏┓┛┗┣┫┻╋""" +ur"""┠┯┨┷┿┝┰┥┸╂""" +ur"""ヲ-゚ぁ-ん~ーァ-ヶ""" +ur"""0-9A-Za-z0-9A-Za-zΑ-Ωα-ωА-я]+""", re.UNICODE) + + def __init__(self, cache_path, lang, storage): + self._con = {} + self.path = cache_path + self.storage = storage + self.lang = lang + if lang == "ja": + self.split_text = self.split_japanese_text + self.filename = os.path.join(cache_path, 'index.sqlite3') + if not os.path.isdir(self.path): + self.empty = True + os.makedirs(self.path) + elif not os.path.exists(self.filename): + self.empty = True + else: + self.empty = False + self.init_db(self.con) + + def init_db(self, con): + con.execute('CREATE TABLE IF NOT EXISTS titles ' + '(id INTEGER PRIMARY KEY, title VARCHAR);') + con.execute('CREATE TABLE IF NOT EXISTS words ' + '(word VARCHAR, page INTEGER, count INTEGER);') + con.execute('CREATE INDEX IF NOT EXISTS index1 ' + 'ON words (page);') + con.execute('CREATE INDEX IF NOT EXISTS index2 ' + 'ON words (word);') + con.execute('CREATE TABLE IF NOT EXISTS links ' + '(src INTEGER, target INTEGER, label VARCHAR, number INTEGER);') + con.commit() + + @property + def 
con(self): + """Keep one connection per thread.""" + + thread_id = thread.get_ident() + try: + return self._con[thread_id] + except KeyError: + connection = sqlite3.connect(self.filename) + self._con[thread_id] = connection + return connection + + def split_text(self, text): + """Splits text into words""" + + for match in self.word_pattern.finditer(text): + word = match.group(0) + yield word.lower() + + def split_japanese_text(self, text): + """Splits text into words, including rules for Japanese""" + + for match in self.word_pattern.finditer(text): + word = match.group(0) + got_japanese = False + for m in self.jword_pattern.finditer(word): + w = m.group(0) + got_japanese = True + yield w.lower() + if not got_japanese: + yield word.lower() + + def count_words(self, words): + count = {} + for word in words: + count[word] = count.get(word, 0) + 1 + return count + + def title_id(self, title, con): + c = con.execute('SELECT id FROM titles WHERE title=?;', (title,)) + idents = c.fetchone() + if idents is None: + con.execute('INSERT INTO titles (title) VALUES (?);', (title,)) + c = con.execute('SELECT LAST_INSERT_ROWID();') + idents = c.fetchone() + return idents[0] + + def update_words(self, title, text, cursor): + title_id = self.title_id(title, cursor) + cursor.execute('DELETE FROM words WHERE page=?;', (title_id,)) + if not text: + return + words = self.count_words(self.split_text(text)) + title_words = self.count_words(self.split_text(title)) + for word, count in title_words.iteritems(): + words[word] = words.get(word, 0) + count + for word, count in words.iteritems(): + cursor.execute('INSERT INTO words VALUES (?, ?, ?);', + (word, title_id, count)) + + def update_links(self, title, links_and_labels, cursor): + title_id = self.title_id(title, cursor) + cursor.execute('DELETE FROM links WHERE src=?;', (title_id,)) + for number, (link, label) in enumerate(links_and_labels): + cursor.execute('INSERT INTO links VALUES (?, ?, ?, ?);', + (title_id, link, label, number)) 
+ + def orphaned_pages(self): + """Gives all pages with no links to them.""" + + con = self.con + try: + sql = ('SELECT title FROM titles ' + 'WHERE NOT EXISTS ' + '(SELECT * FROM links WHERE target=title) ' + 'ORDER BY title;') + for (title,) in con.execute(sql): + yield unicode(title) + finally: + con.commit() + + def wanted_pages(self): + """Gives all pages that are linked to, but don't exist, together with + the number of links.""" + + con = self.con + try: + sql = ('SELECT COUNT(*), target FROM links ' + 'WHERE NOT EXISTS ' + '(SELECT * FROM titles WHERE target=title) ' + 'GROUP BY target ORDER BY -COUNT(*);') + for (refs, db_title,) in con.execute(sql): + title = unicode(db_title) + yield refs, title + finally: + con.commit() + + def page_backlinks(self, title): + """Gives a list of pages linking to specified page.""" + + con = self.con # sqlite3.connect(self.filename) + try: + sql = ('SELECT DISTINCT(titles.title) ' + 'FROM links, titles ' + 'WHERE links.target=? AND titles.id=links.src ' + 'ORDER BY titles.title;') + for (backlink,) in con.execute(sql, (title,)): + yield unicode(backlink) + finally: + con.commit() + + def page_links(self, title): + """Gives a list of links on specified page.""" + + con = self.con # sqlite3.connect(self.filename) + try: + title_id = self.title_id(title, con) + sql = 'SELECT target FROM links WHERE src=? ORDER BY number;' + for (link,) in con.execute(sql, (title_id,)): + yield unicode(link) + finally: + con.commit() + + def page_links_and_labels(self, title): + con = self.con # sqlite3.connect(self.filename) + try: + title_id = self.title_id(title, con) + sql = ('SELECT target, label FROM links ' + 'WHERE src=? 
ORDER BY number;') + for link, label in con.execute(sql, (title_id,)): + yield unicode(link), unicode(label) + finally: + con.commit() + + def find(self, words): + """Iterator of all pages containing the words, and their scores.""" + + con = self.con + try: + ranks = [] + for word in words: + # Calculate popularity of each word. + sql = 'SELECT SUM(words.count) FROM words WHERE word LIKE ?;' + rank = con.execute(sql, ('%%%s%%' % word,)).fetchone()[0] + # If any rank is 0, there will be no results anyways + if not rank: + return + ranks.append((rank, word)) + ranks.sort() + # Start with the least popular word. Get all pages that contain it. + first_rank, first = ranks[0] + rest = ranks[1:] + sql = ('SELECT words.page, titles.title, SUM(words.count) ' + 'FROM words, titles ' + 'WHERE word LIKE ? AND titles.id=words.page ' + 'GROUP BY words.page;') + first_counts = con.execute(sql, ('%%%s%%' % first,)) + # Check for the rest of words + for title_id, title, first_count in first_counts: + # Score for the first word + score = float(first_count) / first_rank + for rank, word in rest: + sql = ('SELECT SUM(count) FROM words ' + 'WHERE page=? 
AND word LIKE ?;') + count = con.execute(sql, + (title_id, '%%%s%%' % word)).fetchone()[0] + if not count: + # If page misses any of the words, its score is 0 + score = 0 + break + score += float(count) / rank + if score > 0: + yield int(100 * score), unicode(title) + finally: + con.commit() + + def reindex_page(self, page, title, cursor, text=None): + """Updates the content of the database, needs locks around.""" + + if text is None: + get_text = getattr(page, 'plain_text', lambda: u'') + try: + text = get_text() + except error.NotFoundErr: + text = None + title_id = self.title_id(title, cursor) + if not list(self.page_backlinks(title)): + cursor.execute("DELETE FROM titles WHERE id=?;", + (title_id,)) + extract_links = getattr(page, 'extract_links', None) + if extract_links and text: + links = extract_links(text) + else: + links = [] + self.update_links(title, links, cursor=cursor) + self.update_words(title, text or u'', cursor=cursor) + + def update_page(self, page, title, data=None, text=None): + """Updates the index with new page content, for a single page.""" + + if text is None and data is not None: + text = unicode(data, self.storage.charset, 'replace') + cursor = self.con.cursor() + try: + self.set_last_revision(self.storage.repo_revision()) + self.reindex_page(page, title, cursor, text) + self.con.commit() + except: + self.con.rollback() + raise + + def reindex(self, wiki, pages): + """Updates specified pages in bulk.""" + + cursor = self.con.cursor() + try: + for title in pages: + page = wiki.get_page(None, title) + self.reindex_page(page, title, cursor) + self.con.commit() + self.empty = False + except: + self.con.rollback() + raise + + def set_last_revision(self, rev): + """Store the last indexed repository revision.""" + + # We use % here because the sqlite3's substitiution doesn't work + # We store revision 0 as 1, 1 as 2, etc. 
because 0 means "no revision" + self.con.execute('PRAGMA USER_VERSION=%d;' % (int(rev + 1),)) + + def get_last_revision(self): + """Retrieve the last indexed repository revision.""" + + con = self.con + c = con.execute('PRAGMA USER_VERSION;') + rev = c.fetchone()[0] + # -1 means "no revision", 1 means revision 0, 2 means revision 1, etc. + return rev - 1 + + def update(self, wiki): + """Reindex al pages that changed since last indexing.""" + + last_rev = self.get_last_revision() + if last_rev == -1: + changed = self.storage.all_pages() + else: + changed = self.storage.changed_since(last_rev) + self.reindex(wiki, changed) + rev = self.storage.repo_revision() + self.set_last_revision(rev) |