Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/websdk/hatta/search.py
diff options
context:
space:
mode:
Diffstat (limited to 'websdk/hatta/search.py')
-rw-r--r--websdk/hatta/search.py317
1 file changed, 317 insertions, 0 deletions
diff --git a/websdk/hatta/search.py b/websdk/hatta/search.py
new file mode 100644
index 0000000..2d8ae69
--- /dev/null
+++ b/websdk/hatta/search.py
@@ -0,0 +1,317 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sqlite3
+import re
+import os
+import thread
+
+import error
+
+
class WikiSearch(object):
    """
    Responsible for indexing words and links, for fast searching and
    backlinks. Uses a cache directory to store the index files.
    """

    # Matches "words" of at least three characters; '-', '~' and '&' are
    # also allowed inside a word.  (ur"" literals are Python 2 syntax.)
    word_pattern = re.compile(ur"""\w[-~&\w]+\w""", re.UNICODE)
    # Tokenizer used to further split words in Japanese text, which has no
    # inter-word spaces: runs of (presumably) half-width katakana, hiragana,
    # katakana, ASCII alphanumerics, full-width alphanumerics/Greek/Cyrillic,
    # or any other characters excluding the listed punctuation and
    # full-width symbols -- TODO(review): confirm the script ranges.
    jword_pattern = re.compile(
        ur"""[ヲ-゚]+|[ぁ-ん~ー]+|[ァ-ヶ~ー]+|[0-9A-Za-z]+|"""
        ur"""[0-9A-Za-zΑ-Ωα-ωА-я]+|"""
        ur"""[^- !"#$%&'()*+,./:;<=>?@\[\\\]^_`{|}"""
        ur"""‾。「」、・ 、。,.・:;?!゛゜´`¨"""
        ur"""^ ̄_/〜‖|…‥‘’“”"""
        ur"""()〔〕[]{}〈〉《》「」『』【】+−±×÷"""
        ur"""=≠<>≦≧∞∴♂♀°′″℃¥$¢£"""
        ur"""%#&*@§☆★○●◎◇◆□■△▲▽▼※〒"""
        ur"""→←↑↓〓∈∋⊆⊇⊂⊃∪∩∧∨¬⇒⇔∠∃∠⊥"""
        ur"""⌒∂∇≡≒≪≫√∽∝∵∫∬ʼn♯♭♪†‡¶◾"""
        ur"""─│┌┐┘└├┬┤┴┼"""
        ur"""━┃┏┓┛┗┣┫┻╋"""
        ur"""┠┯┨┷┿┝┰┥┸╂"""
        ur"""ヲ-゚ぁ-ん~ーァ-ヶ"""
        ur"""0-9A-Za-z0-9A-Za-zΑ-Ωα-ωА-я]+""", re.UNICODE)
+
    def __init__(self, cache_path, lang, storage):
        """Open (or create) the index database under `cache_path`.

        `lang` selects the word splitter ("ja" enables the Japanese
        tokenizer); `storage` provides page contents and revisions.
        """

        # One sqlite3 connection per thread id, filled lazily by `con`.
        self._con = {}
        self.path = cache_path
        self.storage = storage
        self.lang = lang
        if lang == "ja":
            # Japanese text needs the extra jword_pattern tokenization.
            self.split_text = self.split_japanese_text
        self.filename = os.path.join(cache_path, 'index.sqlite3')
        if not os.path.isdir(self.path):
            # No cache directory yet: the index is necessarily empty.
            self.empty = True
            os.makedirs(self.path)
        elif not os.path.exists(self.filename):
            self.empty = True
        else:
            self.empty = False
        # Accessing `self.con` creates the database file if it is missing.
        self.init_db(self.con)
+
+ def init_db(self, con):
+ con.execute('CREATE TABLE IF NOT EXISTS titles '
+ '(id INTEGER PRIMARY KEY, title VARCHAR);')
+ con.execute('CREATE TABLE IF NOT EXISTS words '
+ '(word VARCHAR, page INTEGER, count INTEGER);')
+ con.execute('CREATE INDEX IF NOT EXISTS index1 '
+ 'ON words (page);')
+ con.execute('CREATE INDEX IF NOT EXISTS index2 '
+ 'ON words (word);')
+ con.execute('CREATE TABLE IF NOT EXISTS links '
+ '(src INTEGER, target INTEGER, label VARCHAR, number INTEGER);')
+ con.commit()
+
+ @property
+ def con(self):
+ """Keep one connection per thread."""
+
+ thread_id = thread.get_ident()
+ try:
+ return self._con[thread_id]
+ except KeyError:
+ connection = sqlite3.connect(self.filename)
+ self._con[thread_id] = connection
+ return connection
+
+ def split_text(self, text):
+ """Splits text into words"""
+
+ for match in self.word_pattern.finditer(text):
+ word = match.group(0)
+ yield word.lower()
+
+ def split_japanese_text(self, text):
+ """Splits text into words, including rules for Japanese"""
+
+ for match in self.word_pattern.finditer(text):
+ word = match.group(0)
+ got_japanese = False
+ for m in self.jword_pattern.finditer(word):
+ w = m.group(0)
+ got_japanese = True
+ yield w.lower()
+ if not got_japanese:
+ yield word.lower()
+
+ def count_words(self, words):
+ count = {}
+ for word in words:
+ count[word] = count.get(word, 0) + 1
+ return count
+
+ def title_id(self, title, con):
+ c = con.execute('SELECT id FROM titles WHERE title=?;', (title,))
+ idents = c.fetchone()
+ if idents is None:
+ con.execute('INSERT INTO titles (title) VALUES (?);', (title,))
+ c = con.execute('SELECT LAST_INSERT_ROWID();')
+ idents = c.fetchone()
+ return idents[0]
+
+ def update_words(self, title, text, cursor):
+ title_id = self.title_id(title, cursor)
+ cursor.execute('DELETE FROM words WHERE page=?;', (title_id,))
+ if not text:
+ return
+ words = self.count_words(self.split_text(text))
+ title_words = self.count_words(self.split_text(title))
+ for word, count in title_words.iteritems():
+ words[word] = words.get(word, 0) + count
+ for word, count in words.iteritems():
+ cursor.execute('INSERT INTO words VALUES (?, ?, ?);',
+ (word, title_id, count))
+
+ def update_links(self, title, links_and_labels, cursor):
+ title_id = self.title_id(title, cursor)
+ cursor.execute('DELETE FROM links WHERE src=?;', (title_id,))
+ for number, (link, label) in enumerate(links_and_labels):
+ cursor.execute('INSERT INTO links VALUES (?, ?, ?, ?);',
+ (title_id, link, label, number))
+
+ def orphaned_pages(self):
+ """Gives all pages with no links to them."""
+
+ con = self.con
+ try:
+ sql = ('SELECT title FROM titles '
+ 'WHERE NOT EXISTS '
+ '(SELECT * FROM links WHERE target=title) '
+ 'ORDER BY title;')
+ for (title,) in con.execute(sql):
+ yield unicode(title)
+ finally:
+ con.commit()
+
+ def wanted_pages(self):
+ """Gives all pages that are linked to, but don't exist, together with
+ the number of links."""
+
+ con = self.con
+ try:
+ sql = ('SELECT COUNT(*), target FROM links '
+ 'WHERE NOT EXISTS '
+ '(SELECT * FROM titles WHERE target=title) '
+ 'GROUP BY target ORDER BY -COUNT(*);')
+ for (refs, db_title,) in con.execute(sql):
+ title = unicode(db_title)
+ yield refs, title
+ finally:
+ con.commit()
+
+ def page_backlinks(self, title):
+ """Gives a list of pages linking to specified page."""
+
+ con = self.con # sqlite3.connect(self.filename)
+ try:
+ sql = ('SELECT DISTINCT(titles.title) '
+ 'FROM links, titles '
+ 'WHERE links.target=? AND titles.id=links.src '
+ 'ORDER BY titles.title;')
+ for (backlink,) in con.execute(sql, (title,)):
+ yield unicode(backlink)
+ finally:
+ con.commit()
+
+ def page_links(self, title):
+ """Gives a list of links on specified page."""
+
+ con = self.con # sqlite3.connect(self.filename)
+ try:
+ title_id = self.title_id(title, con)
+ sql = 'SELECT target FROM links WHERE src=? ORDER BY number;'
+ for (link,) in con.execute(sql, (title_id,)):
+ yield unicode(link)
+ finally:
+ con.commit()
+
+ def page_links_and_labels(self, title):
+ con = self.con # sqlite3.connect(self.filename)
+ try:
+ title_id = self.title_id(title, con)
+ sql = ('SELECT target, label FROM links '
+ 'WHERE src=? ORDER BY number;')
+ for link, label in con.execute(sql, (title_id,)):
+ yield unicode(link), unicode(label)
+ finally:
+ con.commit()
+
    def find(self, words):
        """Iterator of all pages containing the words, and their scores.

        Yields (score, title) pairs, where score is an int computed from
        the words' per-page counts weighted inversely by their overall
        popularity.  A page must contain every word to be yielded.
        """

        con = self.con
        try:
            ranks = []
            for word in words:
                # Calculate popularity of each word.
                # NOTE(review): each word is interpolated into a LIKE
                # pattern, so a literal '%' or '_' in a query word acts
                # as an extra wildcard here -- confirm this is intended.
                sql = 'SELECT SUM(words.count) FROM words WHERE word LIKE ?;'
                rank = con.execute(sql, ('%%%s%%' % word,)).fetchone()[0]
                # If any rank is 0, there will be no results anyways
                if not rank:
                    return
                ranks.append((rank, word))
            # Sorting (rank, word) tuples puts the rarest word first.
            ranks.sort()
            # Start with the least popular word. Get all pages that contain it.
            first_rank, first = ranks[0]
            rest = ranks[1:]
            sql = ('SELECT words.page, titles.title, SUM(words.count) '
                   'FROM words, titles '
                   'WHERE word LIKE ? AND titles.id=words.page '
                   'GROUP BY words.page;')
            first_counts = con.execute(sql, ('%%%s%%' % first,))
            # Check for the rest of words
            for title_id, title, first_count in first_counts:
                # Score for the first word
                score = float(first_count) / first_rank
                for rank, word in rest:
                    sql = ('SELECT SUM(count) FROM words '
                           'WHERE page=? AND word LIKE ?;')
                    count = con.execute(sql,
                        (title_id, '%%%s%%' % word)).fetchone()[0]
                    if not count:
                        # If page misses any of the words, its score is 0
                        score = 0
                        break
                    score += float(count) / rank
                if score > 0:
                    yield int(100 * score), unicode(title)
        finally:
            con.commit()
+
    def reindex_page(self, page, title, cursor, text=None):
        """Updates the content of the database, needs locks around.

        If `text` is not given it is fetched from the page; pages that
        raise NotFoundErr (or have no plain_text method) index as empty.
        """

        if text is None:
            get_text = getattr(page, 'plain_text', lambda: u'')
            try:
                text = get_text()
            except error.NotFoundErr:
                text = None
        title_id = self.title_id(title, cursor)
        if not list(self.page_backlinks(title)):
            # Nothing links here, so drop the title row; update_links /
            # update_words below re-create it (via title_id, with a fresh
            # id) as needed.
            cursor.execute("DELETE FROM titles WHERE id=?;",
                           (title_id,))
        extract_links = getattr(page, 'extract_links', None)
        if extract_links and text:
            links = extract_links(text)
        else:
            links = []
        self.update_links(title, links, cursor=cursor)
        self.update_words(title, text or u'', cursor=cursor)
+
    def update_page(self, page, title, data=None, text=None):
        """Updates the index with new page content, for a single page.

        `data` is raw bytes decoded with the storage's charset (with
        undecodable bytes replaced); `text`, when given, takes precedence.
        On any error the transaction is rolled back and the exception
        re-raised.
        """

        if text is None and data is not None:
            text = unicode(data, self.storage.charset, 'replace')
        cursor = self.con.cursor()
        try:
            self.set_last_revision(self.storage.repo_revision())
            self.reindex_page(page, title, cursor, text)
            self.con.commit()
        except:
            # Undo the partial index update before propagating.
            self.con.rollback()
            raise
+
+ def reindex(self, wiki, pages):
+ """Updates specified pages in bulk."""
+
+ cursor = self.con.cursor()
+ try:
+ for title in pages:
+ page = wiki.get_page(None, title)
+ self.reindex_page(page, title, cursor)
+ self.con.commit()
+ self.empty = False
+ except:
+ self.con.rollback()
+ raise
+
    def set_last_revision(self, rev):
        """Store the last indexed repository revision.

        The value lives in the database file's USER_VERSION pragma.
        """

        # We use % here because sqlite3's parameter substitution does not
        # work in PRAGMA statements.
        # We store revision 0 as 1, 1 as 2, etc. because 0 means "no revision"
        self.con.execute('PRAGMA USER_VERSION=%d;' % (int(rev + 1),))
+
+ def get_last_revision(self):
+ """Retrieve the last indexed repository revision."""
+
+ con = self.con
+ c = con.execute('PRAGMA USER_VERSION;')
+ rev = c.fetchone()[0]
+ # -1 means "no revision", 1 means revision 0, 2 means revision 1, etc.
+ return rev - 1
+
+ def update(self, wiki):
+ """Reindex al pages that changed since last indexing."""
+
+ last_rev = self.get_last_revision()
+ if last_rev == -1:
+ changed = self.storage.all_pages()
+ else:
+ changed = self.storage.changed_since(last_rev)
+ self.reindex(wiki, changed)
+ rev = self.storage.repo_revision()
+ self.set_last_revision(rev)