websdk/hatta/search.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sqlite3
import re
import os
import thread

import error


class WikiSearch(object):
    """
    Responsible for indexing words and links, for fast searching and
    backlinks. Uses a cache directory to store the index files.
    """

    word_pattern = re.compile(ur"""\w[-~&\w]+\w""", re.UNICODE)
    jword_pattern = re.compile(
ur"""[ｦ-ﾟ]+|[ぁ-ん～ー]+|[ァ-ヶ～ー]+|[0-9A-Za-z]+|"""
ur"""[０-９Ａ-Ｚａ-ｚΑ-Ωα-ωА-я]+|"""
ur"""[^- !"#$%&'()*+,./:;<=>?@\[\\\]^_`{|}"""
ur"""‾｡｢｣､･　、。，．・：；？！゛゜´｀¨"""
ur"""＾￣＿／〜‖｜…‥‘’“”"""
ur"""（）〔〕［］｛｝〈〉《》「」『』【】＋−±×÷"""
ur"""＝≠＜＞≦≧∞∴♂♀°′″℃￥＄¢£"""
ur"""％＃＆＊＠§☆★○●◎◇◆□■△▲▽▼※〒"""
ur"""→←↑↓〓∈∋⊆⊇⊂⊃∪∩∧∨¬⇒⇔∠∃∠⊥"""
ur"""⌒∂∇≡≒≪≫√∽∝∵∫∬Å‰♯♭♪†‡¶◾"""
ur"""─│┌┐┘└├┬┤┴┼"""
ur"""━┃┏┓┛┗┣┫┻╋"""
ur"""┠┯┨┷┿┝┰┥┸╂"""
ur"""ｦ-ﾟぁ-ん～ーァ-ヶ"""
ur"""0-9A-Za-z０-９Ａ-Ｚａ-ｚΑ-Ωα-ωА-я]+""", re.UNICODE)

    def __init__(self, cache_path, lang, storage):
        self._con = {}
        self.path = cache_path
        self.storage = storage
        self.lang = lang
        if lang == "ja":
            self.split_text = self.split_japanese_text
        self.filename = os.path.join(cache_path, 'index.sqlite3')
        if not os.path.isdir(self.path):
            self.empty = True
            os.makedirs(self.path)
        elif not os.path.exists(self.filename):
            self.empty = True
        else:
            self.empty = False
        self.init_db(self.con)

    def init_db(self, con):
        con.execute('CREATE TABLE IF NOT EXISTS titles '
                '(id INTEGER PRIMARY KEY, title VARCHAR);')
        con.execute('CREATE TABLE IF NOT EXISTS words '
                '(word VARCHAR, page INTEGER, count INTEGER);')
        con.execute('CREATE INDEX IF NOT EXISTS index1 '
                         'ON words (page);')
        con.execute('CREATE INDEX IF NOT EXISTS index2 '
                         'ON words (word);')
        con.execute('CREATE TABLE IF NOT EXISTS links '
            '(src INTEGER, target INTEGER, label VARCHAR, number INTEGER);')
        con.commit()

    @property
    def con(self):
        """Keep one connection per thread."""

        thread_id = thread.get_ident()
        try:
            return self._con[thread_id]
        except KeyError:
            connection = sqlite3.connect(self.filename)
            self._con[thread_id] = connection
            return connection

    def split_text(self, text):
        """Splits text into words"""

        for match in self.word_pattern.finditer(text):
            word = match.group(0)
            yield word.lower()

    def split_japanese_text(self, text):
        """Splits text into words, including rules for Japanese"""

        for match in self.word_pattern.finditer(text):
            word = match.group(0)
            got_japanese = False
            for m in self.jword_pattern.finditer(word):
                w = m.group(0)
                got_japanese = True
                yield w.lower()
            if not got_japanese:
                yield word.lower()

    def count_words(self, words):
        count = {}
        for word in words:
            count[word] = count.get(word, 0) + 1
        return count

    def title_id(self, title, con):
        c = con.execute('SELECT id FROM titles WHERE title=?;', (title,))
        idents = c.fetchone()
        if idents is None:
            con.execute('INSERT INTO titles (title) VALUES (?);', (title,))
            c = con.execute('SELECT LAST_INSERT_ROWID();')
            idents = c.fetchone()
        return idents[0]

    def update_words(self, title, text, cursor):
        title_id = self.title_id(title, cursor)
        cursor.execute('DELETE FROM words WHERE page=?;', (title_id,))
        if not text:
            return
        words = self.count_words(self.split_text(text))
        title_words = self.count_words(self.split_text(title))
        for word, count in title_words.iteritems():
            words[word] = words.get(word, 0) + count
        for word, count in words.iteritems():
            cursor.execute('INSERT INTO words VALUES (?, ?, ?);',
                             (word, title_id, count))

    def update_links(self, title, links_and_labels, cursor):
        title_id = self.title_id(title, cursor)
        cursor.execute('DELETE FROM links WHERE src=?;', (title_id,))
        for number, (link, label) in enumerate(links_and_labels):
            cursor.execute('INSERT INTO links VALUES (?, ?, ?, ?);',
                             (title_id, link, label, number))

    def orphaned_pages(self):
        """Gives all pages with no links to them."""

        con = self.con
        try:
            sql = ('SELECT title FROM titles '
                   'WHERE NOT EXISTS '
                   '(SELECT * FROM links WHERE target=title) '
                   'ORDER BY title;')
            for (title,) in con.execute(sql):
                yield unicode(title)
        finally:
            con.commit()

    def wanted_pages(self):
        """Gives all pages that are linked to, but don't exist, together with
        the number of links."""

        con = self.con
        try:
            sql = ('SELECT COUNT(*), target FROM links '
                   'WHERE NOT EXISTS '
                   '(SELECT * FROM titles WHERE target=title) '
                   'GROUP BY target ORDER BY -COUNT(*);')
            for (refs, db_title,) in con.execute(sql):
                title = unicode(db_title)
                yield refs, title
        finally:
            con.commit()

    def page_backlinks(self, title):
        """Gives a list of pages linking to specified page."""

        con = self.con  # sqlite3.connect(self.filename)
        try:
            sql = ('SELECT DISTINCT(titles.title) '
                   'FROM links, titles '
                   'WHERE links.target=? AND titles.id=links.src '
                   'ORDER BY titles.title;')
            for (backlink,) in con.execute(sql, (title,)):
                yield unicode(backlink)
        finally:
            con.commit()

    def page_links(self, title):
        """Gives a list of links on specified page."""

        con = self.con  # sqlite3.connect(self.filename)
        try:
            title_id = self.title_id(title, con)
            sql = 'SELECT target FROM links WHERE src=? ORDER BY number;'
            for (link,) in con.execute(sql, (title_id,)):
                yield unicode(link)
        finally:
            con.commit()

    def page_links_and_labels(self, title):
        con = self.con  # sqlite3.connect(self.filename)
        try:
            title_id = self.title_id(title, con)
            sql = ('SELECT target, label FROM links '
                   'WHERE src=? ORDER BY number;')
            for link, label in con.execute(sql, (title_id,)):
                yield unicode(link), unicode(label)
        finally:
            con.commit()

    def find(self, words):
        """Iterator of all pages containing the words, and their scores."""

        con = self.con
        try:
            ranks = []
            for word in words:
                # Calculate popularity of each word.
                sql = 'SELECT SUM(words.count) FROM words WHERE word LIKE ?;'
                rank = con.execute(sql, ('%%%s%%' % word,)).fetchone()[0]
                # If any rank is 0, there will be no results anyways
                if not rank:
                    return
                ranks.append((rank, word))
            ranks.sort()
            # Start with the least popular word. Get all pages that contain it.
            first_rank, first = ranks[0]
            rest = ranks[1:]
            sql = ('SELECT words.page, titles.title, SUM(words.count) '
                   'FROM words, titles '
                   'WHERE word LIKE ? AND titles.id=words.page '
                   'GROUP BY words.page;')
            first_counts = con.execute(sql, ('%%%s%%' % first,))
            # Check for the rest of words
            for title_id, title, first_count in first_counts:
                # Score for the first word
                score = float(first_count) / first_rank
                for rank, word in rest:
                    sql = ('SELECT SUM(count) FROM words '
                           'WHERE page=? AND word LIKE ?;')
                    count = con.execute(sql,
                        (title_id, '%%%s%%' % word)).fetchone()[0]
                    if not count:
                        # If page misses any of the words, its score is 0
                        score = 0
                        break
                    score += float(count) / rank
                if score > 0:
                    yield int(100 * score), unicode(title)
        finally:
            con.commit()

    def reindex_page(self, page, title, cursor, text=None):
        """Updates the content of the database, needs locks around."""

        if text is None:
            get_text = getattr(page, 'plain_text', lambda: u'')
            try:
                text = get_text()
            except error.NotFoundErr:
                text = None
                title_id = self.title_id(title, cursor)
                if not list(self.page_backlinks(title)):
                    cursor.execute("DELETE FROM titles WHERE id=?;",
                                   (title_id,))
        extract_links = getattr(page, 'extract_links', None)
        if extract_links and text:
            links = extract_links(text)
        else:
            links = []
        self.update_links(title, links, cursor=cursor)
        self.update_words(title, text or u'', cursor=cursor)

    def update_page(self, page, title, data=None, text=None):
        """Updates the index with new page content, for a single page."""

        if text is None and data is not None:
            text = unicode(data, self.storage.charset, 'replace')
        cursor = self.con.cursor()
        try:
            self.set_last_revision(self.storage.repo_revision())
            self.reindex_page(page, title, cursor, text)
            self.con.commit()
        except:
            self.con.rollback()
            raise

    def reindex(self, wiki, pages):
        """Updates specified pages in bulk."""

        cursor = self.con.cursor()
        try:
            for title in pages:
                page = wiki.get_page(None, title)
                self.reindex_page(page, title, cursor)
            self.con.commit()
            self.empty = False
        except:
            self.con.rollback()
            raise

    def set_last_revision(self, rev):
        """Store the last indexed repository revision."""

        # We use % here because the sqlite3's substitiution doesn't work
        # We store revision 0 as 1, 1 as 2, etc. because 0 means "no revision"
        self.con.execute('PRAGMA USER_VERSION=%d;' % (int(rev + 1),))

    def get_last_revision(self):
        """Retrieve the last indexed repository revision."""

        con = self.con
        c = con.execute('PRAGMA USER_VERSION;')
        rev = c.fetchone()[0]
        # -1 means "no revision", 1 means revision 0, 2 means revision 1, etc.
        return rev - 1

    def update(self, wiki):
        """Reindex al pages that changed since last indexing."""

        last_rev = self.get_last_revision()
        if last_rev == -1:
            changed = self.storage.all_pages()
        else:
            changed = self.storage.changed_since(last_rev)
        self.reindex(wiki, changed)
        rev = self.storage.repo_revision()
        self.set_last_revision(rev)