Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/woip/rb
diff options
context:
space:
mode:
authorWade Brainerd <wadetb@gmail.com>2008-05-23 22:59:37 (GMT)
committer Wade Brainerd <wadetb@gmail.com>2008-05-23 22:59:37 (GMT)
commit9878512ab181ef56e82d91ed3e69ddbaa50520d0 (patch)
tree879e52bebdea44daa32afaaa8802c183fd9484ed /woip/rb
parentdd58bf72d6799438d8033cf7de6bc26a711734c3 (diff)
Reorganization step 2.
Diffstat (limited to 'woip/rb')
-rw-r--r--woip/rb/#bzipreader.rb#192
-rw-r--r--woip/rb/article.rb30
-rw-r--r--woip/rb/bzipreader.rb192
-rw-r--r--woip/rb/common.rb3
-rw-r--r--woip/rb/index.rb54
-rw-r--r--woip/rb/livesearch.rb106
-rw-r--r--woip/rb/server.rb284
-rw-r--r--woip/rb/titles.rb17
-rw-r--r--woip/rb/xapian-index.rb30
-rw-r--r--woip/rb/xmlprocess.rb52
10 files changed, 960 insertions, 0 deletions
diff --git a/woip/rb/#bzipreader.rb# b/woip/rb/#bzipreader.rb#
new file mode 100644
index 0000000..1ccacca
--- /dev/null
+++ b/woip/rb/#bzipreader.rb#
@@ -0,0 +1,192 @@
+# rm -r ~/.ruby_inline; ARCHFLAGS="-arch i386" ruby -r rubygems -r bzipreader.rb \
+# -e 'BzipReader.new("../ga.wp.txt.bz2").readBlock(0)'
+
+require 'rubygems'
+require 'tempfile'
+require 'inline'
+require File.join(File.dirname(__FILE__), 'common')
+
+BZ_MAX_BLOCK = 1024 * 900
+
+class BzipReader # Reads compressed bzip2 blocks from a file or from stdin through C helpers compiled with RubyInline
+ attr :offset # reader for @offset (0 after initialize, updated by readBlock and readNextBlock)
+
+ def debug(str) # writes str to standard error
+ $stderr.puts str
+ end
+
+ inline(:C) do |builder| # the C functions handed to builder.c below are invoked further down as the __-prefixed methods
+ builder.add_compile_flags "-I../c -I. -lbz2" # header search paths plus linking against libbz2
+ builder.add_compile_flags "../c/bzipreader.c" # C sources are passed along as extra compiler arguments
+ builder.add_compile_flags "../c/safe.c"
+
+ builder.prefix '
+ #include "bzipreader.h"
+ uint64_t readOffset;
+ '
+
+ ['VALUE __decompressBlock(char *src, int srcLen) {
+ char dest[BZ_MAX_BLOCK];
+ uint32_t destLen = BZ_MAX_BLOCK;
+ int ret;
+
+ debug("ruby decompressing %d bytes", srcLen);
+ if((ret = decompressBlock(src, srcLen, dest, &destLen)) != BZ_OK)
+ fatal("couldn\'t decompress: bz error %d", ret);
+
+ return rb_str_new(dest, destLen);
+ }',
+
+ 'VALUE __readBlock(char *file) {
+ FILE *in;
+ uint64_t realOffset;
+ VALUE str;
+
+ if(strlen(file) == 0) {
+ in = xfopen("/dev/stdin", "rb");
+ realOffset = 0;
+ } else {
+ in = xfopen(file, "rb");
+ realOffset = readOffset;
+ }
+
+ BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK);
+ readOffset = fixedOffset(readBlock(in, realOffset, bb));
+
+ xfclose(in);
+
+ str = rb_str_new(bb->buff, bb->pos);
+ bbClose(bb);
+ return str;
+ }',
+
+ 'void __setReadOffset(char *offset) {
+ readOffset = *((uint64_t *) offset);
+ }',
+
+ 'VALUE __getReadOffset() {
+ return rb_str_new((char *) &readOffset, sizeof(uint64_t));
+ }',
+
+ 'int __computeBoundaries(char *file) {
+ int size;
+ FILE *in = xfopen(file, "rb");
+ size = computeBoundaries(in);
+ xfclose(in);
+ return size;
+ }'].each {|c| builder.c c} # register every C function in the array with the builder
+ end
+
+ def uint64_to_char(num) # packs num into 8 bytes: low 32-bit word first, then the high word, each in native byte order
+ hi = num >> 32
+ lo = num & 0xffffffff
+ [lo, hi].pack('L2') # NOTE(review): low-word-first matches the uint64_t that __setReadOffset reads only on little-endian hosts -- confirm
+ end
+
+ def char_to_uint64(char) # inverse of uint64_to_char: rebuilds the integer from two packed 32-bit words
+ lo, hi = char.unpack('L2')
+ return lo + (hi << 32)
+ end
+
+ def initialize(file="") # file: path of the bzip2 file; an empty string selects stdin mode
+ if file.empty?
+ @useStdin = true
+ @buffered = "" # compressed bytes read from stdin that have not been consumed yet
+ @eof = false # becomes true once stdin has nothing more to give
+ end
+
+ @file = file
+ @offset = 0
+ end
+
+ def getReadOffset # current value of the C-side readOffset as a Ruby integer
+ char_to_uint64(__getReadOffset)
+ end
+
+ def setReadOffset(num) # stores num in the C-side readOffset
+ __setReadOffset(uint64_to_char(num))
+ end
+
+ def readNextBlock # returns the next compressed block; raises EOF when the input is used up
+ # in bzipreader.c, readBlock reads more than it should -- to determine the end of a block,
+ # it reads the header for the subsequent block. If we're reading from a file, this is ok;
+ # we can seek back to where we want to be. But when reading from stdin, it's more awkward,
+ # and we have to buffer things
+ # TODO: use a named pipe
+ if @useStdin
+ if !@buffered or (@offset > 0 and @buffered.size < 40) # buffer was sliced away to nil, or fewer than 40 bytes remain after the first block
+ raise EOF
+ end
+
+ begin
+ @buffered += $stdin.read(BZ_MAX_BLOCK) if @buffered.size < BZ_MAX_BLOCK and !@eof # top the buffer up from stdin
+ rescue TypeError # $stdin.read returned nil at end of input, so the String + nil above raised
+ @eof = true
+ end
+
+ tempfile = Tempfile.new('bzipreader') # NOTE(review): never closed or unlinked explicitly; one temp file is created per block
+ tempfile.write(@buffered)
+ tempfile.flush
+
+ setReadOffset(0) # the temp file holds only unconsumed data, so reading starts at its beginning
+
+ block = __readBlock(tempfile.path)
+
+ offset = getReadOffset - 80 # NOTE(review): 80 is presumably the bit length of the over-read next-block header -- confirm
+ @offset += offset
+ bytes = (offset >> 3) # offsets are counted in bits; convert to whole bytes
+ @buffered = @buffered[bytes..-1] # drop what was consumed; yields nil when bytes exceeds the buffer length
+
+ block
+ else
+ readBlock(@offset)
+ end
+ end
+
+ def readBlock(offset) # reads the block at the given bit offset and stores the offset reported afterwards in @offset
+ unless @useStdin
+ if File.size(@file) < (offset >> 3) + 80 + 40 # don't ask
+ raise EOF
+ end
+ end
+ setReadOffset(offset)
+ block = __readBlock(@file)
+ @offset = getReadOffset
+ block
+ end
+
+ def decompressBlock(str) # str: one compressed block as returned by readBlock or readNextBlock; returns the decompressed bytes
+ __decompressBlock(str, str.size)
+ end
+
+ def computeBoundaries # runs the C computeBoundaries over @file and returns its integer result
+ __computeBoundaries(@file)
+ end
+
+ def self.test(skip=0) # manual smoke test: skips that many blocks, then prints the first 101 bytes of the next one
+ b = BzipReader.new('../ga.wp.txt.bz2')
+ skip.times do b.readNextBlock end
+ block = b.readNextBlock
+ block = b.decompressBlock(block)[0..100]
+ puts block
+ end
+
+ def self.stdin_test(skip = 0) # decompresses stdin block by block to stdout, logging block number, offset and a preview to stderr
+ b = self.new
+ skip.times do b.readNextBlock end
+ blockNo = -1
+
+ while true
+ begin
+ offset = b.offset
+ block = b.readNextBlock
+ plaintext = b.decompressBlock(block)
+ $stderr.puts "#{blockNo += 1}\t#{offset}\t#{plaintext.gsub(/\n/, "\\n")[0..30]}"
+ $stdout.write plaintext
+ rescue EOF # raised by readNextBlock once the input is exhausted
+ $stderr.puts "EOF"
+ break
+ end
+ end
+ end
+end
diff --git a/woip/rb/article.rb b/woip/rb/article.rb
new file mode 100644
index 0000000..eb44340
--- /dev/null
+++ b/woip/rb/article.rb
@@ -0,0 +1,30 @@
+# these are actually real ASCII codes
+START_HEADING = 1.chr # ASCII 1 (SOH); Article#write emits it on the line before the title
+START_TEXT = 2.chr # ASCII 2 (STX); emitted on the line before the body
+END_TEXT = 3.chr # ASCII 3 (ETX); emitted on the line after the body
+
+# One article (title plus body) in the processed-dump record format.
+class Article
+  attr_accessor :title
+  attr_writer :body
+
+  # Body text; an unset body is initialised to, and read back as, the empty string.
+  def body
+    @body ||= ''
+  end
+
+  # Emits the record to the IO-like object +str+, one puts per field:
+  # heading marker, title, body size, text marker, body, end marker.
+  # The stream is flushed afterwards.
+  def write(str)
+    [START_HEADING, title, body.size, START_TEXT, body, END_TEXT].each do |field|
+      str.puts field
+    end
+    str.flush
+  end
+end
+
+# Returns the next line of standard input.
+# Raises EOF when stdin is already closed or has no more lines.
+def stdin_gets
+  raise EOF if $stdin.closed?
+  $stdin.gets || raise(EOF)
+end
diff --git a/woip/rb/bzipreader.rb b/woip/rb/bzipreader.rb
new file mode 100644
index 0000000..1ccacca
--- /dev/null
+++ b/woip/rb/bzipreader.rb
@@ -0,0 +1,192 @@
+# rm -r ~/.ruby_inline; ARCHFLAGS="-arch i386" ruby -r rubygems -r bzipreader.rb \
+# -e 'BzipReader.new("../ga.wp.txt.bz2").readBlock(0)'
+
+require 'rubygems'
+require 'tempfile'
+require 'inline'
+require File.join(File.dirname(__FILE__), 'common')
+
+BZ_MAX_BLOCK = 1024 * 900
+
+class BzipReader # Reads compressed bzip2 blocks from a file or from stdin through C helpers compiled with RubyInline
+ attr :offset # reader for @offset (0 after initialize, updated by readBlock and readNextBlock)
+
+ def debug(str) # writes str to standard error
+ $stderr.puts str
+ end
+
+ inline(:C) do |builder| # the C functions handed to builder.c below are invoked further down as the __-prefixed methods
+ builder.add_compile_flags "-I../c -I. -lbz2" # header search paths plus linking against libbz2
+ builder.add_compile_flags "../c/bzipreader.c" # C sources are passed along as extra compiler arguments
+ builder.add_compile_flags "../c/safe.c"
+
+ builder.prefix '
+ #include "bzipreader.h"
+ uint64_t readOffset;
+ '
+
+ ['VALUE __decompressBlock(char *src, int srcLen) {
+ char dest[BZ_MAX_BLOCK];
+ uint32_t destLen = BZ_MAX_BLOCK;
+ int ret;
+
+ debug("ruby decompressing %d bytes", srcLen);
+ if((ret = decompressBlock(src, srcLen, dest, &destLen)) != BZ_OK)
+ fatal("couldn\'t decompress: bz error %d", ret);
+
+ return rb_str_new(dest, destLen);
+ }',
+
+ 'VALUE __readBlock(char *file) {
+ FILE *in;
+ uint64_t realOffset;
+ VALUE str;
+
+ if(strlen(file) == 0) {
+ in = xfopen("/dev/stdin", "rb");
+ realOffset = 0;
+ } else {
+ in = xfopen(file, "rb");
+ realOffset = readOffset;
+ }
+
+ BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK);
+ readOffset = fixedOffset(readBlock(in, realOffset, bb));
+
+ xfclose(in);
+
+ str = rb_str_new(bb->buff, bb->pos);
+ bbClose(bb);
+ return str;
+ }',
+
+ 'void __setReadOffset(char *offset) {
+ readOffset = *((uint64_t *) offset);
+ }',
+
+ 'VALUE __getReadOffset() {
+ return rb_str_new((char *) &readOffset, sizeof(uint64_t));
+ }',
+
+ 'int __computeBoundaries(char *file) {
+ int size;
+ FILE *in = xfopen(file, "rb");
+ size = computeBoundaries(in);
+ xfclose(in);
+ return size;
+ }'].each {|c| builder.c c} # register every C function in the array with the builder
+ end
+
+ def uint64_to_char(num) # packs num into 8 bytes: low 32-bit word first, then the high word, each in native byte order
+ hi = num >> 32
+ lo = num & 0xffffffff
+ [lo, hi].pack('L2') # NOTE(review): low-word-first matches the uint64_t that __setReadOffset reads only on little-endian hosts -- confirm
+ end
+
+ def char_to_uint64(char) # inverse of uint64_to_char: rebuilds the integer from two packed 32-bit words
+ lo, hi = char.unpack('L2')
+ return lo + (hi << 32)
+ end
+
+ def initialize(file="") # file: path of the bzip2 file; an empty string selects stdin mode
+ if file.empty?
+ @useStdin = true
+ @buffered = "" # compressed bytes read from stdin that have not been consumed yet
+ @eof = false # becomes true once stdin has nothing more to give
+ end
+
+ @file = file
+ @offset = 0
+ end
+
+ def getReadOffset # current value of the C-side readOffset as a Ruby integer
+ char_to_uint64(__getReadOffset)
+ end
+
+ def setReadOffset(num) # stores num in the C-side readOffset
+ __setReadOffset(uint64_to_char(num))
+ end
+
+ def readNextBlock # returns the next compressed block; raises EOF when the input is used up
+ # in bzipreader.c, readBlock reads more than it should -- to determine the end of a block,
+ # it reads the header for the subsequent block. If we're reading from a file, this is ok;
+ # we can seek back to where we want to be. But when reading from stdin, it's more awkward,
+ # and we have to buffer things
+ # TODO: use a named pipe
+ if @useStdin
+ if !@buffered or (@offset > 0 and @buffered.size < 40) # buffer was sliced away to nil, or fewer than 40 bytes remain after the first block
+ raise EOF
+ end
+
+ begin
+ @buffered += $stdin.read(BZ_MAX_BLOCK) if @buffered.size < BZ_MAX_BLOCK and !@eof # top the buffer up from stdin
+ rescue TypeError # $stdin.read returned nil at end of input, so the String + nil above raised
+ @eof = true
+ end
+
+ tempfile = Tempfile.new('bzipreader') # NOTE(review): never closed or unlinked explicitly; one temp file is created per block
+ tempfile.write(@buffered)
+ tempfile.flush
+
+ setReadOffset(0) # the temp file holds only unconsumed data, so reading starts at its beginning
+
+ block = __readBlock(tempfile.path)
+
+ offset = getReadOffset - 80 # NOTE(review): 80 is presumably the bit length of the over-read next-block header -- confirm
+ @offset += offset
+ bytes = (offset >> 3) # offsets are counted in bits; convert to whole bytes
+ @buffered = @buffered[bytes..-1] # drop what was consumed; yields nil when bytes exceeds the buffer length
+
+ block
+ else
+ readBlock(@offset)
+ end
+ end
+
+ def readBlock(offset) # reads the block at the given bit offset and stores the offset reported afterwards in @offset
+ unless @useStdin
+ if File.size(@file) < (offset >> 3) + 80 + 40 # don't ask
+ raise EOF
+ end
+ end
+ setReadOffset(offset)
+ block = __readBlock(@file)
+ @offset = getReadOffset
+ block
+ end
+
+ def decompressBlock(str) # str: one compressed block as returned by readBlock or readNextBlock; returns the decompressed bytes
+ __decompressBlock(str, str.size)
+ end
+
+ def computeBoundaries # runs the C computeBoundaries over @file and returns its integer result
+ __computeBoundaries(@file)
+ end
+
+ def self.test(skip=0) # manual smoke test: skips that many blocks, then prints the first 101 bytes of the next one
+ b = BzipReader.new('../ga.wp.txt.bz2')
+ skip.times do b.readNextBlock end
+ block = b.readNextBlock
+ block = b.decompressBlock(block)[0..100]
+ puts block
+ end
+
+ def self.stdin_test(skip = 0) # decompresses stdin block by block to stdout, logging block number, offset and a preview to stderr
+ b = self.new
+ skip.times do b.readNextBlock end
+ blockNo = -1
+
+ while true
+ begin
+ offset = b.offset
+ block = b.readNextBlock
+ plaintext = b.decompressBlock(block)
+ $stderr.puts "#{blockNo += 1}\t#{offset}\t#{plaintext.gsub(/\n/, "\\n")[0..30]}"
+ $stdout.write plaintext
+ rescue EOF # raised by readNextBlock once the input is exhausted
+ $stderr.puts "EOF"
+ break
+ end
+ end
+ end
+end
diff --git a/woip/rb/common.rb b/woip/rb/common.rb
new file mode 100644
index 0000000..1c7426e
--- /dev/null
+++ b/woip/rb/common.rb
@@ -0,0 +1,3 @@
+require 'rubygems'
+
+class EOF < RuntimeError; end
\ No newline at end of file
diff --git a/woip/rb/index.rb b/woip/rb/index.rb
new file mode 100644
index 0000000..dc376f9
--- /dev/null
+++ b/woip/rb/index.rb
@@ -0,0 +1,54 @@
+require File.join(File.dirname(__FILE__), 'bzipreader')
+require File.join(File.dirname(__FILE__), 'article')
+
+$nblock = -1 # index of the most recently decompressed block; get_line increments it, so the first block is 0
+$startblock = 0 # titles are printed only while $startblock <= $nblock; main overrides it from ARGV[1] when given
+
+# Returns the next line of decompressed text. When the pending-line stack is
+# empty, the next block is read and decompressed first, the block counter is
+# advanced and a progress line is written to stderr.
+def get_line
+  return $lines.pop unless $lines.empty?
+
+  txt = $reader.decompressBlock($reader.readNextBlock)
+  $nblock += 1
+
+  $stderr.puts "#{$nblock}\t#{$reader.offset}\t#{txt.gsub(/\n/, "\\n")[0..30]}\t#{txt.size}"
+
+  # Stored reversed so that pop hands the lines back in their original order.
+  $lines = txt.split("\n").reverse
+  $lines.pop
+end
+
+# Doubles every single quote in +str+ and returns the result as a new string.
+def quote(str)
+  str.gsub("'") { "''" }
+end
+
+# Scans the decompressed stream for heading markers and prints
+# "<title> <block number>" for each article, provided the start block has been
+# reached. Stops with "Done" on stderr when the reader raises EOF.
+def process_titles
+  loop do
+    line = get_line
+    # Remember the block of the marker line; fetching the title may advance $nblock.
+    block = $nblock
+    next unless line.chomp == START_HEADING
+
+    article = get_line.chomp
+    puts "#{article} #{block}" if $startblock <= $nblock
+  end
+rescue EOF
+  $stderr.puts "Done"
+end
+
+# Entry point. ARGV[0], when present, is the bzip2 file to read (stdin
+# otherwise); ARGV[1], when present, is the first block whose titles are printed.
+def main
+  $startblock = ARGV[1].to_i if ARGV[1]
+
+  $reader = ARGV.empty? ? BzipReader.new : BzipReader.new(ARGV.first)
+
+  $lines = []
+  process_titles
+end
+
+if __FILE__ == $0 # call main only when this file is the script being executed, not when it is required
+ main
+end
diff --git a/woip/rb/livesearch.rb b/woip/rb/livesearch.rb
new file mode 100644
index 0000000..f57ae04
--- /dev/null
+++ b/woip/rb/livesearch.rb
@@ -0,0 +1,106 @@
+require 'rubygems'
+require 'curses'
+require 'inline'
+
+class TernarySearcher # Ruby wrapper around the C code in ../c/searcher.c, compiled with RubyInline
+ RESULTS = 10 # interpolated into the C below: size of resultbuf and the count at which handleResult returns false
+
+ inline(:C) do |builder|
+ builder.include '"ternary.h"'
+ builder.add_compile_flags "-I../c -I. -DDEBUG"
+ builder.add_compile_flags "../c/searcher.c" # the C source is passed along as an extra compiler argument
+
+ builder.prefix "
+ char resultbuf[#{RESULTS}][MAXLINE];
+ int haveresults;
+ "
+
+ builder.c "int handleResult(char *s) {
+ strncpy(resultbuf[haveresults++], s, MAXLINE);
+ if(haveresults == #{RESULTS}) return false;
+ else return true;
+ }"
+
+ builder.c 'void __init(char *indexFile) {
+ load_root(indexFile);
+ }'
+
+ builder.c 'void __prefixSearch(char *s, int n) {
+ haveresults = 0;
+ root_search(s, handleResult);
+ }'
+ end
+
+ def initialize(index) # index: path handed to the C load_root call
+ __init(index)
+ end
+
+ def prefixSearch(str, n) # NOTE(review): the C body never reads n, and nothing here copies resultbuf into @results
+ @results = []
+ __prefixSearch(str, n)
+ end
+end
+
+# Curses front end: keeps a search string and redraws the matches after every key press.
+class Searcher
+  # The index to search is taken from the first command-line argument.
+  def initialize
+    @needle = ""
+    @xap = XapianSearcher.new(ARGV.first)
+  end
+
+  # Clears the screen, prints the search string on the top row, then the matches.
+  def refresh
+    Curses.clear
+    Curses.setpos(0, 0)
+    Curses.addstr(@needle)
+    draw_matches
+    Curses.refresh
+  end
+
+  # Draws the matches on consecutive rows; the first match goes on row 2.
+  def draw_matches
+    @matches.each_with_index do |match, index|
+      match.draw(index + 2)
+    end
+  end
+
+  # Runs the current search string through the searcher and wraps each hit in a Match.
+  def search
+    hits = @xap.matches(@needle)
+    @matches = hits.map { |hit| Match.new(hit) }
+  end
+
+  # Main loop: read a key, edit the search string, search again, redraw.
+  # The screen is restored on the way out, whatever ended the loop.
+  def run
+    Curses.init_screen
+    Curses.noecho
+    Curses.stdscr.keypad(true)
+
+    loop do
+      key = Curses.getch
+
+      @needle = if key == 127 # Backspace
+                  @needle.empty? ? @needle : @needle[0..-2]
+                else
+                  @needle + key.chr
+                end
+      search
+      refresh
+    end
+  ensure
+    Curses.close_screen
+  end
+end
+
+class Match # One search hit that knows how to draw itself with Curses
+ def initialize(str) # str: the text shown for this hit
+ @string = str
+ end
+
+ def draw(line) # prints the text at column 0 of the given screen row
+ Curses.setpos(line, 0)
+ Curses.addstr(@string)
+ end
+end
+
+if $0 == __FILE__
+ puts "running..."
+ Searcher.new.run
+end
\ No newline at end of file
diff --git a/woip/rb/server.rb b/woip/rb/server.rb
new file mode 100644
index 0000000..0b2ea29
--- /dev/null
+++ b/woip/rb/server.rb
@@ -0,0 +1,284 @@
+require 'rubygems'
+require 'mongrel'
+require 'inline'
+
+READER = "WP, the free offline Wikipedia reader thing" # product name shown in page titles and under article headings
+MAXRES = 40 # maximum number of search results; also interpolated into the inline C of WPArticleReader as its MAXRES macro
+USAGE = %Q[
+<strong>Supported requests:</strong>
+<ul>
+<li><tt>/wiki/Foo</tt> - parsed Wikipedia article on Foo</li>
+<li><tt>/raw/Foo</tt> - raw text of article</li>
+<li><tt>/search?s=foo</tt> - article titles containing foo</li>
+</ul>
+]
+SEARCH_BOX=%Q{<div width="100" style="float: right;">
+ <form action="/search" method="get">
+ <input type="text" name="s" accesskey="s" /></form></div>}
+
+class String
+  # Returns a copy of the string with its first character upper-cased and the
+  # rest left untouched. An empty string yields an empty string: slicing past
+  # the end with [1..-1] gives nil there, hence the to_s.
+  def titleize
+    self[0..0].upcase + self[1..-1].to_s
+  end
+end
+
+# Minimal wikitext-to-HTML converter built from regular expressions.
+class Parser
+  # Parser.php is 5k lines, and has a billion dependencies, transclusion logic,
+  # magic-word support, language processing, templating code (a literally
+  # Turing-complete sublanguage), and stuff... so let's just use regexps
+
+  # Turns every "|key = value" line of +str+ into "<b>key</b> value<br>".
+  # Blanks and tabs around the "=" are dropped. A POSIX class such as
+  # [:space:] only works inside a bracket expression, and [[:space:]] would
+  # also match the newline and let an empty value swallow the next line, so a
+  # plain [ \t] is used.
+  def parse_infobox(str)
+    str.gsub(/^\|([^=]+?)[ \t]*=[ \t]*(.*)$/) { "<b>#{$1}</b> #{$2}<br>" }
+  end
+
+  # Replaces each innermost {{...}} template in +str+. A template whose first
+  # line reads "Infobox <word>" becomes a floating box; every other template
+  # is removed.
+  def parse_template(str)
+    str.gsub(/\{\{(([^[:space:]]+) (\w+)$)?([^\}\{]+)\}\}/) {
+      if $2 == "Infobox"
+        %Q[ <div style="width: 200px; background-color: #eee; border: 1px solid #bbb; float: right">
+ <h2>#{$3}</h2>
+ #{parse_infobox($4)} </div> ]
+      else
+        ""
+      end
+    }
+  end
+
+  # Converts wikitext to HTML and returns the result. The order of the steps
+  # matters: bold must be handled before italic, and links before templates.
+  def parse(str)
+    # Lists; this barely works in even the simplest cases
+    str = str.gsub(/^\*(.+)$/) { "<span style=\"display: block\">* #{$1}</span>" }
+
+    # Bold
+    str = str.gsub(/'''(.+?)'''/) { "<b>#{$1}</b>" }
+
+    # Italic
+    str = str.gsub(/''(.+?)''/) { "<i>#{$1}</i>" }
+
+    # Headings
+    str = str.gsub(/(={2,})([^=]+)\1/) {"<h#{$1.size}>#{$2}</h#{$1.size}>"}
+
+    # Interwiki links
+    str = str.gsub(/\[\[([^\|\]]+)\|([^\]]+)\]\](\w*)/) { Article.link_to($1, $2 + $3) }
+    str = str.gsub(/\[\[(.*?)\]\](\w*)/) { Article.link_to($1, $1 + $2) }
+
+    # Strip refs
+    str = str.gsub(/<ref>.*?<\/ref>/, '')
+
+    # Templates, which may be nested, and which we need to parse from innermost first
+    while (new = parse_template(str)) != str
+      str = new
+    end
+
+    # External links
+    str = str.gsub(/\[([^\][:space:]]+) ([^\]]+)\]/) {"<a href=\"#{$1}\">#{$2}</a>"}
+    str = str.gsub(/\[([^\][:space:]]+)\]/) {"<a href=\"#{$1}\">#{$1}</a>"}
+  end
+end
+
+# An article as served over HTTP: raw wikitext, the block it was found in, and its title.
+class Article
+  attr_accessor :text
+  attr_accessor :block
+  attr_accessor :title
+
+  # The article text converted to HTML by a fresh Parser.
+  def parsed_text
+    Parser.new.parse(self.text)
+  end
+
+  # Full HTML page for the article. The title is HTML-escaped wherever it is
+  # shown because it is taken from the request rather than from the dump.
+  def as_html
+    %Q[<html><head><title>#{CGI::escapeHTML(title)} - #{READER}</title></head>
+ <body>
+ #{SEARCH_BOX}
+ <h1>#{CGI::escapeHTML(title)}</h1><h5>#{READER}</h5><p><small>#{text.size} bytes from block #{block}
+ (<a href="/raw/#{CGI::escape(title.titleize)}">raw</a>)</small></p>
+ #{parsed_text}
+ </body></html>]
+  end
+
+  # Anchor tag pointing at the /wiki/ page for +name+ with +text+ as its
+  # label. The name is capitalised and URL-escaped; +text+ is inserted as given.
+  def self.link_to(name, text)
+    "<a href=\"/wiki/#{CGI::escape(name.titleize)}\">#{text}</a>"
+  end
+end
+
+# The outcome of a title search: the search string and the matching titles.
+class SearchResult
+  attr_accessor :results
+  attr_accessor :needle
+
+  # Full HTML page listing one link per result. The search string comes from
+  # the request, so it is HTML-escaped before being echoed; result titles are
+  # escaped as link labels too.
+  def as_html
+    %Q[<html><head><title>Search: #{CGI::escapeHTML(needle)} - #{READER}</title></head>
+ <body>
+ #{SEARCH_BOX}
+ <h1>Search: #{CGI::escapeHTML(needle)}</h1>
+ <ul>#{results.map{|r| "<li>" + Article.link_to(r, CGI::escapeHTML(r)) + "</li>"}.join}</ul>
+ <p><small>Searches return up to #{MAXRES} articles containing the search string anywhere in their title.
+ Results are case-insensitive. Exact matches appear first, followed by prefix matches, and, lastly, substring matches.
+ Press ^S to quickly jump to search.</small></p>
+ </body>
+ </html>]
+  end
+end
+
+class WPArticleReader # Ruby front end to the C dump reader and title searcher, compiled in through RubyInline
+ inline(:C) do |builder| # the functions handed to builder.c below are called further down as the __-prefixed methods
+ builder.add_compile_flags "-I../c -I. -lbz2 -DDEBUG"
+ builder.add_compile_flags "../c/bzipreader.c" # C sources are passed along as extra compiler arguments
+ builder.add_compile_flags "../c/wp.c"
+ builder.add_compile_flags "../c/lsearcher.c"
+ builder.add_compile_flags "../c/safe.c"
+ builder.add_compile_flags "../c/blocks.c"
+
+ builder.prefix %Q$
+ #include "wp.h"
+ #define MAXRES #{MAXRES}
+ #define MAXSTR 1024
+
+ wp_dump d = {0};
+ wp_article a = {0};
+
+ char results[MAXRES][MAXSTR];
+ int nresults;
+
+ bool __handle_result(char *s) {
+ strncpy(results[nresults], s, MAXSTR);
+ results[nresults][MAXSTR - 1] = \'\\0\';
+ char *end = strrchr(results[nresults], \' \');
+
+ if(end) {
+ *(end - 1) = \'\\0\';
+ nresults++;
+ }
+
+ return nresults < MAXRES;
+ }
+ $
+
+ builder.c 'void __load_dump(char *dump, char *loc, char *ploc, char *blocks) {
+ load_dump(&d, dump, loc, ploc, blocks);
+ init_article(&a);
+ }'
+
+ builder.c 'char *__load_article(char *name) {
+ a.block = 0;
+ a.text[0] = \'\0\';
+ load_article(&d, name, &a);
+ return a.text;
+ }'
+
+ builder.c 'int __article_block() {
+ return a.block;
+ }'
+
+ builder.c 'int __article_size() {
+ return strlen(a.text);
+ }'
+
+
+ builder.c 'int __search(char *needle) {
+ nresults = 0;
+ search(&d.index, needle, __handle_result, NULL, true, true);
+ return nresults;
+ }'
+
+ builder.c 'char *__result(int n) {
+ return results[n];
+ }'
+ end
+
+ def initialize(opts) # opts: hash with :locatedb, :prefixdb, :blockdb and :dump paths, all handed to the C load_dump
+ @locatedb = opts[:locatedb]
+ @prefixdb = opts[:prefixdb]
+ @blockdb = opts[:blockdb]
+ @dump = opts[:dump]
+
+ __load_dump(@dump, @locatedb, @prefixdb, @blockdb)
+ end
+
+ def fetch(name) # returns an Article for name; WPHandler treats an empty text as not found
+ text = __load_article(name)
+ a = Article.new
+ a.text = text
+ a.block = __article_block # block number recorded by the C side during the load above
+ a.title = name
+ a
+ end
+
+ def find(name) # returns a SearchResult holding name and the titles collected by the C search
+ n = __search(name)
+ r = SearchResult.new
+ r.needle = name
+ r.results = (0..n - 1).map {|n| __result(n)} # NOTE(review): the block parameter n shadows the result count n
+ r
+ end
+end
+
+# Mongrel handler serving parsed articles (/wiki/X), raw article text (/raw/X)
+# and title searches (/search?s=x) out of a WPArticleReader.
+class WPHandler < Mongrel::HttpHandler
+  # base: common path prefix of the data files; the suffixes .locate.db,
+  # .locate.prefixdb, .blocks.db and .processed are appended to it.
+  def initialize(base)
+    $stderr.puts "Using base #{base}"
+    @reader = WPArticleReader.new(:locatedb => "#{base}.locate.db",
+                                  :prefixdb => "#{base}.locate.prefixdb",
+                                  :blockdb => "#{base}.blocks.db",
+                                  :dump => "#{base}.processed")
+    @parser = Parser.new
+  end
+
+  # The URL-decoded request URI.
+  def path(req)
+    CGI::unescape(req.params["REQUEST_URI"])
+  end
+
+  # Sends a 404 page naming what was not found, followed by the usage text.
+  # +str+ comes from the request URI, so it is HTML-escaped before being echoed.
+  def notfound(resp, str)
+    respond(resp, true, 404) { "Couldn't find #{CGI::escapeHTML(str)}. <p>#{USAGE}</p>" }
+  end
+
+  # Starts the response with the given status and writes the value of the
+  # block as its body. html selects text/html over text/plain.
+  def respond(resp, html=true, status=200)
+    resp.start(status) do |h, o|
+      h["Content-type"] = (html ? 'text/html' : 'text/plain') + '; charset=utf-8'
+      o.write yield
+    end
+  end
+
+  # Routes one request by matching the decoded URI against the supported paths.
+  def process(req, resp)
+    if path(req) =~ /^\/(wiki|raw)\/(.+)$/
+      article = @reader.fetch($2)
+
+      # An empty text means the reader found nothing under that name.
+      if article.text.empty?
+        notfound(resp, $2)
+      else
+        if $1 == "wiki"
+          respond(resp) { article.as_html }
+        elsif $1 == "raw"
+          respond(resp, false) { article.text }
+        end
+      end
+    elsif path(req) =~ /^\/search\?s=(.+)$/
+      respond(resp) { @reader.find($1).as_html }
+    else
+      notfound(resp, path(req))
+    end
+  end
+end
+
+class WPServer # Runs a Mongrel HTTP server whose root URI is handled by WPHandler
+ def self.start_on(port) # convenience: build a server for the given port and run it
+ self.new(:port => port).run
+ end
+
+ def initialize(opts={}) # opts: optional :port (default 9000) and :host (default 0.0.0.0)
+ @port = opts[:port] || 9000
+ @host = opts[:host] || '0.0.0.0'
+ end
+
+ def run # configures the listener and then waits on conf.join
+ $stderr.puts "Binding to #{@host}:#{@port}"
+
+ conf = Mongrel::Configurator.new(:port => @port,
+ :host => @host) {
+ listener do
+ uri '/', :handler => WPHandler.new(ARGV.first), :in_front => true # the data file prefix comes from the first command-line argument
+ end
+
+ trap("INT") { stop } # stop on an interrupt signal
+
+ run # NOTE(review): presumably the configurator run method rather than WPServer#run, since this block is given to Configurator.new -- confirm
+ }
+
+ conf.join
+ end
+end
diff --git a/woip/rb/titles.rb b/woip/rb/titles.rb
new file mode 100644
index 0000000..ec03c22
--- /dev/null
+++ b/woip/rb/titles.rb
@@ -0,0 +1,17 @@
+require File.join(File.dirname(__FILE__), 'article')
+require File.join(File.dirname(__FILE__), 'common')
+
+# Prints the line following every heading marker read from standard input,
+# i.e. each article title. Writes "Done" to stderr once stdin_gets raises EOF.
+def process_titles
+  loop do
+    next unless stdin_gets.chomp == START_HEADING
+
+    puts stdin_gets.chomp
+  end
+rescue EOF
+  $stderr.puts "Done"
+end
+
+if __FILE__ == $0
+ process_titles
+end
\ No newline at end of file
diff --git a/woip/rb/xapian-index.rb b/woip/rb/xapian-index.rb
new file mode 100644
index 0000000..2f06fa4
--- /dev/null
+++ b/woip/rb/xapian-index.rb
@@ -0,0 +1,30 @@
+require 'xapian'
+
+# Builds (or extends) a Xapian database from a "|"-separated text file.
+# ARGV[0] is the database path, ARGV[1] the input file.
+db = Xapian::WritableDatabase.new(ARGV[0], Xapian::DB_CREATE_OR_OPEN)
+processed = 0
+
+# The block form of File.open closes the input even when indexing raises.
+File.open(ARGV[1], 'r') do |f|
+  f.each_line do |line|
+    begin
+      split = line.split("|")
+      # Skip lines whose first field is empty.
+      next if split.first == ""
+
+      # One document per line: the raw line is the payload and the
+      # lower-cased first field is its only term, at position 1.
+      doc = Xapian::Document.new
+      doc.data = line
+      doc.add_posting(split.first.downcase, 1)
+      db.add_document(doc)
+      processed += 1
+
+      if processed % 100 == 0
+        $stderr.puts "#{processed}\t#{split.first}"
+      end
+    rescue
+      # Show the offending line, then let the error propagate.
+      puts line
+      raise
+    end
+  end
+end
+
+$stderr.puts "Done"
diff --git a/woip/rb/xmlprocess.rb b/woip/rb/xmlprocess.rb
new file mode 100644
index 0000000..ebe06f9
--- /dev/null
+++ b/woip/rb/xmlprocess.rb
@@ -0,0 +1,52 @@
+require "rexml/document"
+require "rexml/streamlistener"
+require File.join(File.dirname(__FILE__), 'article')
+
+include REXML
+
+# REXML stream listener that turns each <page> element into an Article and
+# writes it to standard output.
+class ArticleListener
+  include StreamListener
+
+  def initialize
+    @processed = 0
+    @start = Time.now
+  end
+
+  # Remembers the most recent text node, with lines that consist of a single
+  # [[prefix:target]] link removed and runs of newlines collapsed to one.
+  def text(text)
+    @cur_text = text.gsub(/^\[\[[^\]\[]+?\:[^\]\[]+?\]\]$/, '').gsub(/\n+/, "\n")
+  end
+
+  # Reports the page count and the pages-per-second rate (two decimals) on stderr.
+  def print_stats
+    rate = (((@processed.to_f / (Time.now - @start)) * 100).round) / 100.0
+    $stderr.puts "Processed: #{@processed}\tRate: #{rate}/sec"
+  end
+
+  # True when the article title contains neither ":" nor "/".
+  def is_desirable(article)
+    not (article.title =~ /\:/ or article.title =~ /\//)
+  end
+
+  # A new <page> starts a new article; statistics are printed every 100 pages.
+  def tag_start(name, attrs)
+    if name == 'page'
+      @cur_article = Article.new
+      @processed += 1
+      print_stats if (@processed % 100) == 0
+    end
+  end
+
+  # Closing <title> and <text> tags store the text collected so far. The
+  # article is written when its own <page> closes, so the last page of the
+  # stream is emitted as well instead of waiting for a next <page> that never
+  # comes.
+  def tag_end(name)
+    if name == 'title'
+      @cur_article.title = @cur_text
+    elsif name == 'text'
+      @cur_article.body = @cur_text
+    elsif name == 'page'
+      @cur_article.write($stdout) if @cur_article
+    end
+  end
+end
+
+# Stream-parses the XML on standard input, feeding every event to a new ArticleListener.
+def process
+  listener = ArticleListener.new
+  Document.parse_stream($stdin, listener)
+end
+
+if __FILE__ == $0 # call process only when this file is the script being executed, not when it is required
+ process
+end