diff options
author | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:59:37 (GMT) |
---|---|---|
committer | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:59:37 (GMT) |
commit | 9878512ab181ef56e82d91ed3e69ddbaa50520d0 (patch) | |
tree | 879e52bebdea44daa32afaaa8802c183fd9484ed /woip/rb | |
parent | dd58bf72d6799438d8033cf7de6bc26a711734c3 (diff) |
Reorganization step 2.
Diffstat (limited to 'woip/rb')
-rw-r--r-- | woip/rb/#bzipreader.rb# | 192 | ||||
-rw-r--r-- | woip/rb/article.rb | 30 | ||||
-rw-r--r-- | woip/rb/bzipreader.rb | 192 | ||||
-rw-r--r-- | woip/rb/common.rb | 3 | ||||
-rw-r--r-- | woip/rb/index.rb | 54 | ||||
-rw-r--r-- | woip/rb/livesearch.rb | 106 | ||||
-rw-r--r-- | woip/rb/server.rb | 284 | ||||
-rw-r--r-- | woip/rb/titles.rb | 17 | ||||
-rw-r--r-- | woip/rb/xapian-index.rb | 30 | ||||
-rw-r--r-- | woip/rb/xmlprocess.rb | 52 |
10 files changed, 960 insertions, 0 deletions
diff --git a/woip/rb/#bzipreader.rb# b/woip/rb/#bzipreader.rb# new file mode 100644 index 0000000..1ccacca --- /dev/null +++ b/woip/rb/#bzipreader.rb# @@ -0,0 +1,192 @@ +# rm -r ~/.ruby_inline; ARCHFLAGS="-arch i386" ruby -r rubygems -r bzipread.rb \ +# -e 'BzipReader.new.readBlock("../ga.wp.txt.bz2", 0)' + +require 'rubygems' +require 'tempfile' +require 'inline' +require File.join(File.dirname(__FILE__), 'common') + +BZ_MAX_BLOCK = 1024 * 900 + +class BzipReader + attr :offset + + def debug(str) + $stderr.puts str + end + + inline(:C) do |builder| + builder.add_compile_flags "-I../c -I. -lbz2" + builder.add_compile_flags "../c/bzipreader.c" + builder.add_compile_flags "../c/safe.c" + + builder.prefix ' + #include "bzipreader.h" + uint64_t readOffset; + ' + + ['VALUE __decompressBlock(char *src, int srcLen) { + char dest[BZ_MAX_BLOCK]; + uint32_t destLen = BZ_MAX_BLOCK; + int ret; + + debug("ruby decompressing %d bytes", srcLen); + if((ret = decompressBlock(src, srcLen, dest, &destLen)) != BZ_OK) + fatal("couldn\'t decompress: bz error %d", ret); + + return rb_str_new(dest, destLen); + }', + + 'VALUE __readBlock(char *file) { + FILE *in; + uint64_t realOffset; + VALUE str; + + if(strlen(file) == 0) { + in = xfopen("/dev/stdin", "rb"); + realOffset = 0; + } else { + in = xfopen(file, "rb"); + realOffset = readOffset; + } + + BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK); + readOffset = fixedOffset(readBlock(in, realOffset, bb)); + + xfclose(in); + + str = rb_str_new(bb->buff, bb->pos); + bbClose(bb); + return str; + }', + + 'void __setReadOffset(char *offset) { + readOffset = *((uint64_t *) offset); + }', + + 'VALUE __getReadOffset() { + return rb_str_new((char *) &readOffset, sizeof(uint64_t)); + }', + + 'int __computeBoundaries(char *file) { + int size; + FILE *in = xfopen(file, "rb"); + size = computeBoundaries(in); + xfclose(in); + return size; + }'].each {|c| builder.c c} + end + + def uint64_to_char(num) + hi = num >> 32 + lo = num & 0xffffffff + [lo, hi].pack('L2') + end + + def char_to_uint64(char) + lo, hi = char.unpack('L2') + return lo + (hi << 32) + end + + def initialize(file="") + if file.empty? + @useStdin = true + @buffered = "" + @eof = false + end + + @file = file + @offset = 0 + end + + def getReadOffset + char_to_uint64(__getReadOffset) + end + + def setReadOffset(num) + __setReadOffset(uint64_to_char(num)) + end + + def readNextBlock + # in bzipreader.c, readBlock reads more than it should -- to determine the end of a block, + # it reads the header for the subsequent block. If we're reading from a file, this is ok; + # we can seek back to where we want to be. But when reading from stdin, it's more awkward, + # and we have to buffer things + # TODO: use a named pipe + if @useStdin + if !@buffered or (@offset > 0 and @buffered.size < 40) + raise EOF + end + + begin + @buffered += $stdin.read(BZ_MAX_BLOCK) if @buffered.size < BZ_MAX_BLOCK and !@eof + rescue TypeError + @eof = true + end + + tempfile = Tempfile.new('bzipreader') + tempfile.write(@buffered) + tempfile.flush + + setReadOffset(0) + + block = __readBlock(tempfile.path) + + offset = getReadOffset - 80 + @offset += offset + bytes = (offset >> 3) + @buffered = @buffered[bytes..-1] + + block + else + readBlock(@offset) + end + end + + def readBlock(offset) + unless @useStdin + if File.size(@file) < (offset >> 3) + 80 + 40 # don't ask + raise EOF + end + end + setReadOffset(offset) + block = __readBlock(@file) + @offset = getReadOffset + block + end + + def decompressBlock(str) + __decompressBlock(str, str.size) + end + + def computeBoundaries + __computeBoundaries(@file) + end + + def self.test(skip=0) + b = BzipReader.new('../ga.wp.txt.bz2') + skip.times do b.readNextBlock end + block = b.readNextBlock + block = b.decompressBlock(block)[0..100] + puts block + end + + def self.stdin_test(skip = 0) + b = self.new + skip.times do b.readNextBlock end + blockNo = -1 + + while true + begin + offset = b.offset + block = b.readNextBlock + plaintext = b.decompressBlock(block) + $stderr.puts "#{blockNo += 1}\t#{offset}\t#{plaintext.gsub(/\n/, "\\n")[0..30]}" + $stdout.write plaintext + rescue EOF + $stderr.puts "EOF" + break + end + end + end +end diff --git a/woip/rb/article.rb b/woip/rb/article.rb new file mode 100644 index 0000000..eb44340 --- /dev/null +++ b/woip/rb/article.rb @@ -0,0 +1,30 @@ +# these are actually real ASCII codes +START_HEADING = 1.chr +START_TEXT = 2.chr +END_TEXT = 3.chr + +class Article + attr_accessor :title + attr_accessor :body + + def body + @body ||= '' + end + + def write(str) + str.puts START_HEADING + str.puts title + str.puts body.size + str.puts START_TEXT + str.puts body + str.puts END_TEXT + str.flush + end +end + +def stdin_gets + raise EOF if $stdin.closed? + line = $stdin.gets + raise EOF unless line + line +end diff --git a/woip/rb/bzipreader.rb b/woip/rb/bzipreader.rb new file mode 100644 index 0000000..1ccacca --- /dev/null +++ b/woip/rb/bzipreader.rb @@ -0,0 +1,192 @@ +# rm -r ~/.ruby_inline; ARCHFLAGS="-arch i386" ruby -r rubygems -r bzipread.rb \ +# -e 'BzipReader.new.readBlock("../ga.wp.txt.bz2", 0)' + +require 'rubygems' +require 'tempfile' +require 'inline' +require File.join(File.dirname(__FILE__), 'common') + +BZ_MAX_BLOCK = 1024 * 900 + +class BzipReader + attr :offset + + def debug(str) + $stderr.puts str + end + + inline(:C) do |builder| + builder.add_compile_flags "-I../c -I. -lbz2" + builder.add_compile_flags "../c/bzipreader.c" + builder.add_compile_flags "../c/safe.c" + + builder.prefix ' + #include "bzipreader.h" + uint64_t readOffset; + ' + + ['VALUE __decompressBlock(char *src, int srcLen) { + char dest[BZ_MAX_BLOCK]; + uint32_t destLen = BZ_MAX_BLOCK; + int ret; + + debug("ruby decompressing %d bytes", srcLen); + if((ret = decompressBlock(src, srcLen, dest, &destLen)) != BZ_OK) + fatal("couldn\'t decompress: bz error %d", ret); + + return rb_str_new(dest, destLen); + }', + + 'VALUE __readBlock(char *file) { + FILE *in; + uint64_t realOffset; + VALUE str; + + if(strlen(file) == 0) { + in = xfopen("/dev/stdin", "rb"); + realOffset = 0; + } else { + in = xfopen(file, "rb"); + realOffset = readOffset; + } + + BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK); + readOffset = fixedOffset(readBlock(in, realOffset, bb)); + + xfclose(in); + + str = rb_str_new(bb->buff, bb->pos); + bbClose(bb); + return str; + }', + + 'void __setReadOffset(char *offset) { + readOffset = *((uint64_t *) offset); + }', + + 'VALUE __getReadOffset() { + return rb_str_new((char *) &readOffset, sizeof(uint64_t)); + }', + + 'int __computeBoundaries(char *file) { + int size; + FILE *in = xfopen(file, "rb"); + size = computeBoundaries(in); + xfclose(in); + return size; + }'].each {|c| builder.c c} + end + + def uint64_to_char(num) + hi = num >> 32 + lo = num & 0xffffffff + [lo, hi].pack('L2') + end + + def char_to_uint64(char) + lo, hi = char.unpack('L2') + return lo + (hi << 32) + end + + def initialize(file="") + if file.empty? + @useStdin = true + @buffered = "" + @eof = false + end + + @file = file + @offset = 0 + end + + def getReadOffset + char_to_uint64(__getReadOffset) + end + + def setReadOffset(num) + __setReadOffset(uint64_to_char(num)) + end + + def readNextBlock + # in bzipreader.c, readBlock reads more than it should -- to determine the end of a block, + # it reads the header for the subsequent block. If we're reading from a file, this is ok; + # we can seek back to where we want to be. But when reading from stdin, it's more awkward, + # and we have to buffer things + # TODO: use a named pipe + if @useStdin + if !@buffered or (@offset > 0 and @buffered.size < 40) + raise EOF + end + + begin + @buffered += $stdin.read(BZ_MAX_BLOCK) if @buffered.size < BZ_MAX_BLOCK and !@eof + rescue TypeError + @eof = true + end + + tempfile = Tempfile.new('bzipreader') + tempfile.write(@buffered) + tempfile.flush + + setReadOffset(0) + + block = __readBlock(tempfile.path) + + offset = getReadOffset - 80 + @offset += offset + bytes = (offset >> 3) + @buffered = @buffered[bytes..-1] + + block + else + readBlock(@offset) + end + end + + def readBlock(offset) + unless @useStdin + if File.size(@file) < (offset >> 3) + 80 + 40 # don't ask + raise EOF + end + end + setReadOffset(offset) + block = __readBlock(@file) + @offset = getReadOffset + block + end + + def decompressBlock(str) + __decompressBlock(str, str.size) + end + + def computeBoundaries + __computeBoundaries(@file) + end + + def self.test(skip=0) + b = BzipReader.new('../ga.wp.txt.bz2') + skip.times do b.readNextBlock end + block = b.readNextBlock + block = b.decompressBlock(block)[0..100] + puts block + end + + def self.stdin_test(skip = 0) + b = self.new + skip.times do b.readNextBlock end + blockNo = -1 + + while true + begin + offset = b.offset + block = b.readNextBlock + plaintext = b.decompressBlock(block) + $stderr.puts "#{blockNo += 1}\t#{offset}\t#{plaintext.gsub(/\n/, "\\n")[0..30]}" + $stdout.write plaintext + rescue EOF + $stderr.puts "EOF" + break + end + end + end +end diff --git a/woip/rb/common.rb b/woip/rb/common.rb new file mode 100644 index 0000000..1c7426e --- /dev/null +++ b/woip/rb/common.rb @@ -0,0 +1,3 @@ +require 'rubygems' + +class EOF < RuntimeError; end
\ No newline at end of file diff --git a/woip/rb/index.rb b/woip/rb/index.rb new file mode 100644 index 0000000..dc376f9 --- /dev/null +++ b/woip/rb/index.rb @@ -0,0 +1,54 @@ +require File.join(File.dirname(__FILE__), 'bzipreader') +require File.join(File.dirname(__FILE__), 'article') + +$nblock = -1 +$startblock = 0 + +def get_line + if $lines.empty? + txt = $reader.decompressBlock($reader.readNextBlock) + $nblock += 1 + + $stderr.puts "#{$nblock}\t#{$reader.offset}\t#{txt.gsub(/\n/, "\\n")[0..30]}\t#{txt.size}" + + $lines = txt.split("\n").reverse + end + + $lines.pop +end + +def quote(str) + str.gsub(/'/, "''") +end + +def process_titles + while true + line = get_line + block = $nblock + if line.chomp == START_HEADING + article = get_line.chomp + puts "#{article} #{block}" if $startblock <= $nblock + end + end +rescue EOF + $stderr.puts "Done" +end + +def main + if ARGV[1] + $startblock = ARGV[1].to_i + end + + if ARGV.empty? + $reader = BzipReader.new + else + $reader = BzipReader.new(ARGV.first) + end + + $lines = [] + process_titles +end + +if __FILE__ == $0 + main +end diff --git a/woip/rb/livesearch.rb b/woip/rb/livesearch.rb new file mode 100644 index 0000000..f57ae04 --- /dev/null +++ b/woip/rb/livesearch.rb @@ -0,0 +1,106 @@ +require 'rubygems' +require 'curses' +require 'inline' + +class TernarySearcher + RESULTS = 10 + + inline(:C) do |builder| + builder.include '"ternary.h"' + builder.add_compile_flags "-I../c -I. -DDEBUG" + builder.add_compile_flags "../c/searcher.c" + + builder.prefix " + char resultbuf[#{RESULTS}][MAXLINE]; + int haveresults; + " + + builder.c "int handleResult(char *s) { + strncpy(resultbuf[haveresults++], s, MAXLINE); + if(haveresults == #{RESULTS}) return false; + else return true; + }" + + builder.c 'void __init(char *indexFile) { + load_root(indexFile); + }' + + builder.c 'void __prefixSearch(char *s, int n) { + haveresults = 0; + root_search(s, handleResult); + }' + end + + def initialize(index) + __init(index) + end + + def prefixSearch(str, n) + @results = [] + __prefixSearch(str, n) + end +end + +class Searcher + def initialize + @needle = "" + @xap = XapianSearcher.new(ARGV.first) + end + + def refresh + Curses.clear + Curses.setpos(0, 0) + Curses.addstr(@needle) + draw_matches + Curses.refresh + end + + def draw_matches + line = 1 + @matches.each do |match| + match.draw(line += 1) + end + end + + def search + @matches = @xap.matches(@needle).map do |match| + Match.new(match) + end + end + + def run + Curses.init_screen + Curses.noecho + Curses.stdscr.keypad(true) + + loop do + char = Curses.getch + + if char == 127 # Backspace + @needle = @needle[0..-2] unless @needle.empty? + else + @needle += char.chr + end + search + refresh + end + ensure + Curses.close_screen + end +end + +class Match + def initialize(str) + @string = str + end + + def draw(line) + Curses.setpos(line, 0) + Curses.addstr(@string) + end +end + +if $0 == __FILE__ + puts "running..." + Searcher.new.run +end
\ No newline at end of file diff --git a/woip/rb/server.rb b/woip/rb/server.rb new file mode 100644 index 0000000..0b2ea29 --- /dev/null +++ b/woip/rb/server.rb @@ -0,0 +1,284 @@ +require 'rubygems' +require 'mongrel' +require 'inline' + +READER = "WP, the free offline Wikipedia reader thing" +MAXRES = 40 +USAGE = %Q[ +<strong>Supported requests:</strong> +<ul> +<li><tt>/wiki/Foo</tt> - parsed Wikipedia article on Foo</li> +<li><tt>/raw/Foo</tt> - raw text of article</li> +<li><tt>/search?s=foo</tt> - article titles containing foo</li> +</ul> +] +SEARCH_BOX=%Q{<div width="100" style="float: right;"> + <form action="/search" method="get"> + <input type="text" name="s" accesskey="s" /></form></div>} + +class String + def titleize + self[0..0].upcase + self[1..-1] + end +end + +class Parser + # Parser.php is 5k lines, and has a billion dependencies, transclusion logic, + # magic-word support, langauge processing, templating code (a literally + # Turing-complete sublanguage), and stuff... so let's just use regexps + + def parse_infobox(str) + str.gsub(/^\|([^=]+)[:space:]*=[:space:]*(.*)$/) { "<b>#{$1}</b> #{$2}<br>" } + end + + def parse_template(str) + str.gsub(/\{\{(([^[:space:]]+) (\w+)$)?([^\}\{]+)\}\}/) { + if $2 == "Infobox" + %Q[ <div style="width: 200px; background-color: #eee; border: 1px solid #bbb; float: right"> + <h2>#{$3}</h2> + #{parse_infobox($4)} </div> ] + else + "" + end + } + end + + def parse(str) + # Lists; this barely works in even the simplest cases + str = str.gsub(/^\*(.+)$/) { "<span style=\"display: block\">* #{$1}</span>" } + + # Bold + str = str.gsub(/'''(.+?)'''/) { "<b>#{$1}</b>" } + + # Italic + str = str.gsub(/''(.+?)''/) { "<i>#{$1}</i>" } + + # Headings + str = str.gsub(/(={2,})([^=]+)\1/) {"<h#{$1.size}>#{$2}</h#{$1.size}>"} + + # Interwiki links + str = str.gsub(/\[\[([^\|\]]+)\|([^\]]+)\]\](\w*)/) { Article.link_to($1, $2 + $3) } + str = str.gsub(/\[\[(.*?)\]\](\w*)/) { Article.link_to($1, $1 + $2) } + + # Strip refs + str = str.gsub(/<ref>.*?<\/ref>/, '') + + # Templates, which may be nested, and which we need to parse from innermost first + while (new = parse_template(str)) != str + str = new + end + + # External links + str = str.gsub(/\[([^\][:space:]]+) ([^\]]+)\]/) {"<a href=\"#{$1}\">#{$2}</a>"} + str = str.gsub(/\[([^\][:space:]]+)\]/) {"<a href=\"#{$1}\">#{$1}</a>"} + end +end + +class Article + attr_accessor :text + attr_accessor :block + attr_accessor :title + + def parsed_text + Parser.new.parse(self.text) + end + + def as_html + %Q[<html><head><title>#{title} - #{READER}</title></head> + <body> + #{SEARCH_BOX} + <h1>#{title}</h1><h5>#{READER}</h5><p><small>#{text.size} bytes from block #{block} + (<a href="/raw/#{CGI::escape(title.titleize)}">raw</a>)</small></p> + #{parsed_text} + </body></html>] + end + + def self.link_to(name, text) + "<a href=\"/wiki/#{CGI::escape(name.titleize)}\">#{text}</a>" + end +end + +class SearchResult + attr_accessor :results + attr_accessor :needle + + def as_html + %Q[<html><head><title>Search: #{needle} - #{READER}</title></head> + <body> + #{SEARCH_BOX} + <h1>Search: #{needle}</h1> + <ul>#{results.map{|r| "<li>" + Article.link_to(r, r) + "</li>"}.join}</ul> + <p><small>Searches return up to #{MAXRES} articles containing the search string anywhere in their title. + Results are case-insensitive. Exact matches appear first, followed by prefix matches, and, lastly, substring matches. + Press ^S to quickly jump to search.</small></p> + </body> + </html>] + end +end + +class WPArticleReader + inline(:C) do |builder| + builder.add_compile_flags "-I../c -I. -lbz2 -DDEBUG" + builder.add_compile_flags "../c/bzipreader.c" + builder.add_compile_flags "../c/wp.c" + builder.add_compile_flags "../c/lsearcher.c" + builder.add_compile_flags "../c/safe.c" + builder.add_compile_flags "../c/blocks.c" + + builder.prefix %Q$ + #include "wp.h" + #define MAXRES #{MAXRES} + #define MAXSTR 1024 + + wp_dump d = {0}; + wp_article a = {0}; + + char results[MAXRES][MAXSTR]; + int nresults; + + bool __handle_result(char *s) { + strncpy(results[nresults], s, MAXSTR); + results[nresults][MAXSTR - 1] = \'\\0\'; + char *end = strrchr(results[nresults], \' \'); + + if(end) { + *(end - 1) = \'\\0\'; + nresults++; + } + + return nresults < MAXRES; + } + $ + + builder.c 'void __load_dump(char *dump, char *loc, char *ploc, char *blocks) { + load_dump(&d, dump, loc, ploc, blocks); + init_article(&a); + }' + + builder.c 'char *__load_article(char *name) { + a.block = 0; + a.text[0] = \'\0\'; + load_article(&d, name, &a); + return a.text; + }' + + builder.c 'int __article_block() { + return a.block; + }' + + builder.c 'int __article_size() { + return strlen(a.text); + }' + + + builder.c 'int __search(char *needle) { + nresults = 0; + search(&d.index, needle, __handle_result, NULL, true, true); + return nresults; + }' + + builder.c 'char *__result(int n) { + return results[n]; + }' + end + + def initialize(opts) + @locatedb = opts[:locatedb] + @prefixdb = opts[:prefixdb] + @blockdb = opts[:blockdb] + @dump = opts[:dump] + + __load_dump(@dump, @locatedb, @prefixdb, @blockdb) + end + + def fetch(name) + text = __load_article(name) + a = Article.new + a.text = text + a.block = __article_block + a.title = name + a + end + + def find(name) + n = __search(name) + r = SearchResult.new + r.needle = name + r.results = (0..n - 1).map {|n| __result(n)} + r + end +end + +class WPHandler < Mongrel::HttpHandler + def initialize(base) + $stderr.puts "Using base #{base}" + @reader = WPArticleReader.new(:locatedb => "#{base}.locate.db", + :prefixdb => "#{base}.locate.prefixdb", + :blockdb => "#{base}.blocks.db", + :dump => "#{base}.processed") + @parser = Parser.new + end + + def path(req) + CGI::unescape(req.params["REQUEST_URI"]) + end + + def notfound(resp, str) + respond(resp, true, 404) { "Couldn't find #{str}. <p>#{USAGE}</p>" } + end + + def respond(resp, html=true, status=200) + resp.start(200) do |h, o| + h["Content-type"] = (html ? 'text/html' : 'text/plain') + '; charset=utf-8' + o.write yield + end + end + + def process(req, resp) + if path(req) =~ /^\/(wiki|raw)\/(.+)$/ + article = @reader.fetch($2) + + if article.text.empty? + notfound(resp, $2) + else + if $1 == "wiki" + respond(resp) { article.as_html } + elsif $1 == "raw" + respond(resp, false) { article.text } + end + end + elsif path(req) =~ /^\/search\?s=(.+)$/ + respond(resp) { @reader.find($1).as_html } + else + notfound(resp, path(req)) + end + end +end + +class WPServer + def self.start_on(port) + self.new(:port => port).run + end + + def initialize(opts={}) + @port = opts[:port] || 9000 + @host = opts[:host] || '0.0.0.0' + end + + def run + $stderr.puts "Binding to #{@host}:#{@port}" + + conf = Mongrel::Configurator.new(:port => @port, + :host => @host) { + listener do + uri '/', :handler => WPHandler.new(ARGV.first), :in_front => true + end + + trap("INT") { stop } + + run + } + + conf.join + end +end diff --git a/woip/rb/titles.rb b/woip/rb/titles.rb new file mode 100644 index 0000000..ec03c22 --- /dev/null +++ b/woip/rb/titles.rb @@ -0,0 +1,17 @@ +require File.join(File.dirname(__FILE__), 'article') +require File.join(File.dirname(__FILE__), 'common') + +def process_titles + while true + line = stdin_gets + if line.chomp == START_HEADING + puts stdin_gets.chomp + end + end +rescue EOF + $stderr.puts "Done" +end + +if __FILE__ == $0 + process_titles +end
\ No newline at end of file diff --git a/woip/rb/xapian-index.rb b/woip/rb/xapian-index.rb new file mode 100644 index 0000000..2f06fa4 --- /dev/null +++ b/woip/rb/xapian-index.rb @@ -0,0 +1,30 @@ +require 'xapian' + +db = Xapian::WritableDatabase.new(ARGV[0], Xapian::DB_CREATE_OR_OPEN) +stem = Xapian::TermGenerator.new() +f = File.open(ARGV[1], 'r') +processed = 0 + +begin + while (line = f.readline) + begin + split = line.split("|") + next if split.first == "" + + doc = Xapian::Document.new + doc.data = line + doc.add_posting(split.first.downcase, 1) + db.add_document(doc) + processed += 1 + + if processed % 100 == 0 + $stderr.puts "#{processed}\t#{split.first}" + end + rescue + puts line + raise $! + end + end +rescue EOFError + $stderr.puts "Done" +end diff --git a/woip/rb/xmlprocess.rb b/woip/rb/xmlprocess.rb new file mode 100644 index 0000000..ebe06f9 --- /dev/null +++ b/woip/rb/xmlprocess.rb @@ -0,0 +1,52 @@ +require "rexml/document" +require "rexml/streamlistener" +require File.join(File.dirname(__FILE__), 'article') + +include REXML + +class ArticleListener + include StreamListener + + def initialize + @processed = 0 + @start = Time.now + end + + def text(text) + @cur_text = text.gsub(/^\[\[[^\]\[]+?\:[^\]\[]+?\]\]$/, '').gsub(/\n+/, "\n") + end + + def print_stats + rate = (((@processed.to_f / (Time.now - @start)) * 100).round) / 100.0 + $stderr.puts "Processed: #{@processed}\tRate: #{rate}/sec" + end + + def is_desirable(article) + not (article.title =~ /\:/ or article.title =~ /\//) + end + + def tag_start(name, attrs) + if name == 'page' + @cur_article.write($stdout) if (@cur_article) + @cur_article = Article.new + @processed += 1 + print_stats if (@processed % 100) == 0 + end + end + + def tag_end(name) + if name == 'title' + @cur_article.title = @cur_text + elsif name == 'text' + @cur_article.body = @cur_text + end + end +end + +def process + Document.parse_stream($stdin, ArticleListener.new) +end + +if __FILE__ == $0 + process +end |