diff options
Diffstat (limited to 'woip/rb/xmlprocess.rb')
-rw-r--r-- | woip/rb/xmlprocess.rb | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/woip/rb/xmlprocess.rb b/woip/rb/xmlprocess.rb new file mode 100644 index 0000000..ebe06f9 --- /dev/null +++ b/woip/rb/xmlprocess.rb @@ -0,0 +1,52 @@ +require "rexml/document" +require "rexml/streamlistener" +require File.join(File.dirname(__FILE__), 'article') + +include REXML + +class ArticleListener + include StreamListener + + def initialize + @processed = 0 + @start = Time.now + end + + def text(text) + @cur_text = text.gsub(/^\[\[[^\]\[]+?\:[^\]\[]+?\]\]$/, '').gsub(/\n+/, "\n") + end + + def print_stats + rate = (((@processed.to_f / (Time.now - @start)) * 100).round) / 100.0 + $stderr.puts "Processed: #{@processed}\tRate: #{rate}/sec" + end + + def is_desirable(article) + not (article.title =~ /\:/ or article.title =~ /\//) + end + + def tag_start(name, attrs) + if name == 'page' + @cur_article.write($stdout) if (@cur_article) + @cur_article = Article.new + @processed += 1 + print_stats if (@processed % 100) == 0 + end + end + + def tag_end(name) + if name == 'title' + @cur_article.title = @cur_text + elsif name == 'text' + @cur_article.body = @cur_text + end + end +end + +def process + Document.parse_stream($stdin, ArticleListener.new) +end + +if __FILE__ == $0 + process +end |