# path: root/woip/rb/xmlprocess.rb
# blob: 8638c2e3ca828ec5d9c9f8226138d671c97c2b0a (plain)
require "rexml/document"
require "rexml/streamlistener"
require File.join(File.dirname(__FILE__), 'article')

include REXML

# Streaming REXML listener that builds one Article per <page> element and
# writes it to $stdout as soon as the page's closing tag is seen.
#
# Only the character data of <title> and <text> elements is captured; all
# other elements are ignored. A progress line is printed to $stderr every
# 100 pages. (Presumably the input is a MediaWiki XML dump -- confirm against
# the caller.)
class ArticleListener
  # Fully qualified so the class does not rely on a top-level `include REXML`.
  include REXML::StreamListener

  def initialize
    @processed = 0
    @start = Time.now
    @cur_article = nil
    @cur_text = ''
  end

  # Parser callback for character data. Remembers the most recent text with
  # every run of consecutive newlines collapsed to a single newline.
  def text(text)
    @cur_text = text.gsub(/\n+/, "\n")
  end

  # Prints the number of pages seen so far and the average pages/second
  # (rounded to two decimals) to $stderr.
  def print_stats
    elapsed = Time.now - @start
    # Guard against a zero interval: Infinity/NaN cannot be rounded.
    rate = elapsed > 0 ? ((@processed.to_f / elapsed) * 100).round / 100.0 : 0.0
    $stderr.puts "Processed: #{@processed}\tRate: #{rate}/sec"
  end

  # Returns true when the article's title contains neither ':' nor '/'.
  # Not called anywhere in this class.
  def is_desirable(article)
    !(article.title =~ /[:\/]/)
  end

  # Parser callback for an opening tag. A <page> tag starts a new Article and
  # bumps the progress counter; +attrs+ is unused.
  def tag_start(name, attrs)
    # Forget text seen before this tag so an empty element (<text/>) cannot
    # inherit character data from an earlier sibling.
    @cur_text = ''
    return unless name == 'page'

    # Defensive: emit a page that was never closed before starting the next.
    flush_article
    @cur_article = Article.new
    @processed += 1
    print_stats if (@processed % 100) == 0
  end

  # Parser callback for a closing tag. Stores the captured text on the current
  # article for <title>/<text>, and writes the article out on </page> so the
  # final page of the input is emitted too.
  def tag_end(name)
    # Ignore <title>/<text> that occur outside any <page>.
    return unless @cur_article

    case name
    when 'title' then @cur_article.title = @cur_text
    when 'text'  then @cur_article.body = @cur_text
    when 'page'  then flush_article
    end
  end

  private

  # Writes the article under construction (if any) to $stdout and clears it
  # so it can never be written twice.
  def flush_article
    return unless @cur_article

    @cur_article.write($stdout)
    @cur_article = nil
  end
end

# Stream-parses the XML arriving on standard input, handing every parser
# event to a freshly created ArticleListener.
def process
  listener = ArticleListener.new
  REXML::Document.parse_stream($stdin, listener)
end
    
# Run the conversion only when this file is executed directly, not when it is
# loaded by another script.
process if __FILE__ == $PROGRAM_NAME