Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/woip/rb/xmlprocess.rb
diff options
context:
space:
mode:
Diffstat (limited to 'woip/rb/xmlprocess.rb')
-rw-r--r--woip/rb/xmlprocess.rb52
1 files changed, 52 insertions, 0 deletions
diff --git a/woip/rb/xmlprocess.rb b/woip/rb/xmlprocess.rb
new file mode 100644
index 0000000..ebe06f9
--- /dev/null
+++ b/woip/rb/xmlprocess.rb
@@ -0,0 +1,52 @@
+require "rexml/document"
+require "rexml/streamlistener"
+require File.join(File.dirname(__FILE__), 'article')
+
+include REXML
+
+class ArticleListener
+ include StreamListener
+
+ def initialize
+ @processed = 0
+ @start = Time.now
+ end
+
+ def text(text)
+ @cur_text = text.gsub(/^\[\[[^\]\[]+?\:[^\]\[]+?\]\]$/, '').gsub(/\n+/, "\n")
+ end
+
+ def print_stats
+ rate = (((@processed.to_f / (Time.now - @start)) * 100).round) / 100.0
+ $stderr.puts "Processed: #{@processed}\tRate: #{rate}/sec"
+ end
+
+ def is_desirable(article)
+ not (article.title =~ /\:/ or article.title =~ /\//)
+ end
+
+ def tag_start(name, attrs)
+ if name == 'page'
+ @cur_article.write($stdout) if (@cur_article)
+ @cur_article = Article.new
+ @processed += 1
+ print_stats if (@processed % 100) == 0
+ end
+ end
+
+ def tag_end(name)
+ if name == 'title'
+ @cur_article.title = @cur_text
+ elsif name == 'text'
+ @cur_article.body = @cur_text
+ end
+ end
+end
+
+def process
+ Document.parse_stream($stdin, ArticleListener.new)
+end
+
+if __FILE__ == $0
+ process
+end