diff options
Diffstat (limited to 'woip/rb/bzipreader.rb')
-rw-r--r-- | woip/rb/bzipreader.rb | 192 |
1 files changed, 192 insertions, 0 deletions
# Source: woip/rb/bzipreader.rb
#
# Example invocation:
#   rm -r ~/.ruby_inline; ARCHFLAGS="-arch i386" ruby -r rubygems -r bzipreader.rb \
#     -e 'BzipReader.new("../ga.wp.txt.bz2").readBlock(0)'

require 'rubygems'
require 'tempfile'
require 'inline'
require File.join(File.dirname(__FILE__), 'common')

# Size in bytes of the buffers used for one bzip2 block (900 KiB).
BZ_MAX_BLOCK = 1024 * 900

# Reads individual bzip2 blocks from a file (or from stdin when no file name
# is given) by delegating to C helpers compiled through RubyInline.
#
# Offsets handled by this class are bit offsets: they are shifted right by 3
# wherever a byte position is needed.
class BzipReader
  # Bit offset at which the next call to readNextBlock starts reading.
  attr_reader :offset

  # Writes +str+ to standard error.
  def debug(str)
    $stderr.puts str
  end

  inline(:C) do |builder|
    builder.add_compile_flags "-I../c -I. -lbz2"
    builder.add_compile_flags "../c/bzipreader.c"
    builder.add_compile_flags "../c/safe.c"

    # readOffset is the C-side copy of the current read offset; it is set and
    # fetched from Ruby through __setReadOffset / __getReadOffset.
    builder.prefix '
      #include "bzipreader.h"
      uint64_t readOffset;
    '

    ['VALUE __decompressBlock(char *src, int srcLen) {
        char dest[BZ_MAX_BLOCK];
        uint32_t destLen = BZ_MAX_BLOCK;
        int ret;

        debug("ruby decompressing %d bytes", srcLen);
        if((ret = decompressBlock(src, srcLen, dest, &destLen)) != BZ_OK)
          fatal("couldn\'t decompress: bz error %d", ret);

        return rb_str_new(dest, destLen);
      }',

     'VALUE __readBlock(char *file) {
        FILE *in;
        uint64_t realOffset;
        VALUE str;

        if(strlen(file) == 0) {
          in = xfopen("/dev/stdin", "rb");
          realOffset = 0;
        } else {
          in = xfopen(file, "rb");
          realOffset = readOffset;
        }

        BitBuffer *bb = bbOfSize(BZ_MAX_BLOCK);
        readOffset = fixedOffset(readBlock(in, realOffset, bb));

        xfclose(in);

        str = rb_str_new(bb->buff, bb->pos);
        bbClose(bb);
        return str;
      }',

     'void __setReadOffset(char *offset) {
        readOffset = *((uint64_t *) offset);
      }',

     'VALUE __getReadOffset() {
        return rb_str_new((char *) &readOffset, sizeof(uint64_t));
      }',

     'int __computeBoundaries(char *file) {
        int size;
        FILE *in = xfopen(file, "rb");
        size = computeBoundaries(in);
        xfclose(in);
        return size;
      }'].each { |c| builder.c c }
  end

  # Serialises +num+ as the 8 raw bytes of a native-endian uint64_t, which is
  # the representation __setReadOffset dereferences on the C side.
  #
  # 'Q' is used instead of two 32-bit words so the byte order is also right on
  # big-endian hosts; the mask keeps the previous truncate-to-64-bits behaviour.
  def uint64_to_char(num)
    [num & 0xffff_ffff_ffff_ffff].pack('Q')
  end

  # Inverse of uint64_to_char: decodes the 8 raw bytes returned by
  # __getReadOffset into an Integer.
  def char_to_uint64(char)
    char.unpack('Q').first
  end

  # file:: path of the .bz2 file to read; an empty string (the default)
  #        selects stdin mode, in which input is buffered in memory.
  def initialize(file = "")
    @useStdin = file.empty?
    if @useStdin
      @buffered = ""   # compressed bytes read from stdin but not yet consumed
      @eof = false     # set once $stdin.read has returned nil
    end

    @file = file
    @offset = 0
  end

  # Returns the C-side read offset as an Integer.
  def getReadOffset
    char_to_uint64(__getReadOffset)
  end

  # Sets the C-side read offset to +num+.
  def setReadOffset(num)
    __setReadOffset(uint64_to_char(num))
  end

  # Returns the next compressed block and advances #offset past it.
  # Raises EOF when no further block is available.
  def readNextBlock
    return readBlock(@offset) unless @useStdin

    # in bzipreader.c, readBlock reads more than it should -- to determine the end of a block,
    # it reads the header for the subsequent block. If we're reading from a file, this is ok;
    # we can seek back to where we want to be. But when reading from stdin, it's more awkward,
    # and we have to buffer things
    # TODO: use a named pipe
    #
    # @buffered becomes nil when the slice below starts past the end of the
    # buffer. NOTE(review): the 40-byte minimum is an unexplained threshold
    # inherited from the original code.
    raise EOF if !@buffered || (@offset > 0 && @buffered.size < 40)

    fill_buffer_from_stdin
    block = read_block_from_buffer

    # NOTE(review): the 80 bits subtracted here are presumably the header of
    # the following block that readBlock over-read -- confirm in bzipreader.c.
    consumed_bits = getReadOffset - 80
    @offset += consumed_bits
    consumed_bytes = consumed_bits >> 3
    @buffered = @buffered[consumed_bytes..-1]

    block
  end

  # Reads the compressed block starting at bit +offset+ of the file, stores
  # the offset reported back by the C reader in #offset and returns the block.
  # In file mode, raises EOF when the file is too short to hold another block.
  def readBlock(offset)
    unless @useStdin
      if File.size(@file) < (offset >> 3) + 80 + 40 # don't ask
        raise EOF
      end
    end
    setReadOffset(offset)
    block = __readBlock(@file)
    @offset = getReadOffset
    block
  end

  # Decompresses one compressed block (as returned by readBlock or
  # readNextBlock) and returns the plaintext.
  def decompressBlock(str)
    __decompressBlock(str, str.size)
  end

  # Runs the C computeBoundaries routine over the file and returns its result.
  def computeBoundaries
    __computeBoundaries(@file)
  end

  # Manual smoke test: skips +skip+ blocks of ../ga.wp.txt.bz2, then prints
  # the first 101 characters of the next decompressed block.
  def self.test(skip = 0)
    b = BzipReader.new('../ga.wp.txt.bz2')
    skip.times { b.readNextBlock }
    block = b.readNextBlock
    block = b.decompressBlock(block)[0..100]
    puts block
  end

  # Manual smoke test for stdin mode: decompresses every block from stdin to
  # stdout, logging block number, bit offset and a short preview to stderr.
  def self.stdin_test(skip = 0)
    b = new
    skip.times { b.readNextBlock }
    blockNo = -1

    loop do
      begin
        offset = b.offset
        block = b.readNextBlock
        plaintext = b.decompressBlock(block)
        $stderr.puts "#{blockNo += 1}\t#{offset}\t#{plaintext.gsub(/\n/, "\\n")[0..30]}"
        $stdout.write plaintext
      rescue EOF
        $stderr.puts "EOF"
        break
      end
    end
  end

  private

  # Tops the stdin buffer up with at most BZ_MAX_BLOCK more bytes, unless it
  # already holds that much or stdin is exhausted. IO#read(n) returns nil at
  # end of input, which is recorded in @eof.
  def fill_buffer_from_stdin
    return if @eof || @buffered.size >= BZ_MAX_BLOCK

    chunk = $stdin.read(BZ_MAX_BLOCK)
    if chunk
      @buffered += chunk
    else
      @eof = true
    end
  end

  # Spills the buffer to a temporary file so the C reader can open it by
  # path, reads one block from bit offset 0 of that file, and always closes
  # and removes the temporary file afterwards.
  def read_block_from_buffer
    tempfile = Tempfile.new('bzipreader')
    begin
      tempfile.binmode
      tempfile.write(@buffered)
      tempfile.flush

      setReadOffset(0)
      __readBlock(tempfile.path)
    ensure
      tempfile.close!
    end
  end
end