diff options
Diffstat (limited to 'ruby/lib/ftsearch')
-rw-r--r-- | ruby/lib/ftsearch/analysis/analyzer.rb | 16 | ||||
-rw-r--r-- | ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb | 23 | ||||
-rw-r--r-- | ruby/lib/ftsearch/analysis/whitespace_analyzer.rb | 22 | ||||
-rw-r--r-- | ruby/lib/ftsearch/document_map_reader.rb | 106 | ||||
-rw-r--r-- | ruby/lib/ftsearch/document_map_writer.rb | 46 | ||||
-rw-r--r-- | ruby/lib/ftsearch/field_infos.rb | 46 | ||||
-rw-r--r-- | ruby/lib/ftsearch/fragment_writer.rb | 114 | ||||
-rw-r--r-- | ruby/lib/ftsearch/fulltext_reader.rb | 52 | ||||
-rw-r--r-- | ruby/lib/ftsearch/fulltext_writer.rb | 75 | ||||
-rw-r--r-- | ruby/lib/ftsearch/suffix_array_reader.rb | 277 | ||||
-rw-r--r-- | ruby/lib/ftsearch/suffix_array_writer.rb | 99 | ||||
-rw-r--r-- | ruby/lib/ftsearch/util.rb | 21 |
12 files changed, 897 insertions, 0 deletions
diff --git a/ruby/lib/ftsearch/analysis/analyzer.rb b/ruby/lib/ftsearch/analysis/analyzer.rb new file mode 100644 index 0000000..0e08760 --- /dev/null +++ b/ruby/lib/ftsearch/analysis/analyzer.rb @@ -0,0 +1,16 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +module FTSearch +module Analysis + +class Analyzer + def find_suffixes(text) + ret = [] + append_suffixes(ret, text, 0) + ret + end +end + +end # Analysis +end # FTSearch diff --git a/ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb b/ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb new file mode 100644 index 0000000..b939861 --- /dev/null +++ b/ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb @@ -0,0 +1,23 @@ + +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'strscan' +require 'ftsearch/analysis/analyzer' + +module FTSearch +module Analysis + +class SimpleIdentifierAnalyzer < Analyzer + def append_suffixes(array, text, offset) + sc = StringScanner.new(text) + sc.skip(/[^A-Za-z_]+/) + until sc.eos? + array << (sc.pos + offset) + break unless sc.skip(/[A-Za-z_][A-Za-z0-9_]*[^A-Za-z_]*/) + end + end +end + +end # Analyzer +end # FTSearch diff --git a/ruby/lib/ftsearch/analysis/whitespace_analyzer.rb b/ruby/lib/ftsearch/analysis/whitespace_analyzer.rb new file mode 100644 index 0000000..6cb5191 --- /dev/null +++ b/ruby/lib/ftsearch/analysis/whitespace_analyzer.rb @@ -0,0 +1,22 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'strscan' +require 'ftsearch/analysis/analyzer' + +module FTSearch +module Analysis + class WhiteSpaceAnalyzer < Analyzer + def append_suffixes(array, text, offset) + sc = StringScanner.new(text) + sc.skip(/(\s|\n)*/) + until sc.eos? + array << (sc.pos + offset) + break unless sc.skip(/\S+\s*/) + end + + array + end + end +end # Analyzer +end # FTSearch diff --git a/ruby/lib/ftsearch/document_map_reader.rb b/ruby/lib/ftsearch/document_map_reader.rb new file mode 100644 index 0000000..5d842c0 --- /dev/null +++ b/ruby/lib/ftsearch/document_map_reader.rb @@ -0,0 +1,106 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +module FTSearch + +class DocumentMapReader + DEFAULT_OPTIONS = { + :path => nil, + :io => nil, + } + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + unless options[:path] || options[:io] + raise ArgumentError, "Need either the path to the suffix array file or an IO." + end + if options[:path] + io = File.open(options[:path], "rb") + else + io = options[:io] + end + @uri_tbl, @field_arr = Marshal.load(io) + end + + def document_id(suffix_idx, offset) + idx = binary_search(@field_arr, offset, 0, @field_arr.size) + @field_arr[idx][1] + end + + def field_id(suffix_idx, offset) + idx = binary_search(@field_arr, offset, 0, @field_arr.size) + @field_arr[idx][2] + end + + def field_size(suffix_idx, offset) + idx = binary_search(@field_arr, offset, 0, @field_arr.size) + @field_arr[idx][3] + end + + def field_info(suffix_idx, offset) + idx = binary_search(@field_arr, offset, 0, @field_arr.size) + @field_arr[idx] + end + + def document_uri(suffix_idx, offset) + doc_id = document_id(suffix_idx, offset) + @uri_tbl[doc_id] + end + + def document_id_to_uri(doc_id) + @uri_tbl[doc_id] + end + + def offsets_to_field_infos(offsets) + offsets.map{|off| @field_arr[binary_search(@field_arr, off, 0, @field_arr.size)]} + end + + def rank_offsets(offsets, weights) + h = Hash.new{|h,k| h[k] = 0.0} + offsets_to_field_infos(offsets).each do |offset, doc_id, field_id, field_size| + h[doc_id] += weights[field_id] / field_size + end + sort_score_hash(h) + end + + def rank_offsets_probabilistic(offsets, weights, iterations) + h = Hash.new{|h,k| h[k] = 0.0} + infos = offsets_to_field_infos(offsets) + max = infos.size + while iterations > 0 + offset, doc_id, field_id, field_size = infos[rand(max)] + h[doc_id] += weights[field_id] / field_size + iterations -= 1 + end + sort_score_hash(h) + end + + def dump_data + [@uri_tbl, @field_arr] + end + + def num_documents + @uri_tbl.size + end + + private + def sort_score_hash(h) + h.sort_by{|_,score| -score} + end + + def binary_search(arr, offset, from, to) + middle = 0 + while to - from > 1 + middle = (from + to) / 2 + pivot, = arr[middle] + if offset < pivot + to = middle + else + from = middle + end + end + + from + end + +end +end # FTSearch diff --git a/ruby/lib/ftsearch/document_map_writer.rb b/ruby/lib/ftsearch/document_map_writer.rb new file mode 100644 index 0000000..7bf20bc --- /dev/null +++ b/ruby/lib/ftsearch/document_map_writer.rb @@ -0,0 +1,46 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'ftsearch/util' + +module FTSearch +class DocumentMapWriter + include InMemoryWriter + + DEFAULT_OPTIONS = { + :path => "docmap-#{Process.pid}-#{rand(100000)}", + } + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + @path = options[:path] + @field_arr = [] + @uri_tbl = [] + @data = [@uri_tbl, @field_arr] + initialize_in_memory_buffer + end + + def merge(doc_map_reader) + # TODO: general merge? + @uri_tbl, @field_arr = doc_map_reader.dump_data + @data = [@uri_tbl, @field_arr] + end + + def add_document(doc_id, uri) + @uri_tbl[doc_id] = uri + end + + def add_field(offset, doc_id, field_id, size) + @field_arr << [offset, doc_id, field_id, size] + end + + def finish! + if @path + File.open(@path, "wb") do |f| + Marshal.dump(@data, f) + end + else + Marshal.dump(@data, @memory_io) + end + end +end +end # FTSearch diff --git a/ruby/lib/ftsearch/field_infos.rb b/ruby/lib/ftsearch/field_infos.rb new file mode 100644 index 0000000..91749b2 --- /dev/null +++ b/ruby/lib/ftsearch/field_infos.rb @@ -0,0 +1,46 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'ftsearch/analysis/whitespace_analyzer' + +module FTSearch +class FieldInfos + DEFAULT_OPTIONS = { + :default_analyzer => FTSearch::Analysis::WhiteSpaceAnalyzer.new, + :stored => true, + } + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + @fields = {} + @default_options = options + end + + def add_field(options = {}) + options = @default_options.merge(options) + raise "Need a name" unless options[:name] + store_field_info(options) + end + + def [](field_name) + if field_info = @fields[field_name] + field_info + else + store_field_info(:name => field_name) + end + end + + private + def store_field_info(options) + options = @default_options.merge(options) + unless options[:analyzer] + if klass = options[:analyzer_class] + options[:analyzer] = klass.new + else + options[:analyzer] = @default_options[:default_analyzer] + end + end + @fields[options[:name]] = options + end +end +end # FTSearch + diff --git a/ruby/lib/ftsearch/fragment_writer.rb b/ruby/lib/ftsearch/fragment_writer.rb new file mode 100644 index 0000000..79b783f --- /dev/null +++ b/ruby/lib/ftsearch/fragment_writer.rb @@ -0,0 +1,114 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'fileutils' +require 'ftsearch/suffix_array_writer' +require 'ftsearch/document_map_writer' +require 'ftsearch/fulltext_writer' +require 'ftsearch/field_infos' + +require 'ftsearch/fulltext_reader' +require 'ftsearch/document_map_reader' +require 'ftsearch/suffix_array_reader' + +module FTSearch + +class FragmentWriter + DEFAULT_OPTIONS = { + :path => "ftsearch-#{Process.pid}-#{rand(100000)}", + :default_analyzer_class => FTSearch::Analysis::WhiteSpaceAnalyzer, + :field_infos_class => FieldInfos, + :fulltext_writer_class => FulltextWriter, + :suffix_array_writer_class => SuffixArrayWriter, + :doc_map_writer_class => DocumentMapWriter, + :field_infos => nil, + :fulltext_writer => nil, + :suffix_array_writer => nil, + :doc_map_writer_nil => nil, + } + + attr_reader :fulltext_writer, :suffix_array_writer, :doc_map_writer + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + create = lambda do |name, *args| + options[name] || options[(name.to_s + "_class").to_sym].new(*args) + end + build_path = lambda do |suffix| + if @path + File.join(@tmpdir, suffix) + else + nil + end + end + @path = options[:path] + if @path + @path = File.expand_path(@path) + @tmpdir = @path + "#{Process.pid}-#{rand(100000)}" + FileUtils.mkdir_p(@tmpdir) + end + + @fulltext_writer = create.call(:fulltext_writer, :path => build_path["fulltext"]) + @suffix_array_writer = create.call(:suffix_array_writer, :path => build_path["suffixes"]) + @doc_map_writer = create.call(:doc_map_writer, :path => build_path["docmap"]) + + default_analyzer = (klass = options[:default_analyzer_class]) ? klass.new : nil + @field_infos = create.call(:field_infos, :default_analyzer => default_analyzer) + @num_documents = 0 + @field_map = Hash.new{|h,k| h[k.to_sym] = h.size} + @field_map[:uri] # init + end + + def add_document(doc_hash) + uri = doc_hash[:uri] || @num_documents.to_s + @fulltext_writer.add_document(@num_documents, doc_hash.merge(:uri => uri), + @field_map, @field_infos, @suffix_array_writer, @doc_map_writer) + @num_documents += 1 + end + + def merge(fragment_directory) + raise "Cannot import old data unless the destination Fragment is empty." unless @num_documents == 0 + # TODO: use a FragmentReader to access old data + fulltext_reader = FulltextReader.new(:path => "#{fragment_directory}/fulltext") + suffix_array_reader = SuffixArrayReader.new(fulltext_reader, nil, + :path => "#{fragment_directory}/suffixes") + doc_map_reader = DocumentMapReader.new(:path => "#{fragment_directory}/docmap") + @fulltext_writer.merge(fulltext_reader) + @suffix_array_writer.merge(suffix_array_reader) + @doc_map_writer.merge(doc_map_reader) + #FIXME: .num_documents will be wrong if some URIs were repeated + @num_documents = doc_map_reader.num_documents + File.open(File.join(fragment_directory, "fieldmap"), "rb") do |f| + i = 0 + f.each_line{|l| @field_map[l.chomp.to_sym] = i; i+= 1} + end + end + + def fields + @field_map.sort_by{|field, fid| fid}.map{|field, fid| field} + end + + def documents + @num_documents + end + + def field_id(field) + @field_map.has_key?(field) && @field_map[field] + end + + def finish! + @fulltext_writer.finish! + fulltext = @fulltext_writer.data + @suffix_array_writer.finish!(fulltext) + @doc_map_writer.finish! + + if @path + File.open(File.join(@tmpdir, "fieldmap"), "wb") do |f| + @field_map.sort_by{|field_name, field_id| field_id}.each do |field_name, field_id| + f.puts field_name + end + File.rename(@tmpdir, @path) + end + end + end +end +end # FTSearch diff --git a/ruby/lib/ftsearch/fulltext_reader.rb b/ruby/lib/ftsearch/fulltext_reader.rb new file mode 100644 index 0000000..05da57b --- /dev/null +++ b/ruby/lib/ftsearch/fulltext_reader.rb @@ -0,0 +1,52 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +module FTSearch + +class FulltextReader + DEFAULT_OPTIONS = { + :path => nil, + :io => nil, + } + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + unless options[:path] || options[:io] + raise ArgumentError, "Need either the path to the suffix array file or an IO." + end + init_internal_structures(options) + end + + def get_data(offset, size) + @io.pos = offset + @io.read(size) + end + + def dump_data(&block) + blocksize = 32768 + @io.pos = 0 + begin + size = @io.stat.size - 1 + rescue NoMethodError # try with StringIO's interface + size = @io.string.size - 1 + end + read = 0 + #buffer = " " * blocksize + while read < size + #@io.read([size - read, blocksize].min, buffer) + #yield buffer + data = @io.read([size - read, blocksize].min) + read += data.size + yield data + end + end + + private + def init_internal_structures(options) + if options[:path] + @io = File.open(options[:path], "rb") + else + @io = options[:io] + end + end +end +end # FTSearch diff --git a/ruby/lib/ftsearch/fulltext_writer.rb b/ruby/lib/ftsearch/fulltext_writer.rb new file mode 100644 index 0000000..ad92c0b --- /dev/null +++ b/ruby/lib/ftsearch/fulltext_writer.rb @@ -0,0 +1,75 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'ftsearch/util' + +module FTSearch +class FulltextWriter + include InMemoryWriter + + DEFAULT_OPTIONS = { + :path => "fulltext-#{Process.pid}-#{rand(100000)}", + } + + attr_reader :path + + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + @path = options[:path] + initialize_in_memory_buffer + if @path + @io = File.open(@path, "wb") + else + @io = @memory_io + end + end + + def merge(fulltext_reader) + fulltext_reader.dump_data do |data| + @io.write data + end + end + + def add_document(doc_id, doc_hash, field_mapping, field_infos, suffix_array_writer, doc_map_writer) + write_document_header(doc_id, doc_hash, field_mapping, field_infos) + doc_map_writer.add_document(doc_id, doc_hash[:uri]) + doc_hash.each_pair do |field_name, data| + if field_id = field_mapping[field_name] + field_info = field_infos[field_name] + if field_info[:stored] + suffix_offset, segment_offset = store_field(doc_id, field_name, field_id, data) + if analyzer = field_info[:analyzer] + suffix_array_writer.add_suffixes(analyzer, data, suffix_offset) + end + doc_map_writer.add_field(segment_offset, doc_id, field_id, data.size) + end + end + end + end + + def finish! + @io.write "\0" + @io.fsync + @io.close + end + + private + def write_document_header(doc_id, doc_hash, field_mapping, field_infos) + stored_fields = doc_hash.select do |field_name, data| + field_infos[field_name][:stored] + end + total_size = stored_fields.inject(0){|s,(_,data)| s + data.size} + stored_fields.size * 9 + # 9 = field ids plus field size plus trailing \0 + @io.write [total_size].pack("V") + end + + def store_field(doc_id, field_name, field_id, data) + @io.write [field_id, data.size].pack("V2") + offset = @io.pos + @io.write data + @io.write "\0" + + [offset, offset - 8] + end +end +end # FTSearch diff --git a/ruby/lib/ftsearch/suffix_array_reader.rb b/ruby/lib/ftsearch/suffix_array_reader.rb new file mode 100644 index 0000000..830bf61 --- /dev/null +++ b/ruby/lib/ftsearch/suffix_array_reader.rb @@ -0,0 +1,277 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +module FTSearch + +class SuffixArrayReader + class Hit < Struct.new(:term, :suffix_number, :offset, :fulltext_reader) + def context(size) + strip_markers(self.fulltext_reader.get_data(offset - size, 2 * size), size) + end + + def text(size) + strip_markers(self.fulltext_reader.get_data(offset, size), 0) + end + + private + def strip_markers(str, size) + first = (str.rindex("\0", -size) || -1) + 1 + last = str.index("\0", size) || str.size + str[first...last] + end + end + + class LazyHits < Struct.new(:term, :suffix_array_reader, :fulltext_reader, + :from_index, :to_index) + include Enumerable + def each + sa_reader = self.suffix_array_reader + ft_reader = self.fulltext_reader + term = self.term + self.from_index.upto(self.to_index - 1) do |idx| + yield Hit.new(term, idx, sa_reader.suffix_index_to_offset(idx), + ft_reader) + end + end + + def [](i) + i += to_index - from_index if i < 0 + sa_reader = self.suffix_array_reader + if (idx = from_index + i) < to_index && idx >= from_index + Hit.new(self.term, idx, sa_reader.suffix_index_to_offset(idx), + self.fulltext_reader) + else + nil + end + end + + def size + to_index - from_index + end + end + + DEFAULT_OPTIONS = { + :path => nil, + :io => nil, + } + def initialize(fulltext_reader, doc_map, options = {}) + options = DEFAULT_OPTIONS.merge(options) + @fulltext_reader = fulltext_reader + @doc_map = doc_map + unless options[:path] || options[:io] + raise ArgumentError, "Need either the path to the suffix array file or an IO." + end + init_internal_structures(options) + end + + def count_hits(term) + from = binary_search(term, 0, @suffixes.size) + offset = @suffixes[from] + if @fulltext_reader.get_data(offset, term.size) == term + to = binary_search_upper(term, 0, @suffixes.size) + to - from + else + 0 + end + end + + def find_all(term) + from = binary_search(term, 0, @suffixes.size) + offset = @suffixes[from] + if @fulltext_reader.get_data(offset, term.size) == term + to = binary_search_upper(term, 0, @suffixes.size) + LazyHits.new(term, self, @fulltext_reader, from, to) + else + LazyHits.new(term, self, @fulltext_reader, 0, 0) + end + end + + def find_first(term) + suffix_index = binary_search(term, 0, @suffixes.size) + offset = @suffixes[suffix_index] + if @fulltext_reader.get_data(offset, term.size) == term + Hit.new(term, suffix_index, offset, @fulltext_reader) + else + nil + end + end + + def find_next(hit) + end + + def suffix_index_to_offset(suffix_index) + @suffixes[suffix_index] + end + + def lazyhits_to_offsets(lazyhits) + from = lazyhits.from_index + to = lazyhits.to_index + @io.pos = @base + 4 * from + @io.read((to - from) * 4).unpack("V*") + end + + def dump_data + @io.pos = @base + while data = @io.read(32768) + yield data.unpack("V*") + end + end + + private + def init_internal_structures(options) + if options[:path] + @io = File.open(options[:path], "rb") + else + @io = options[:io] + end + @total_suffixes, @block_size, @inline_suffix_size = @io.read(12).unpack("VVV") + @inline_suffixes = [] + if @block_size != 0 + 0.step(@total_suffixes, @block_size){ @inline_suffixes << @io.read(@inline_suffix_size)} + end + + # skip padding + if (mod = @io.pos & 0xf) != 0 + @io.read(16 - mod) + end + + @base = @io.pos + #@suffixes = io.read.unpack("V*") + @suffixes = Object.new + nsuffixes = @total_suffixes + io = @io + base = @base + class << @suffixes; self end.module_eval do + define_method(:[]) do |i| + io.pos = base + i * 4 + io.read(4).unpack("V")[0] + end + define_method(:size){ nsuffixes } + end + end + + def binary_search(term, from, to) + from, to = binary_search_inline_suffixes(term, from, to) + + tsize = term.size + while from < to + middle = (from + to) / 2 + pivot = @fulltext_reader.get_data(@suffixes[middle], tsize) + if term <= pivot + to = middle + else + from = middle + 1 + end + end + + from + end + + def binary_search_upper(term, from, to) + from, to = binary_search_inline_suffixes_upper(term, from, to) + + tsize = term.size + + #puts "#{from} -- #{to}" + #from.upto(to+5) do |idx| + # puts "#{idx} #{@fulltext_reader.get_data(@suffixes[idx], tsize + 10).inspect}" + #end + while from < to + middle = (from + to) / 2 + pivot = @fulltext_reader.get_data(@suffixes[middle], tsize) + if term < pivot + to = middle + else + from = middle + 1 + end + end + + #puts "RET: #{from}" + from + end + + + def binary_search_inline_suffixes(term, from, to) + return [from, to] if @block_size == 0 + + tsize = term.size + while to - from > @block_size + middle = (from + to) / 2 + #puts "from: #{from} to #{to} middle: #{middle}" if $DEBUG + quotient, mod = middle.divmod(@block_size) + middle = middle - mod + pivot = @inline_suffixes[quotient] + #puts "NOW: #{middle} pivot: #{pivot.inspect}" if $DEBUG + if tsize <= @inline_suffix_size + if term <= pivot + to = middle + else + from = middle + 1 + end + elsif term[0, @inline_suffix_size] < pivot + to = middle + else + # FIXME: handle pivot[-1] = 255? + pivot = pivot.clone + #pivot[-1] += 1 + pivot.next! + #puts "TESTING AGAINST new pivot: #{pivot.inspect}" if $DEBUG + if term > pivot + from = middle + 1 + else # term[0, @inline_suffix_size] == pivot, disambiguate + pivot = @fulltext_reader.get_data(@suffixes[middle], term.size) + if term <= pivot + to = middle + else + from = middle + 1 + end + end + end + end + + [from, to] + end + + def binary_search_inline_suffixes_upper(term, from, to) + return [from, to] if @block_size == 0 + + tsize = term.size + while to - from > @block_size + middle = (from + to) / 2 + #puts "from: #{from} to #{to} middle: #{middle}" if $DEBUG + quotient, mod = middle.divmod(@block_size) + middle = middle - mod + pivot = @inline_suffixes[quotient] + #puts "NOW: #{middle} pivot: #{pivot.inspect}" if $DEBUG + if tsize <= @inline_suffix_size + if term < pivot[0, tsize] + to = middle + else + from = middle + 1 + end + elsif term[0, @inline_suffix_size] < pivot + to = middle + else + # FIXME: handle pivot[-1] = 255? + pivot = pivot.clone + #pivot[-1] += 1 + pivot.next! + #puts "TESTING AGAINST new pivot: #{pivot.inspect}" if $DEBUG + if term > pivot + from = middle + 1 + else # term[0, @inline_suffix_size] == pivot, disambiguate + pivot = @fulltext_reader.get_data(@suffixes[middle], term.size) + if term < pivot + to = middle + else + from = middle + 1 + end + end + end + end + + [from, to] + end +end + +end # FTSearch diff --git a/ruby/lib/ftsearch/suffix_array_writer.rb b/ruby/lib/ftsearch/suffix_array_writer.rb new file mode 100644 index 0000000..694dd3b --- /dev/null +++ b/ruby/lib/ftsearch/suffix_array_writer.rb @@ -0,0 +1,99 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'enumerator' +require 'ftsearch/util' + +module FTSearch + +class SuffixArrayWriter + include InMemoryWriter + + DEFAULT_OPTIONS = { + :path => "suffixes-#{Process.pid}-#{rand(100000)}", + :block_size => 32, + :inline_suffix_size => 8, + :default_analyzer => nil, + } + def initialize(options = {}) + options = DEFAULT_OPTIONS.merge(options) + @path = options[:path] + @suffixes = [] + @block_size = options[:block_size] + @inline_suffix_size = options[:inline_suffix_size] + @finished = false + initialize_in_memory_buffer + end + + def merge(suffix_array_reader) + unless @suffixes.empty? + raise "Cannot merge if the destination SuffixArrayWriter isn't empty!" + end + suffix_array_reader.dump_data do |partial_sarray| + @suffixes.concat partial_sarray + end + end + + def add_suffixes(analyzer, data, offset) + #analyzer.new.find_suffixes(data).each{|x| @suffixes << offset + x} + analyzer.append_suffixes(@suffixes, data, offset) + end + + def finish!(fulltext) + return if @finished + if $DEBUG + puts "Suffixes: #{@suffixes.size}" + t0 = Time.new + end + sort!(fulltext) + if $DEBUG + puts "Sorted in #{Time.new - t0}" + end + if $DEBUG + puts "Dumping suffixes" + t0 = Time.new + end + dump_suffixes(fulltext) + if $DEBUG + puts "Dumped in #{Time.new - t0}" + end + @finished = true + end + + private + def dump_suffixes(fulltext) + io = @path ? File.open(@path, "wb") : @memory_io + io.write([@suffixes.size, @block_size || 0, @inline_suffix_size].pack("VVV")) + if @block_size + dump_inline_suffixes(io, fulltext) + end + add_padding(io) + dump_suffix_array(io) + ensure + io.close if @path + end + + def dump_inline_suffixes(io, fulltext) + 0.step(@suffixes.size, @block_size) do |suffix_idx| + io.write([fulltext[@suffixes[suffix_idx], @inline_suffix_size]].pack("a#{@inline_suffix_size}")) + end + end + + def dump_suffix_array(io) + @suffixes.each_slice(1024){|suffixes| io.write(suffixes.pack("V*")) } + end + + def add_padding(io) + # padding to 16-byte + if (mod = io.pos & 0xf) != 0 + io.write("\0" * (16 - mod)) + end + end + + def sort!(fulltext) + tsize = fulltext.size + @suffixes = @suffixes.sort_by{|offset| fulltext[offset, tsize - offset]} + end +end + +end # FTSearch diff --git a/ruby/lib/ftsearch/util.rb b/ruby/lib/ftsearch/util.rb new file mode 100644 index 0000000..2944a7b --- /dev/null +++ b/ruby/lib/ftsearch/util.rb @@ -0,0 +1,21 @@ +# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org> +# + +require 'stringio' +module FTSearch + +module InMemoryWriter + def initialize_in_memory_buffer + @memory_io = StringIO.new("") + end + + def data + if @path + File.open(@path, "rb"){|f| f.read} rescue nil + else + @memory_io.string.clone + end + end +end + +end # FTSearch |