Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/ruby/lib/ftsearch
diff options
context:
space:
mode:
Diffstat (limited to 'ruby/lib/ftsearch')
-rw-r--r--ruby/lib/ftsearch/analysis/analyzer.rb16
-rw-r--r--ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb23
-rw-r--r--ruby/lib/ftsearch/analysis/whitespace_analyzer.rb22
-rw-r--r--ruby/lib/ftsearch/document_map_reader.rb106
-rw-r--r--ruby/lib/ftsearch/document_map_writer.rb46
-rw-r--r--ruby/lib/ftsearch/field_infos.rb46
-rw-r--r--ruby/lib/ftsearch/fragment_writer.rb114
-rw-r--r--ruby/lib/ftsearch/fulltext_reader.rb52
-rw-r--r--ruby/lib/ftsearch/fulltext_writer.rb75
-rw-r--r--ruby/lib/ftsearch/suffix_array_reader.rb277
-rw-r--r--ruby/lib/ftsearch/suffix_array_writer.rb99
-rw-r--r--ruby/lib/ftsearch/util.rb21
12 files changed, 897 insertions, 0 deletions
diff --git a/ruby/lib/ftsearch/analysis/analyzer.rb b/ruby/lib/ftsearch/analysis/analyzer.rb
new file mode 100644
index 0000000..0e08760
--- /dev/null
+++ b/ruby/lib/ftsearch/analysis/analyzer.rb
@@ -0,0 +1,16 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+module FTSearch
+module Analysis
+
+class Analyzer
+ def find_suffixes(text)
+ ret = []
+ append_suffixes(ret, text, 0)
+ ret
+ end
+end
+
+end # Analysis
+end # FTSearch
diff --git a/ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb b/ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb
new file mode 100644
index 0000000..b939861
--- /dev/null
+++ b/ruby/lib/ftsearch/analysis/simple_identifier_analyzer.rb
@@ -0,0 +1,23 @@
+
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'strscan'
+require 'ftsearch/analysis/analyzer'
+
+module FTSearch
+module Analysis
+
+class SimpleIdentifierAnalyzer < Analyzer
+ def append_suffixes(array, text, offset)
+ sc = StringScanner.new(text)
+ sc.skip(/[^A-Za-z_]+/)
+ until sc.eos?
+ array << (sc.pos + offset)
+ break unless sc.skip(/[A-Za-z_][A-Za-z0-9_]*[^A-Za-z_]*/)
+ end
+ end
+end
+
+end # Analyzer
+end # FTSearch
diff --git a/ruby/lib/ftsearch/analysis/whitespace_analyzer.rb b/ruby/lib/ftsearch/analysis/whitespace_analyzer.rb
new file mode 100644
index 0000000..6cb5191
--- /dev/null
+++ b/ruby/lib/ftsearch/analysis/whitespace_analyzer.rb
@@ -0,0 +1,22 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'strscan'
+require 'ftsearch/analysis/analyzer'
+
+module FTSearch
+module Analysis
+ class WhiteSpaceAnalyzer < Analyzer
+ def append_suffixes(array, text, offset)
+ sc = StringScanner.new(text)
+ sc.skip(/(\s|\n)*/)
+ until sc.eos?
+ array << (sc.pos + offset)
+ break unless sc.skip(/\S+\s*/)
+ end
+
+ array
+ end
+ end
+end # Analyzer
+end # FTSearch
diff --git a/ruby/lib/ftsearch/document_map_reader.rb b/ruby/lib/ftsearch/document_map_reader.rb
new file mode 100644
index 0000000..5d842c0
--- /dev/null
+++ b/ruby/lib/ftsearch/document_map_reader.rb
@@ -0,0 +1,106 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+module FTSearch
+
+class DocumentMapReader
+ DEFAULT_OPTIONS = {
+ :path => nil,
+ :io => nil,
+ }
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ unless options[:path] || options[:io]
+ raise ArgumentError, "Need either the path to the suffix array file or an IO."
+ end
+ if options[:path]
+ io = File.open(options[:path], "rb")
+ else
+ io = options[:io]
+ end
+ @uri_tbl, @field_arr = Marshal.load(io)
+ end
+
+ def document_id(suffix_idx, offset)
+ idx = binary_search(@field_arr, offset, 0, @field_arr.size)
+ @field_arr[idx][1]
+ end
+
+ def field_id(suffix_idx, offset)
+ idx = binary_search(@field_arr, offset, 0, @field_arr.size)
+ @field_arr[idx][2]
+ end
+
+ def field_size(suffix_idx, offset)
+ idx = binary_search(@field_arr, offset, 0, @field_arr.size)
+ @field_arr[idx][3]
+ end
+
+ def field_info(suffix_idx, offset)
+ idx = binary_search(@field_arr, offset, 0, @field_arr.size)
+ @field_arr[idx]
+ end
+
+ def document_uri(suffix_idx, offset)
+ doc_id = document_id(suffix_idx, offset)
+ @uri_tbl[doc_id]
+ end
+
+ def document_id_to_uri(doc_id)
+ @uri_tbl[doc_id]
+ end
+
+ def offsets_to_field_infos(offsets)
+ offsets.map{|off| @field_arr[binary_search(@field_arr, off, 0, @field_arr.size)]}
+ end
+
+ def rank_offsets(offsets, weights)
+ h = Hash.new{|h,k| h[k] = 0.0}
+ offsets_to_field_infos(offsets).each do |offset, doc_id, field_id, field_size|
+ h[doc_id] += weights[field_id] / field_size
+ end
+ sort_score_hash(h)
+ end
+
+ def rank_offsets_probabilistic(offsets, weights, iterations)
+ h = Hash.new{|h,k| h[k] = 0.0}
+ infos = offsets_to_field_infos(offsets)
+ max = infos.size
+ while iterations > 0
+ offset, doc_id, field_id, field_size = infos[rand(max)]
+ h[doc_id] += weights[field_id] / field_size
+ iterations -= 1
+ end
+ sort_score_hash(h)
+ end
+
+ def dump_data
+ [@uri_tbl, @field_arr]
+ end
+
+ def num_documents
+ @uri_tbl.size
+ end
+
+ private
+ def sort_score_hash(h)
+ h.sort_by{|_,score| -score}
+ end
+
+ def binary_search(arr, offset, from, to)
+ middle = 0
+ while to - from > 1
+ middle = (from + to) / 2
+ pivot, = arr[middle]
+ if offset < pivot
+ to = middle
+ else
+ from = middle
+ end
+ end
+
+ from
+ end
+
+end
+end # FTSearch
diff --git a/ruby/lib/ftsearch/document_map_writer.rb b/ruby/lib/ftsearch/document_map_writer.rb
new file mode 100644
index 0000000..7bf20bc
--- /dev/null
+++ b/ruby/lib/ftsearch/document_map_writer.rb
@@ -0,0 +1,46 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'ftsearch/util'
+
+module FTSearch
+class DocumentMapWriter
+ include InMemoryWriter
+
+ DEFAULT_OPTIONS = {
+ :path => "docmap-#{Process.pid}-#{rand(100000)}",
+ }
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ @path = options[:path]
+ @field_arr = []
+ @uri_tbl = []
+ @data = [@uri_tbl, @field_arr]
+ initialize_in_memory_buffer
+ end
+
+ def merge(doc_map_reader)
+ # TODO: general merge?
+ @uri_tbl, @field_arr = doc_map_reader.dump_data
+ @data = [@uri_tbl, @field_arr]
+ end
+
+ def add_document(doc_id, uri)
+ @uri_tbl[doc_id] = uri
+ end
+
+ def add_field(offset, doc_id, field_id, size)
+ @field_arr << [offset, doc_id, field_id, size]
+ end
+
+ def finish!
+ if @path
+ File.open(@path, "wb") do |f|
+ Marshal.dump(@data, f)
+ end
+ else
+ Marshal.dump(@data, @memory_io)
+ end
+ end
+end
+end # FTSearch
diff --git a/ruby/lib/ftsearch/field_infos.rb b/ruby/lib/ftsearch/field_infos.rb
new file mode 100644
index 0000000..91749b2
--- /dev/null
+++ b/ruby/lib/ftsearch/field_infos.rb
@@ -0,0 +1,46 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'ftsearch/analysis/whitespace_analyzer'
+
+module FTSearch
+class FieldInfos
+ DEFAULT_OPTIONS = {
+ :default_analyzer => FTSearch::Analysis::WhiteSpaceAnalyzer.new,
+ :stored => true,
+ }
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ @fields = {}
+ @default_options = options
+ end
+
+ def add_field(options = {})
+ options = @default_options.merge(options)
+ raise "Need a name" unless options[:name]
+ store_field_info(options)
+ end
+
+ def [](field_name)
+ if field_info = @fields[field_name]
+ field_info
+ else
+ store_field_info(:name => field_name)
+ end
+ end
+
+ private
+ def store_field_info(options)
+ options = @default_options.merge(options)
+ unless options[:analyzer]
+ if klass = options[:analyzer_class]
+ options[:analyzer] = klass.new
+ else
+ options[:analyzer] = @default_options[:default_analyzer]
+ end
+ end
+ @fields[options[:name]] = options
+ end
+end
+end # FTSearch
+
diff --git a/ruby/lib/ftsearch/fragment_writer.rb b/ruby/lib/ftsearch/fragment_writer.rb
new file mode 100644
index 0000000..79b783f
--- /dev/null
+++ b/ruby/lib/ftsearch/fragment_writer.rb
@@ -0,0 +1,114 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'fileutils'
+require 'ftsearch/suffix_array_writer'
+require 'ftsearch/document_map_writer'
+require 'ftsearch/fulltext_writer'
+require 'ftsearch/field_infos'
+
+require 'ftsearch/fulltext_reader'
+require 'ftsearch/document_map_reader'
+require 'ftsearch/suffix_array_reader'
+
+module FTSearch
+
+class FragmentWriter
+ DEFAULT_OPTIONS = {
+ :path => "ftsearch-#{Process.pid}-#{rand(100000)}",
+ :default_analyzer_class => FTSearch::Analysis::WhiteSpaceAnalyzer,
+ :field_infos_class => FieldInfos,
+ :fulltext_writer_class => FulltextWriter,
+ :suffix_array_writer_class => SuffixArrayWriter,
+ :doc_map_writer_class => DocumentMapWriter,
+ :field_infos => nil,
+ :fulltext_writer => nil,
+ :suffix_array_writer => nil,
+ :doc_map_writer_nil => nil,
+ }
+
+ attr_reader :fulltext_writer, :suffix_array_writer, :doc_map_writer
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ create = lambda do |name, *args|
+ options[name] || options[(name.to_s + "_class").to_sym].new(*args)
+ end
+ build_path = lambda do |suffix|
+ if @path
+ File.join(@tmpdir, suffix)
+ else
+ nil
+ end
+ end
+ @path = options[:path]
+ if @path
+ @path = File.expand_path(@path)
+ @tmpdir = @path + "#{Process.pid}-#{rand(100000)}"
+ FileUtils.mkdir_p(@tmpdir)
+ end
+
+ @fulltext_writer = create.call(:fulltext_writer, :path => build_path["fulltext"])
+ @suffix_array_writer = create.call(:suffix_array_writer, :path => build_path["suffixes"])
+ @doc_map_writer = create.call(:doc_map_writer, :path => build_path["docmap"])
+
+ default_analyzer = (klass = options[:default_analyzer_class]) ? klass.new : nil
+ @field_infos = create.call(:field_infos, :default_analyzer => default_analyzer)
+ @num_documents = 0
+ @field_map = Hash.new{|h,k| h[k.to_sym] = h.size}
+ @field_map[:uri] # init
+ end
+
+ def add_document(doc_hash)
+ uri = doc_hash[:uri] || @num_documents.to_s
+ @fulltext_writer.add_document(@num_documents, doc_hash.merge(:uri => uri),
+ @field_map, @field_infos, @suffix_array_writer, @doc_map_writer)
+ @num_documents += 1
+ end
+
+ def merge(fragment_directory)
+ raise "Cannot import old data unless the destination Fragment is empty." unless @num_documents == 0
+ # TODO: use a FragmentReader to access old data
+ fulltext_reader = FulltextReader.new(:path => "#{fragment_directory}/fulltext")
+ suffix_array_reader = SuffixArrayReader.new(fulltext_reader, nil,
+ :path => "#{fragment_directory}/suffixes")
+ doc_map_reader = DocumentMapReader.new(:path => "#{fragment_directory}/docmap")
+ @fulltext_writer.merge(fulltext_reader)
+ @suffix_array_writer.merge(suffix_array_reader)
+ @doc_map_writer.merge(doc_map_reader)
+ #FIXME: .num_documents will be wrong if some URIs were repeated
+ @num_documents = doc_map_reader.num_documents
+ File.open(File.join(fragment_directory, "fieldmap"), "rb") do |f|
+ i = 0
+ f.each_line{|l| @field_map[l.chomp.to_sym] = i; i+= 1}
+ end
+ end
+
+ def fields
+ @field_map.sort_by{|field, fid| fid}.map{|field, fid| field}
+ end
+
+ def documents
+ @num_documents
+ end
+
+ def field_id(field)
+ @field_map.has_key?(field) && @field_map[field]
+ end
+
+ def finish!
+ @fulltext_writer.finish!
+ fulltext = @fulltext_writer.data
+ @suffix_array_writer.finish!(fulltext)
+ @doc_map_writer.finish!
+
+ if @path
+ File.open(File.join(@tmpdir, "fieldmap"), "wb") do |f|
+ @field_map.sort_by{|field_name, field_id| field_id}.each do |field_name, field_id|
+ f.puts field_name
+ end
+ File.rename(@tmpdir, @path)
+ end
+ end
+ end
+end
+end # FTSearch
diff --git a/ruby/lib/ftsearch/fulltext_reader.rb b/ruby/lib/ftsearch/fulltext_reader.rb
new file mode 100644
index 0000000..05da57b
--- /dev/null
+++ b/ruby/lib/ftsearch/fulltext_reader.rb
@@ -0,0 +1,52 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+module FTSearch
+
+class FulltextReader
+ DEFAULT_OPTIONS = {
+ :path => nil,
+ :io => nil,
+ }
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ unless options[:path] || options[:io]
+ raise ArgumentError, "Need either the path to the suffix array file or an IO."
+ end
+ init_internal_structures(options)
+ end
+
+ def get_data(offset, size)
+ @io.pos = offset
+ @io.read(size)
+ end
+
+ def dump_data(&block)
+ blocksize = 32768
+ @io.pos = 0
+ begin
+ size = @io.stat.size - 1
+ rescue NoMethodError # try with StringIO's interface
+ size = @io.string.size - 1
+ end
+ read = 0
+ #buffer = " " * blocksize
+ while read < size
+ #@io.read([size - read, blocksize].min, buffer)
+ #yield buffer
+ data = @io.read([size - read, blocksize].min)
+ read += data.size
+ yield data
+ end
+ end
+
+ private
+ def init_internal_structures(options)
+ if options[:path]
+ @io = File.open(options[:path], "rb")
+ else
+ @io = options[:io]
+ end
+ end
+end
+end # FTSearch
diff --git a/ruby/lib/ftsearch/fulltext_writer.rb b/ruby/lib/ftsearch/fulltext_writer.rb
new file mode 100644
index 0000000..ad92c0b
--- /dev/null
+++ b/ruby/lib/ftsearch/fulltext_writer.rb
@@ -0,0 +1,75 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'ftsearch/util'
+
+module FTSearch
+class FulltextWriter
+ include InMemoryWriter
+
+ DEFAULT_OPTIONS = {
+ :path => "fulltext-#{Process.pid}-#{rand(100000)}",
+ }
+
+ attr_reader :path
+
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ @path = options[:path]
+ initialize_in_memory_buffer
+ if @path
+ @io = File.open(@path, "wb")
+ else
+ @io = @memory_io
+ end
+ end
+
+ def merge(fulltext_reader)
+ fulltext_reader.dump_data do |data|
+ @io.write data
+ end
+ end
+
+ def add_document(doc_id, doc_hash, field_mapping, field_infos, suffix_array_writer, doc_map_writer)
+ write_document_header(doc_id, doc_hash, field_mapping, field_infos)
+ doc_map_writer.add_document(doc_id, doc_hash[:uri])
+ doc_hash.each_pair do |field_name, data|
+ if field_id = field_mapping[field_name]
+ field_info = field_infos[field_name]
+ if field_info[:stored]
+ suffix_offset, segment_offset = store_field(doc_id, field_name, field_id, data)
+ if analyzer = field_info[:analyzer]
+ suffix_array_writer.add_suffixes(analyzer, data, suffix_offset)
+ end
+ doc_map_writer.add_field(segment_offset, doc_id, field_id, data.size)
+ end
+ end
+ end
+ end
+
+ def finish!
+ @io.write "\0"
+ @io.fsync
+ @io.close
+ end
+
+ private
+ def write_document_header(doc_id, doc_hash, field_mapping, field_infos)
+ stored_fields = doc_hash.select do |field_name, data|
+ field_infos[field_name][:stored]
+ end
+ total_size = stored_fields.inject(0){|s,(_,data)| s + data.size} + stored_fields.size * 9
+ # 9 = field ids plus field size plus trailing \0
+ @io.write [total_size].pack("V")
+ end
+
+ def store_field(doc_id, field_name, field_id, data)
+ @io.write [field_id, data.size].pack("V2")
+ offset = @io.pos
+ @io.write data
+ @io.write "\0"
+
+ [offset, offset - 8]
+ end
+end
+end # FTSearch
diff --git a/ruby/lib/ftsearch/suffix_array_reader.rb b/ruby/lib/ftsearch/suffix_array_reader.rb
new file mode 100644
index 0000000..830bf61
--- /dev/null
+++ b/ruby/lib/ftsearch/suffix_array_reader.rb
@@ -0,0 +1,277 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+module FTSearch
+
+class SuffixArrayReader
+ class Hit < Struct.new(:term, :suffix_number, :offset, :fulltext_reader)
+ def context(size)
+ strip_markers(self.fulltext_reader.get_data(offset - size, 2 * size), size)
+ end
+
+ def text(size)
+ strip_markers(self.fulltext_reader.get_data(offset, size), 0)
+ end
+
+ private
+ def strip_markers(str, size)
+ first = (str.rindex("\0", -size) || -1) + 1
+ last = str.index("\0", size) || str.size
+ str[first...last]
+ end
+ end
+
+ class LazyHits < Struct.new(:term, :suffix_array_reader, :fulltext_reader,
+ :from_index, :to_index)
+ include Enumerable
+ def each
+ sa_reader = self.suffix_array_reader
+ ft_reader = self.fulltext_reader
+ term = self.term
+ self.from_index.upto(self.to_index - 1) do |idx|
+ yield Hit.new(term, idx, sa_reader.suffix_index_to_offset(idx),
+ ft_reader)
+ end
+ end
+
+ def [](i)
+ i += to_index - from_index if i < 0
+ sa_reader = self.suffix_array_reader
+ if (idx = from_index + i) < to_index && idx >= from_index
+ Hit.new(self.term, idx, sa_reader.suffix_index_to_offset(idx),
+ self.fulltext_reader)
+ else
+ nil
+ end
+ end
+
+ def size
+ to_index - from_index
+ end
+ end
+
+ DEFAULT_OPTIONS = {
+ :path => nil,
+ :io => nil,
+ }
+ def initialize(fulltext_reader, doc_map, options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ @fulltext_reader = fulltext_reader
+ @doc_map = doc_map
+ unless options[:path] || options[:io]
+ raise ArgumentError, "Need either the path to the suffix array file or an IO."
+ end
+ init_internal_structures(options)
+ end
+
+ def count_hits(term)
+ from = binary_search(term, 0, @suffixes.size)
+ offset = @suffixes[from]
+ if @fulltext_reader.get_data(offset, term.size) == term
+ to = binary_search_upper(term, 0, @suffixes.size)
+ to - from
+ else
+ 0
+ end
+ end
+
+ def find_all(term)
+ from = binary_search(term, 0, @suffixes.size)
+ offset = @suffixes[from]
+ if @fulltext_reader.get_data(offset, term.size) == term
+ to = binary_search_upper(term, 0, @suffixes.size)
+ LazyHits.new(term, self, @fulltext_reader, from, to)
+ else
+ LazyHits.new(term, self, @fulltext_reader, 0, 0)
+ end
+ end
+
+ def find_first(term)
+ suffix_index = binary_search(term, 0, @suffixes.size)
+ offset = @suffixes[suffix_index]
+ if @fulltext_reader.get_data(offset, term.size) == term
+ Hit.new(term, suffix_index, offset, @fulltext_reader)
+ else
+ nil
+ end
+ end
+
+ def find_next(hit)
+ end
+
+ def suffix_index_to_offset(suffix_index)
+ @suffixes[suffix_index]
+ end
+
+ def lazyhits_to_offsets(lazyhits)
+ from = lazyhits.from_index
+ to = lazyhits.to_index
+ @io.pos = @base + 4 * from
+ @io.read((to - from) * 4).unpack("V*")
+ end
+
+ def dump_data
+ @io.pos = @base
+ while data = @io.read(32768)
+ yield data.unpack("V*")
+ end
+ end
+
+ private
+ def init_internal_structures(options)
+ if options[:path]
+ @io = File.open(options[:path], "rb")
+ else
+ @io = options[:io]
+ end
+ @total_suffixes, @block_size, @inline_suffix_size = @io.read(12).unpack("VVV")
+ @inline_suffixes = []
+ if @block_size != 0
+ 0.step(@total_suffixes, @block_size){ @inline_suffixes << @io.read(@inline_suffix_size)}
+ end
+
+ # skip padding
+ if (mod = @io.pos & 0xf) != 0
+ @io.read(16 - mod)
+ end
+
+ @base = @io.pos
+ #@suffixes = io.read.unpack("V*")
+ @suffixes = Object.new
+ nsuffixes = @total_suffixes
+ io = @io
+ base = @base
+ class << @suffixes; self end.module_eval do
+ define_method(:[]) do |i|
+ io.pos = base + i * 4
+ io.read(4).unpack("V")[0]
+ end
+ define_method(:size){ nsuffixes }
+ end
+ end
+
+ def binary_search(term, from, to)
+ from, to = binary_search_inline_suffixes(term, from, to)
+
+ tsize = term.size
+ while from < to
+ middle = (from + to) / 2
+ pivot = @fulltext_reader.get_data(@suffixes[middle], tsize)
+ if term <= pivot
+ to = middle
+ else
+ from = middle + 1
+ end
+ end
+
+ from
+ end
+
+ def binary_search_upper(term, from, to)
+ from, to = binary_search_inline_suffixes_upper(term, from, to)
+
+ tsize = term.size
+
+ #puts "#{from} -- #{to}"
+ #from.upto(to+5) do |idx|
+ # puts "#{idx} #{@fulltext_reader.get_data(@suffixes[idx], tsize + 10).inspect}"
+ #end
+ while from < to
+ middle = (from + to) / 2
+ pivot = @fulltext_reader.get_data(@suffixes[middle], tsize)
+ if term < pivot
+ to = middle
+ else
+ from = middle + 1
+ end
+ end
+
+ #puts "RET: #{from}"
+ from
+ end
+
+
+ def binary_search_inline_suffixes(term, from, to)
+ return [from, to] if @block_size == 0
+
+ tsize = term.size
+ while to - from > @block_size
+ middle = (from + to) / 2
+ #puts "from: #{from} to #{to} middle: #{middle}" if $DEBUG
+ quotient, mod = middle.divmod(@block_size)
+ middle = middle - mod
+ pivot = @inline_suffixes[quotient]
+ #puts "NOW: #{middle} pivot: #{pivot.inspect}" if $DEBUG
+ if tsize <= @inline_suffix_size
+ if term <= pivot
+ to = middle
+ else
+ from = middle + 1
+ end
+ elsif term[0, @inline_suffix_size] < pivot
+ to = middle
+ else
+ # FIXME: handle pivot[-1] = 255?
+ pivot = pivot.clone
+ #pivot[-1] += 1
+ pivot.next!
+ #puts "TESTING AGAINST new pivot: #{pivot.inspect}" if $DEBUG
+ if term > pivot
+ from = middle + 1
+ else # term[0, @inline_suffix_size] == pivot, disambiguate
+ pivot = @fulltext_reader.get_data(@suffixes[middle], term.size)
+ if term <= pivot
+ to = middle
+ else
+ from = middle + 1
+ end
+ end
+ end
+ end
+
+ [from, to]
+ end
+
+ def binary_search_inline_suffixes_upper(term, from, to)
+ return [from, to] if @block_size == 0
+
+ tsize = term.size
+ while to - from > @block_size
+ middle = (from + to) / 2
+ #puts "from: #{from} to #{to} middle: #{middle}" if $DEBUG
+ quotient, mod = middle.divmod(@block_size)
+ middle = middle - mod
+ pivot = @inline_suffixes[quotient]
+ #puts "NOW: #{middle} pivot: #{pivot.inspect}" if $DEBUG
+ if tsize <= @inline_suffix_size
+ if term < pivot[0, tsize]
+ to = middle
+ else
+ from = middle + 1
+ end
+ elsif term[0, @inline_suffix_size] < pivot
+ to = middle
+ else
+ # FIXME: handle pivot[-1] = 255?
+ pivot = pivot.clone
+ #pivot[-1] += 1
+ pivot.next!
+ #puts "TESTING AGAINST new pivot: #{pivot.inspect}" if $DEBUG
+ if term > pivot
+ from = middle + 1
+ else # term[0, @inline_suffix_size] == pivot, disambiguate
+ pivot = @fulltext_reader.get_data(@suffixes[middle], term.size)
+ if term < pivot
+ to = middle
+ else
+ from = middle + 1
+ end
+ end
+ end
+ end
+
+ [from, to]
+ end
+end
+
+end # FTSearch
diff --git a/ruby/lib/ftsearch/suffix_array_writer.rb b/ruby/lib/ftsearch/suffix_array_writer.rb
new file mode 100644
index 0000000..694dd3b
--- /dev/null
+++ b/ruby/lib/ftsearch/suffix_array_writer.rb
@@ -0,0 +1,99 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'enumerator'
+require 'ftsearch/util'
+
+module FTSearch
+
+class SuffixArrayWriter
+ include InMemoryWriter
+
+ DEFAULT_OPTIONS = {
+ :path => "suffixes-#{Process.pid}-#{rand(100000)}",
+ :block_size => 32,
+ :inline_suffix_size => 8,
+ :default_analyzer => nil,
+ }
+ def initialize(options = {})
+ options = DEFAULT_OPTIONS.merge(options)
+ @path = options[:path]
+ @suffixes = []
+ @block_size = options[:block_size]
+ @inline_suffix_size = options[:inline_suffix_size]
+ @finished = false
+ initialize_in_memory_buffer
+ end
+
+ def merge(suffix_array_reader)
+ unless @suffixes.empty?
+ raise "Cannot merge if the destination SuffixArrayWriter isn't empty!"
+ end
+ suffix_array_reader.dump_data do |partial_sarray|
+ @suffixes.concat partial_sarray
+ end
+ end
+
+ def add_suffixes(analyzer, data, offset)
+ #analyzer.new.find_suffixes(data).each{|x| @suffixes << offset + x}
+ analyzer.append_suffixes(@suffixes, data, offset)
+ end
+
+ def finish!(fulltext)
+ return if @finished
+ if $DEBUG
+ puts "Suffixes: #{@suffixes.size}"
+ t0 = Time.new
+ end
+ sort!(fulltext)
+ if $DEBUG
+ puts "Sorted in #{Time.new - t0}"
+ end
+ if $DEBUG
+ puts "Dumping suffixes"
+ t0 = Time.new
+ end
+ dump_suffixes(fulltext)
+ if $DEBUG
+ puts "Dumped in #{Time.new - t0}"
+ end
+ @finished = true
+ end
+
+ private
+ def dump_suffixes(fulltext)
+ io = @path ? File.open(@path, "wb") : @memory_io
+ io.write([@suffixes.size, @block_size || 0, @inline_suffix_size].pack("VVV"))
+ if @block_size
+ dump_inline_suffixes(io, fulltext)
+ end
+ add_padding(io)
+ dump_suffix_array(io)
+ ensure
+ io.close if @path
+ end
+
+ def dump_inline_suffixes(io, fulltext)
+ 0.step(@suffixes.size, @block_size) do |suffix_idx|
+ io.write([fulltext[@suffixes[suffix_idx], @inline_suffix_size]].pack("a#{@inline_suffix_size}"))
+ end
+ end
+
+ def dump_suffix_array(io)
+ @suffixes.each_slice(1024){|suffixes| io.write(suffixes.pack("V*")) }
+ end
+
+ def add_padding(io)
+ # padding to 16-byte
+ if (mod = io.pos & 0xf) != 0
+ io.write("\0" * (16 - mod))
+ end
+ end
+
+ def sort!(fulltext)
+ tsize = fulltext.size
+ @suffixes = @suffixes.sort_by{|offset| fulltext[offset, tsize - offset]}
+ end
+end
+
+end # FTSearch
diff --git a/ruby/lib/ftsearch/util.rb b/ruby/lib/ftsearch/util.rb
new file mode 100644
index 0000000..2944a7b
--- /dev/null
+++ b/ruby/lib/ftsearch/util.rb
@@ -0,0 +1,21 @@
+# Copyright (C) 2006 Mauricio Fernandez <mfp@acm.org>
+#
+
+require 'stringio'
+module FTSearch
+
+module InMemoryWriter
+ def initialize_in_memory_buffer
+ @memory_io = StringIO.new("")
+ end
+
+ def data
+ if @path
+ File.open(@path, "rb"){|f| f.read} rescue nil
+ else
+ @memory_io.string.clone
+ end
+ end
+end
+
+end # FTSearch