class PMS::Index

Constants

DEFAULT_LSI
TOKEN_RE

Attributes

entries[R]
index[R]
input[R]

Public Class Methods

new(input, options = {}) click to toggle source
# File lib/pms/index.rb, line 39
# Stores the document source and builds the index from it.
#
# input   - Used as-is when it responds to #each. Otherwise a String is
#           replaced by its #each_line enumerator.
# options - Passed through unchanged to #build_index.
#
# Raises ArgumentError when +input+ neither responds to #each nor is a
# String.
def initialize(input, options = {})
  @input =
    if input.respond_to?(:each)
      input
    elsif input.is_a?(String)
      input.each_line
    else
      raise ArgumentError, 'input must implement #each'
    end

  build_index(options)
end

Public Instance Methods

[](doc_num)
Alias for: doc
doc(doc_num) click to toggle source
# File lib/pms/index.rb, line 74
# Returns the single document stored under +doc_num+, by asking
# #documents for a one-element selection and taking its first entry.
def doc(doc_num)
  wanted = [doc_num]
  documents(wanted).first
end
Also aliased as: []
doc_nums(token) click to toggle source
# File lib/pms/index.rb, line 61
# Returns the keys (document numbers) of the Hash produced by
# #doc_nums_with_positions for +token+, dropping the positions.
def doc_nums(token)
  postings = doc_nums_with_positions(token)
  postings.keys
end
Also aliased as: results
doc_nums_with_positions(token) click to toggle source
# File lib/pms/index.rb, line 46
# Looks up +token+ in the index and returns a Hash mapping each matching
# document number to an Array of token positions.
#
# token - A String, which is passed through #mangle_token and looked up
#         as an exact index key, or a Regexp, which is matched against
#         every index key; the postings of all matching keys are merged
#         and positions for the same document are unioned.
#
# +nil+ entries in the position lists (appended by #build_index when the
# +:lsi+ option is set) are removed from the returned lists.
#
# The result is always a freshly built Hash with fresh Arrays, so a query
# never modifies the index: an unknown String token does not create an
# empty entry, and the stored position lists keep their +nil+ entries.
#
# Raises TypeError if +token+ is neither a String nor a Regexp.
def doc_nums_with_positions(token)
  postings =
    case token
    when String
      # #fetch bypasses the index's default proc, which would otherwise
      # insert an empty posting for every unknown token that is queried.
      index.fetch(mangle_token(token)) { {} }
    when Regexp
      index.each_with_object({}) { |(key, value), hash|
        hash.update(value) { |_, old, new| old | new } if key =~ token
      }
    else
      raise TypeError, "String or Regexp expected, got #{token.class}"
    end

  # Compact into copies: the position Arrays may be the very objects
  # stored in the index, so an in-place compact! would alter the index.
  postings.each_with_object({}) { |(doc_num, positions), result|
    result[doc_num] = positions.compact
  }
end
Also aliased as: results_with_positions
documents(doc_nums = default = true) click to toggle source
# File lib/pms/index.rb, line 67
# Returns documents from the cached document list, which is loaded via
# #get_documents on first use and kept in @documents afterwards.
#
# Called without an argument, returns the whole cached list. (+default+
# is only assigned, and therefore only truthy, when the caller omits
# +doc_nums+.) Called with +doc_nums+, returns the entries at those
# indexes via Array#values_at.
def documents(doc_nums = default = true)
  @documents ||= get_documents
  return @documents if default

  @documents.values_at(*doc_nums)
end
Also aliased as: matches
matches(doc_nums = default = true)
Alias for: documents
results(token)
Alias for: doc_nums
results_with_positions(token)
Alias for: doc_nums_with_positions

Private Instance Methods

build_index(options) click to toggle source
# File lib/pms/index.rb, line 82
# Reads every document from #input and builds the inverted index, stored
# in @index as { term => { doc_num => [positions] } }. Also resets
# @documents to nil and fills @entries with the document numbers seen.
#
# Documents are numbered from 0 in iteration order; positions count the
# tokens yielded by #each_token within one document, also from 0. Terms
# are the tokens after #mangle_token.
#
# options - Hash; only +:lsi+ is read here. When it is truthy, 'lsi4r' is
#           required, +true+ is replaced by DEFAULT_LSI, and the value is
#           handed to Lsi4R.each_norm as +min:+. For every pair that call
#           yields, a +nil+ position is appended to that term's posting
#           for the yielded document's +key+.
#           NOTE(review): what Lsi4R.each_norm selects is not visible
#           here - confirm against the lsi4r documentation.
#
# Returns the index Hash.
def build_index(options)
  lsi = options[:lsi]

  if lsi
    require 'lsi4r'

    lsi = DEFAULT_LSI if lsi == true
    terms_by_doc = Hash.new { |hash, key| hash[key] = [] }
  end

  @documents = nil
  @entries   = []

  # Both levels auto-vivify, so a posting can be appended to directly.
  postings = Hash.new do |terms, term|
    terms[term] = Hash.new { |docs, num| docs[num] = [] }
  end

  doc_num = -1

  input.each do |doc|
    doc_num += 1
    @entries << doc_num

    position = -1

    each_token(doc) do |token|
      term = mangle_token(token)
      position += 1

      postings[term][doc_num] << position
      # terms_by_doc is nil unless the :lsi option was given.
      terms_by_doc[doc_num] << term if terms_by_doc
    end
  end

  if lsi
    Lsi4R.each_norm(terms_by_doc, min: lsi, new: true) do |lsi_doc, term, _|
      postings[mangle_token(term)][lsi_doc.key] << nil
    end
  end

  @index = postings
end
each_token(doc, &block) click to toggle source
# File lib/pms/index.rb, line 118
# Scans +doc+ with TOKEN_RE, handing each match to +block+ exactly as
# String#scan does.
def each_token(doc, &block)
  pattern = TOKEN_RE
  doc.scan(pattern, &block)
end
get_documents() click to toggle source
# File lib/pms/index.rb, line 110
# Collects every document yielded by #input into a new Array.
#
# The input is rewound first when it supports #rewind, so that a source
# which has already been read through is iterated from the start again.
def get_documents
  input.rewind if input.respond_to?(:rewind)

  [].tap do |collected|
    input.each { |doc| collected.push(doc) }
  end
end
mangle_token(token) click to toggle source
# File lib/pms/index.rb, line 122
# Normalizes +token+ into the form used as an index key, by passing it
# through Unicode.downcase.
def mangle_token(token)
  lowered = Unicode.downcase(token)
  lowered
end