class CMess::GuessEncoding::Automatic

Tries to detect the encoding of a given input by applying several heuristics to determine the most likely candidate. If none of the heuristics matches, falls back to Encoding::UNKNOWN.

If a BOM is found, it may determine the encoding directly.

For supported encodings see EncodingGuessers and BOMGuessers.

Constants

CHARS_TO_TEST

Certain (non-ASCII) chars to test for in TEST_ENCODINGS.

GUESS_METHOD_RE

Pattern for method names in EncodingGuessers and BOMGuessers.

TEST_CHARS

Maps each encoding in TEST_ENCODINGS to the CHARS_TO_TEST encoded in that encoding.

TEST_ENCODINGS

Single-byte encodings to test statistically by TEST_CHARS.

TEST_THRESHOLD_APPROX

Relative count of TEST_CHARS must exceed this threshold to yield an approximate match.

TEST_THRESHOLD_DIRECT

Relative count of TEST_CHARS must exceed this threshold to yield a direct match.

Attributes

bom_guessers[R]
encoding_guessers[R]
supported_boms[R]
supported_encodings[R]
byte_count[R]
byte_total[R]
chunk_size[R]
first_byte[R]
input[R]

Public Class Methods

new(input, chunk_size = nil) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 146
def initialize(input, chunk_size = nil)
  @input = case input
    when IO     then input
    when String then StringIO.new(input)
    else raise ArgumentError,
      "don't know how to handle input of type #{input.class}"
  end

  @chunk_size = chunk_size
end

Public Instance Methods

bom() click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 173
# Returns the encoding determined by check_bom, caching the first truthy
# result in @bom. While check_bom keeps returning +nil+ (or +false+), it is
# invoked again on every call.
def bom
  @bom = check_bom unless @bom
  @bom
end
guess(input, chunk_size = nil, ignore_bom = false) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 114
# Convenience wrapper: builds a new instance from +input+ and +chunk_size+
# via +new+, then returns the result of calling +guess+ with +ignore_bom+
# on that instance.
def guess(input, chunk_size = nil, ignore_bom = false)
  guesser = new(input, chunk_size)
  guesser.guess(ignore_bom)
end

Private Instance Methods

bom_encoding(encoding, &block) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 133
# Registers a BOM guesser for +encoding+ unless one is already registered
# (per supported_bom?).
#
# The stored guesser is a lambda that evaluates +block+ via instance_eval
# and yields +encoding+ when the block's result is truthy, +nil+ otherwise.
#
# Returns bom_guessers after appending, or +nil+ if +encoding+ was already
# registered.
def bom_encoding(encoding, &block)
  return if supported_bom?(encoding)

  guesser = lambda { |*| instance_eval(&block) ? encoding : nil }

  supported_boms << encoding
  bom_guessers   << guesser
end
byte_count_sum(bytes) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 234
# Adds up the byte_count entries for +bytes+.
#
# bytes:: a single value or a collection; coerced with Array(), so +nil+
#         contributes nothing
#
# Returns the accumulated count, starting from 0.
def byte_count_sum(bytes)
  total = 0
  Array(bytes).each { |byte| total += byte_count[byte] }
  total
end
check_bom() click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 183
# Runs the registered BOM guessers against +input+.
#
# Returns the first encoding reported by a guesser that is also accepted by
# supported_encoding?; in that case +input+ is not rewound. After every
# guesser that does not produce such an encoding, +input+ is rewound.
#
# Returns +nil+ if +input+ is at EOF, if querying its position raises
# Errno::ESPIPE, or if no guesser matches.
def check_bom
  return if eof?

  # prevent "Illegal seek" error inside a pipe
  seekable = begin
    input.pos
    true
  rescue Errno::ESPIPE
    false
  end
  return unless seekable

  bom_guessers.each do |guesser|
    detected = instance_eval(&guesser)
    return detected if detected && supported_encoding?(detected)

    input.rewind
  end

  nil
end
encoding(*encodings, &block) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 120
# Registers +block+ as the guesser for each of +encodings+ (nested arrays
# are flattened) that is not yet registered per supported_encoding?.
# The block is appended to encoding_guessers once per newly registered
# encoding.
#
# Returns the flattened list of +encodings+.
def encoding(*encodings, &block)
  encodings.flatten.each do |name|
    next if supported_encoding?(name)

    supported_encodings << name
    encoding_guessers   << block
  end
end
eof?() click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 179
# Reports whether +input+ is at end of file; delegates to input.eof?.
def eof?
  input.eof?
end
next_byte() click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 204
# Reads the next single byte from +input+ and returns it as an Integer.
#
# Returns +nil+ once +input+ is exhausted. IO#read(1) yields +nil+ at end of
# input, and calling +unpack+ on it raised NoMethodError whenever a caller
# probed more bytes than the input holds. With +nil+, comparisons against
# expected byte values (as in starts_with? and next_one_of?) simply fail to
# match.
def next_byte
  char = input.read(1)
  char.unpack('C').first if char
end
next_one_of?(*bytes) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 212
# Consumes one byte via next_byte and reports whether it equals any of the
# given +bytes+. The byte is consumed even when +bytes+ is empty.
def next_one_of?(*bytes)
  candidate = next_byte
  bytes.any? { |byte| byte == candidate }
end
read(chunk_size = chunk_size()) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 216
# Reads the next chunk from +input+ and updates the byte statistics:
# @byte_count (occurrences per byte value), @byte_total (bytes seen so far)
# and @first_byte (the very first byte read; set only once).
#
# chunk_size:: passed to input.read; defaults to the chunk_size attribute
#
# Returns +nil+ if +input+ is already at EOF, otherwise whether any bytes
# were read.
def read(chunk_size = chunk_size())
  @byte_count ||= Hash.new(0)
  @byte_total ||= 0

  return if eof?

  chunk = input.read(chunk_size)

  @first_byte ||= chunk.getbyte(0) unless chunk.empty?
  chunk.each_byte { |byte| @byte_count[byte] += 1 }
  @byte_total += chunk.bytesize

  !chunk.empty?
end
relative_byte_count(count) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 238
# Returns +count+ relative to byte_total, as a Float (+count+ is converted
# with to_f before dividing).
def relative_byte_count(count)
  count.to_f / byte_total
end
starts_with?(*bytes) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 208
# Checks whether the next bytes from next_byte equal +bytes+, in order.
#
# Stops consuming at the first mismatch, so only as many bytes are read as
# were needed to decide. Returns +true+ when +bytes+ is empty.
def starts_with?(*bytes)
  bytes.each do |expected|
    return false unless next_byte == expected
  end

  true
end
supported_bom?(encoding) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 140
# Reports whether +encoding+ is already listed in supported_boms.
def supported_bom?(encoding)
  supported_boms.include?(encoding)
end
supported_encoding?(encoding) click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 129
# Reports whether +encoding+ is already listed in supported_encodings.
def supported_encoding?(encoding)
  supported_encodings.include?(encoding)
end