class CMess::GuessEncoding::Automatic
Tries to detect the encoding of a given input by applying several heuristics to determine the most likely candidate. If no heuristic matches, falls back to Encoding::UNKNOWN.
If a BOM is found, it may determine the encoding directly.
For supported encodings see EncodingGuessers and BOMGuessers.
Constants
- CHARS_TO_TEST
Certain (non-ASCII) chars to test for in TEST_ENCODINGS.
- GUESS_METHOD_RE
Pattern for method names in EncodingGuessers and BOMGuessers.
- TEST_CHARS
Map TEST_ENCODINGS to respectively encoded CHARS_TO_TEST.
- TEST_ENCODINGS
Single-byte encodings to test statistically by TEST_CHARS.
- TEST_THRESHOLD_APPROX
Relative count of TEST_CHARS must exceed this threshold to yield an approximate match.
- TEST_THRESHOLD_DIRECT
Relative count of TEST_CHARS must exceed this threshold to yield a direct match.
Attributes
bom_guessers[R]
encoding_guessers[R]
supported_boms[R]
supported_encodings[R]
byte_count[R]
byte_total[R]
chunk_size[R]
first_byte[R]
input[R]
Public Class Methods
new(input, chunk_size = nil)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 146 def initialize(input, chunk_size = nil) @input = case input when IO then input when String then StringIO.new(input) else raise ArgumentError, "don't know how to handle input of type #{input.class}" end @chunk_size = chunk_size end
Public Instance Methods
bom()
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 173
#
# Returns the encoding reported by #check_bom, remembering it in @bom.
# Because of <tt>||=</tt>, a +nil+ result is not cached: the check runs
# again on the next call.
def bom
  @bom ||= check_bom
end
guess(input, chunk_size = nil, ignore_bom = false)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 114
#
# Convenience wrapper: builds a guesser for +input+ and +chunk_size+ via
# +new+ and returns the result of calling +guess+ on it with +ignore_bom+.
#
# NOTE(review): since this calls +new+, it presumably lives on the class
# (singleton) level rather than on instances -- confirm against the file.
def guess(input, chunk_size = nil, ignore_bom = false)
  guesser = new(input, chunk_size)
  guesser.guess(ignore_bom)
end
Private Instance Methods
bom_encoding(encoding, &block)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 133
#
# Registers a BOM guesser for +encoding+ unless one is already known.
#
# The stored guesser is a lambda that ignores its arguments and yields
# +encoding+ when +block+, evaluated through instance_eval, is truthy;
# otherwise it yields +nil+.
#
# Returns +nil+ if +encoding+ was already registered.
def bom_encoding(encoding, &block)
  return if supported_bom?(encoding)

  supported_boms << encoding
  bom_guessers << lambda { |*| instance_eval(&block) ? encoding : nil }
end
byte_count_sum(bytes)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 234
#
# Adds up the +byte_count+ entries for +bytes+. +bytes+ may be a single
# value or a list; it is normalized with Array(). Returns 0 when there
# is nothing to add.
def byte_count_sum(bytes)
  total = 0
  Array(bytes).each { |byte| total += byte_count[byte] }
  total
end
check_bom()
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 183
#
# Runs the registered BOM guessers against the input and returns the
# first encoding that a guesser reports and that is also a supported
# encoding. Returns +nil+ if the input is at EOF, is not seekable, or no
# guesser matches.
def check_bom
  return if eof?

  # prevent "Illegal seek" error inside a pipe
  begin
    input.pos
  rescue Errno::ESPIPE
    return
  end

  bom_guessers.each do |guesser|
    candidate = instance_eval(&guesser)
    return candidate if candidate && supported_encoding?(candidate)

    # No usable match: go back to the start so that the next guesser
    # sees the input from its beginning again.
    input.rewind
  end

  nil
end
encoding(*encodings, &block)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 120
#
# Registers +block+ as the guesser for each of +encodings+ (nested lists
# are flattened). Encodings that are already supported are skipped. For
# every newly added encoding, +block+ is appended to +encoding_guessers+
# once.
def encoding(*encodings, &block)
  encodings.flatten.each do |candidate|
    next if supported_encoding?(candidate)

    supported_encodings << candidate
    encoding_guessers << block
  end
end
eof?()
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 179
#
# Reports whether the underlying input is at end of file, as answered by
# the input object itself.
def eof?
  input.eof?
end
next_byte()
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 204
#
# Consumes one byte from the input and returns it as an unsigned integer,
# or +nil+ once the input is exhausted.
def next_byte
  chunk = input.read(1)

  # read(1) yields nil at EOF; without this guard an input that ends in
  # the middle of a byte sequence being probed (e.g. shorter than a BOM)
  # would blow up with NoMethodError instead of simply not matching.
  chunk.unpack('C').first if chunk
end
next_one_of?(*bytes)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 212
#
# Consumes the next byte via #next_byte and reports whether it is one of
# +bytes+. The byte is consumed regardless of the outcome.
def next_one_of?(*bytes)
  upcoming = next_byte
  bytes.include?(upcoming)
end
read(chunk_size = chunk_size())
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 216
#
# Reads the next chunk of the input (+chunk_size+ defaults to the
# instance's #chunk_size) and updates the byte statistics: the per-byte
# counts in @byte_count, the running @byte_total, and @first_byte, which
# is set only once.
#
# Returns +nil+ if the input is already at EOF, otherwise whether any
# bytes were counted by this call.
def read(chunk_size = chunk_size())
  @byte_count ||= Hash.new(0)
  @byte_total ||= 0

  return if eof?

  total_before = @byte_total
  chunk = input.read(chunk_size)

  chunk.each_byte do |byte|
    @first_byte ||= byte
    @byte_count[byte] += 1
    @byte_total += 1
  end

  @byte_total > total_before
end
relative_byte_count(count)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 238
#
# Returns +count+ as a fraction of +byte_total+, using floating-point
# division.
def relative_byte_count(count)
  numerator = count.to_f
  numerator / byte_total
end
starts_with?(*bytes)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 208
#
# Checks whether the upcoming bytes of the input equal +bytes+, in
# order. Bytes are consumed one at a time via #next_byte, and checking
# stops at the first mismatch (that byte has already been consumed).
# Returns +true+ when +bytes+ is empty.
def starts_with?(*bytes)
  bytes.each do |expected|
    return false unless next_byte == expected
  end

  true
end
supported_bom?(encoding)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 140
#
# Reports whether +encoding+ is already listed in +supported_boms+.
def supported_bom?(encoding)
  supported_boms.include?(encoding)
end
supported_encoding?(encoding)
click to toggle source
# File lib/cmess/guess_encoding/automatic.rb, line 129
#
# Reports whether +encoding+ is already listed in +supported_encodings+.
def supported_encoding?(encoding)
  supported_encodings.include?(encoding)
end