module CMess::Cinderella

Find (and possibly repair) doubly encoded characters. Here's how it's done:

Treats characters encoded in target encoding as if they were encoded in source encoding, converts them to target encoding and “grep”s for lines containing those doubly encoded characters; if asked to repair doubly encoded characters, substitutes them with their original character.

Constants

DEFAULT_CSETS_DIR
VERSION

Public Instance Methods

pick(options) click to toggle source
# File lib/cmess/cinderella.rb, line 48
# Sorts the lines of the input by whether they contain doubly encoded
# characters, optionally repairing the ones that do.
#
# Every entry of +chars+ is run through #encode to obtain its doubly encoded
# form. Lines matching any of those forms are written to +crop+, all other
# lines to +pot+. When the selected output is +nil+ (or +false+), the line is
# skipped. Note that matching lines are modified in place when repairing.
#
# The values are fetched from +options+ through CMess.ensure_options!
# (presumably that helper also validates them -- confirm against its
# definition):
#
# <tt>:input</tt>::           object whose +each+ yields the lines to inspect
# <tt>:pot</tt>::             receives (+puts+) lines without a match
# <tt>:crop</tt>::            receives (+puts+) lines with a match
# <tt>:source_encoding</tt>:: encoding the characters are misread as
# <tt>:target_encoding</tt>:: encoding the characters are converted to
# <tt>:chars</tt>::           characters to look for
# <tt>:repair</tt>::          optional; if truthy, doubly encoded characters
#                             are replaced by their original character
def pick(options)
  input, pot, crop, source, target, chars = CMess.ensure_options!(options,
    :input, :pot, :crop, :source_encoding, :target_encoding, :chars
  )

  # Optional flag, hence read directly rather than through ensure_options!.
  repair = options[:repair]

  # Maps each doubly encoded form back to its original character.
  encoded = {}
  chars.each { |char|
    doubly = encode(char, source, target)

    # encode yields nil for characters it cannot convert; a nil key would
    # make Regexp.union raise, so such characters are left out.
    encoded[doubly] = char if doubly
  }

  regexp = Regexp.union(*encoded.keys)

  input.each { |line|
    out = line =~ regexp ? crop : pot or next

    # With a hash as replacement, each match is looked up as a key.
    line.gsub!(regexp, encoded) if repair
    out.puts(line)
  }
end

Private Instance Methods

encode(string, source, target) click to toggle source
# File lib/cmess/cinderella.rb, line 68
# Converts +string+ to the +target+ encoding, reading its bytes as if they
# were encoded in +source+.
#
# Returns the converted string, or +nil+ when a character has no counterpart
# in +target+ (Encoding::UndefinedConversionError). Any other error raised by
# String#encode is passed on to the caller.
def encode(string, source, target)
  begin
    string.encode(target, source)
  rescue Encoding::UndefinedConversionError
    nil
  end
end