Use String#scrub when available to tidy bytes
This commit is contained in:
parent
8003c541f7
commit
ab195841dd
@ -212,37 +212,43 @@ def compose(codepoints)
|
|||||||
codepoints
|
codepoints
|
||||||
end
|
end
|
||||||
|
|
||||||
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
|
# Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1.
|
||||||
# resulting in a valid UTF-8 string.
|
if RUBY_VERSION >= '2.1'
|
||||||
#
|
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
|
||||||
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
|
# resulting in a valid UTF-8 string.
|
||||||
# encoding is entirely CP1252 or ISO-8859-1.
|
#
|
||||||
def tidy_bytes(string, force = false)
|
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
|
||||||
return string if string.empty?
|
# encoding is entirely CP1252 or ISO-8859-1.
|
||||||
|
def tidy_bytes(string, force = false)
|
||||||
if force
|
return string if string.empty?
|
||||||
return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
|
return recode_windows1252_chars(string) if force
|
||||||
|
string.scrub { |bad| recode_windows1252_chars(bad) }
|
||||||
end
|
end
|
||||||
|
else
|
||||||
|
def tidy_bytes(string, force = false)
|
||||||
|
return string if string.empty?
|
||||||
|
return recode_windows1252_chars(string) if force
|
||||||
|
|
||||||
# We can't transcode to the same format, so we choose a nearly-identical encoding.
|
# We can't transcode to the same format, so we choose a nearly-identical encoding.
|
||||||
# We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
|
# We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
|
||||||
# CP1252 when we get errors. The final string will be 'converted' back to UTF-8
|
# CP1252 when we get errors. The final string will be 'converted' back to UTF-8
|
||||||
# before returning.
|
# before returning.
|
||||||
reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
|
reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_8_MAC)
|
||||||
|
|
||||||
source = string.dup
|
source = string.dup
|
||||||
out = ''.force_encoding(Encoding::UTF_8_MAC)
|
out = ''.force_encoding(Encoding::UTF_8_MAC)
|
||||||
|
|
||||||
loop do
|
loop do
|
||||||
reader.primitive_convert(source, out)
|
reader.primitive_convert(source, out)
|
||||||
_, _, _, error_bytes, _ = reader.primitive_errinfo
|
_, _, _, error_bytes, _ = reader.primitive_errinfo
|
||||||
break if error_bytes.nil?
|
break if error_bytes.nil?
|
||||||
out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
|
out << error_bytes.encode(Encoding::UTF_8_MAC, Encoding::Windows_1252, invalid: :replace, undef: :replace)
|
||||||
|
end
|
||||||
|
|
||||||
|
reader.finish
|
||||||
|
|
||||||
|
out.encode!(Encoding::UTF_8)
|
||||||
end
|
end
|
||||||
|
|
||||||
reader.finish
|
|
||||||
|
|
||||||
out.encode!(Encoding::UTF_8)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Returns the KC normalization of the string by default. NFKC is
|
# Returns the KC normalization of the string by default. NFKC is
|
||||||
@ -371,14 +377,8 @@ def apply_mapping(string, mapping) #:nodoc:
|
|||||||
end.pack('U*')
|
end.pack('U*')
|
||||||
end
|
end
|
||||||
|
|
||||||
def tidy_byte(byte)
|
def recode_windows1252_chars(string)
|
||||||
if byte < 160
|
string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
|
||||||
[database.cp1252[byte] || byte].pack("U").unpack("C*")
|
|
||||||
elsif byte < 192
|
|
||||||
[194, byte]
|
|
||||||
else
|
|
||||||
[195, byte - 64]
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def database
|
def database
|
||||||
|
Loading…
Reference in New Issue
Block a user