Use String#scrub when available to tidy bytes

2013-12-26 12:36:52 -03:00 · 2013-12-26 12:36:52 -03:00 · ab195841dd
commit ab195841dd
parent 8003c541f7
1 changed files with 34 additions and 34 deletions
--- a/activesupport/lib/active_support/multibyte/unicode.rb
+++ b/activesupport/lib/active_support/multibyte/unicode.rb
@ -212,6 +212,8 @@ def compose(codepoints)
        codepoints
      end

+      # Ruby >= 2.1 has String#scrub, which is faster than the workaround used for < 2.1.
+      if RUBY_VERSION >= '2.1'
        # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
        # resulting in a valid UTF-8 string.
        #
@ -219,10 +221,13 @@ def compose(codepoints)
        # encoding is entirely CP1252 or ISO-8859-1.
        def tidy_bytes(string, force = false)
          return string if string.empty?
-
-        if force
-          return string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
+          return recode_windows1252_chars(string) if force
+          string.scrub { |bad| recode_windows1252_chars(bad) }
        end
+      else
+        def tidy_bytes(string, force = false)
+          return string if string.empty?
+          return recode_windows1252_chars(string) if force

          # We can't transcode to the same format, so we choose a nearly-identical encoding.
          # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
@ -244,6 +249,7 @@ def tidy_bytes(string, force = false)

          out.encode!(Encoding::UTF_8)
        end
+      end

      # Returns the KC normalization of the string by default. NFKC is
      # considered the best normalization form for passing strings to databases
@ -371,14 +377,8 @@ def apply_mapping(string, mapping) #:nodoc:
        end.pack('U*')
      end

-      def tidy_byte(byte)
-        if byte < 160
-          [database.cp1252[byte] || byte].pack("U").unpack("C*")
-        elsif byte < 192
-          [194, byte]
-        else
-          [195, byte - 64]
-        end
+      def recode_windows1252_chars(string)
+        string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
      end

      def database