Fix URL linkifier grabbing full-width spaces and quotations (#9997)

author Eugen Rochko <eugen@zeonfederated.com>

Sat, 9 Feb 2019 19:13:11 +0000 (20:13 +0100)

committer GitHub <noreply@github.com>

Sat, 9 Feb 2019 19:13:11 +0000 (20:13 +0100)
author Eugen Rochko <eugen@zeonfederated.com>
Sat, 9 Feb 2019 19:13:11 +0000 (20:13 +0100)
committer GitHub <noreply@github.com>
Sat, 9 Feb 2019 19:13:11 +0000 (20:13 +0100)
diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb

index 6603b8df178c9138b0ec0e6c8f6d83e67fa8171a..0653214f53d693e7e3ad7e9adf2c6ff9c1f68823 100644 (file)
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -199,12 +199,22 @@ class Formatter
      result.flatten.join
    end
  
+  UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
+
    def utf8_friendly_extractor(text, options = {})
      old_to_new_index = [0]
  
      escaped = text.chars.map do |c|
-      output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c
+      output = begin
+        if c.ord.to_s(16).length > 2 && UNICODE_ESCAPE_BLACKLIST_RE.match(c).nil?
+          CGI.escape(c)
+        else
+          c
+        end
+      end
+
        old_to_new_index << old_to_new_index.last + output.length
+
        output
      end.join
  
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb

index 0e8f5bfeb47353d1c151e8917f03650e000067b6..0ddbbee9828bfdcfefb1e28964d27df3ab28bef9 100644 (file)
--- a/config/initializers/twitter_regex.rb
+++ b/config/initializers/twitter_regex.rb
@@ -1,7 +1,7 @@
  module Twitter
    class Regex
-    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
-    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
+    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
+    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
      REGEXEN[:valid_url_balanced_parens] = /
        \(
          (?:
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb

index 8fb6695a9bf12c738847adf224ad18e5bba6dfb2..96d2fc7e06c247dd15d6643960bc2bfb015605f2 100644 (file)
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -115,6 +115,22 @@ RSpec.describe Formatter do
        end
      end
  
+    context 'given a URL in quotation marks' do
+      let(:text) { '"https://example.com/"' }
+
+      it 'does not match the quotation marks' do
+        is_expected.to include 'href="https://example.com/"'
+      end
+    end
+
+    context 'given a URL in angle brackets' do
+      let(:text) { '<https://example.com/>' }
+
+      it 'does not match the angle brackets' do
+        is_expected.to include 'href="https://example.com/"'
+      end
+    end
+
      context 'given a URL with Japanese path string' do
        let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
  
@@ -131,6 +147,22 @@ RSpec.describe Formatter do
        end
      end
  
+    context 'given a URL with a full-width space' do
+      let(:text) { 'https://example.com/　abc123' }
+
+      it 'does not match the full-width space' do
+        is_expected.to include 'href="https://example.com/"'
+      end
+    end
+
+    context 'given a URL in Japanese quotation marks' do
+      let(:text) { '「[https://example.org/」' }
+
+      it 'does not match the quotation marks' do
+        is_expected.to include 'href="https://example.org/"'
+      end
+    end
+
      context 'given a URL with Simplified Chinese path string' do
        let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
  
@@ -150,7 +182,11 @@ RSpec.describe Formatter do
      context 'given a URL containing unsafe code (XSS attack, visible part)' do
        let(:text) { %q{http://example.com/b<del>b</del>} }
  
-      it 'escapes the HTML in the URL' do
+      it 'does not include the HTML in the URL' do
+        is_expected.to include '"http://example.com/b"'
+      end
+
+      it 'escapes the HTML' do
          is_expected.to include '&lt;del&gt;b&lt;/del&gt;'
        end
      end
@@ -158,7 +194,11 @@ RSpec.describe Formatter do
      context 'given a URL containing unsafe code (XSS attack, invisible part)' do
        let(:text) { %q{http://example.com/blahblahblahblah/a<script>alert("Hello")</script>} }
  
-      it 'escapes the HTML in the URL' do
+      it 'does not include the HTML in the URL' do
+        is_expected.to include '"http://example.com/blahblahblahblah/a"'
+      end
+
+      it 'escapes the HTML' do
          is_expected.to include '&lt;script&gt;alert(&quot;Hello&quot;)&lt;/script&gt;'
        end
      end
author	Eugen Rochko <eugen@zeonfederated.com>
	Sat, 9 Feb 2019 19:13:11 +0000 (20:13 +0100)
committer	GitHub <noreply@github.com>
	Sat, 9 Feb 2019 19:13:11 +0000 (20:13 +0100)
app/lib/formatter.rb		patch | blob | history
config/initializers/twitter_regex.rb		patch | blob | history
spec/lib/formatter_spec.rb		patch | blob | history