Allow most kinds of characters in URL query (fixes #8408) (#8447)

author Jakub Mendyk <jakubmendyk.szkola@gmail.com>

Sat, 2 Feb 2019 18:01:18 +0000 (19:01 +0100)

committer Eugen Rochko <eugen@zeonfederated.com>

Sat, 2 Feb 2019 18:01:18 +0000 (19:01 +0100)
author Jakub Mendyk <jakubmendyk.szkola@gmail.com>
Sat, 2 Feb 2019 18:01:18 +0000 (19:01 +0100)
committer Eugen Rochko <eugen@zeonfederated.com>
Sat, 2 Feb 2019 18:01:18 +0000 (19:01 +0100)
diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb

index 05fd9eeb1261abeeb9bcbb1dbc046c5a385f9cfd..2e358716968a4ba43e691988389442fd7417194c 100644 (file)
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -99,7 +99,7 @@ class Formatter
    end
  
    def encode_and_link_urls(html, accounts = nil, options = {})
-    entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
+    entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)
  
      if accounts.is_a?(Hash)
        options  = accounts
@@ -199,6 +199,43 @@ class Formatter
      result.flatten.join
    end
  
+  # Extracts entities from text in two passes: once from a copy in which every
+  # character above U+00FF is percent-escaped (so a URL query containing e.g. ✓
+  # is matched in full, with indices and values mapped back onto text) and once
+  # from text as-is; the union goes through Extractor.remove_overlapping_entities.
+  def utf8_friendly_extractor(text, options = {})
+    # Offset in the escaped string => offset in text, for each character boundary
+    escaped_to_original = { 0 => 0 }
+    escaped_length = 0
+
+    escaped = text.each_char.with_index(1).map do |char, boundary|
+      output = char.ord > 0xFF ? CGI.escape(char) : char
+      escaped_length += output.length
+      escaped_to_original[escaped_length] = boundary
+      output
+    end.join
+
+    # Note: list_slug (@user/list-name) mentions could not be reproduced here,
+    # so they still require an additional check
+    special = Extractor.extract_entities_with_indices(escaped, options).map do |extract|
+      # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
+      key = (extract.keys & [:url, :hashtag, :screen_name, :cashtag]).first
+      start_index, end_index = extract[:indices].map { |index| escaped_to_original[index] }
+
+      # A bound inside an escape sequence has no counterpart in text; skip it
+      next if start_index.nil? || end_index.nil?
+
+      # account for #, @ or $, which is not part of the value
+      value_start = [:hashtag, :screen_name, :cashtag].include?(key) ? start_index + 1 : start_index
+
+      extract.merge(indices: [start_index, end_index], key => text[value_start..(end_index - 1)])
+    end.compact
+
+    standard = Extractor.extract_entities_with_indices(text, options)
+
+    Extractor.remove_overlapping_entities(special + standard)
+  end
+
    def link_to_url(entity, options = {})
      url        = Addressable::URI.parse(entity[:url])
      html_attrs = { target: '_blank', rel: 'nofollow noopener' }
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb

index 0c1efe7c3cc763799e0e5c77ab1ef820f7d2d3b1..9872d375679dc0759974e6dd3d8fe2f3cf5290da 100644 (file)
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -74,10 +74,36 @@ RSpec.describe Formatter do
      end
  
      context 'given a URL with a query string' do
-      let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
+      context 'with escaped unicode character' do
+        let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
  
-      it 'matches the full URL' do
-        is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
+        it 'matches the full URL' do
+          is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"'
+        end
+      end
+
+      context 'with unicode character' do
+        let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' }
+
+        it 'matches the full URL' do
+          is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&amp;q=autolink"'
+        end
+      end
+
+      context 'with unicode character at the end' do
+        let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' }
+
+        it 'matches the full URL' do
+          is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"'
+        end
+      end
+
+      context 'with escaped and not escaped unicode characters' do
+        let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' }
+
+        it 'preserves escaped unicode characters' do
+          is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;utf81=✓&amp;q=autolink"'
+        end
        end
      end
author	Jakub Mendyk <jakubmendyk.szkola@gmail.com>
	Sat, 2 Feb 2019 18:01:18 +0000 (19:01 +0100)
committer	Eugen Rochko <eugen@zeonfederated.com>
	Sat, 2 Feb 2019 18:01:18 +0000 (19:01 +0100)
app/lib/formatter.rb		patch \| blob \| history
spec/lib/formatter_spec.rb		patch \| blob \| history