end
def encode_and_link_urls(html, accounts = nil, options = {})
- entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
+ entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)
if accounts.is_a?(Hash)
options = accounts
result.flatten.join
end
+ # Extracts entities from +text+ while keeping URLs that contain raw
+ # (unescaped) characters above U+00FF, such as "?utf8=✓", in one piece.
+ #
+ # Every character with a code point above 0xFF is percent-escaped, the
+ # extractor is run on that escaped copy, and each entity it reports is
+ # translated back to offsets and values of the original +text+. Those
+ # entities are then combined with a plain extraction over +text+ and handed
+ # to Extractor.remove_overlapping_entities.
+ #
+ # @param text [String] the text to scan
+ # @param options [Hash] forwarded unchanged to
+ #   Extractor.extract_entities_with_indices
+ # @return the result of Extractor.remove_overlapping_entities for the
+ #   combined entity list
+ def utf8_friendly_extractor(text, options = {})
+   # escaped_offsets[i] is the offset in +escaped+ at which the i-th character
+   # of +text+ starts; the last element is the total escaped length.
+   escaped_offsets = [0]
+
+   escaped = text.each_char.map do |char|
+     output = char.ord > 0xFF ? CGI.escape(char) : char
+     escaped_offsets << escaped_offsets.last + output.length
+     output
+   end.join
+
+   # Reverse lookup (escaped offset => original offset), built once so each
+   # entity boundary is resolved without a linear scan. Offsets are strictly
+   # increasing, so every key is unique.
+   original_offset_for = escaped_offsets.each_with_index.to_h
+
+   # NOTE(review): entities that also carry :list_slug (the "@user/list-name"
+   # mention form) get no special treatment here; the original author could
+   # not obtain one from the extractor, so that case is unverified.
+   special = Extractor.extract_entities_with_indices(escaped, options).map do |entity|
+     start_index = original_offset_for[entity[:indices].first]
+     end_index = original_offset_for[entity[:indices].last]
+
+     # A boundary that lands inside an escape sequence has no counterpart in
+     # the original text; drop the entity instead of failing on nil.
+     next if start_index.nil? || end_index.nil?
+
+     # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
+     key = (entity.keys & [:url, :hashtag, :screen_name, :cashtag]).first
+
+     # The stored value excludes the leading #, @ or $ of tags and mentions.
+     has_prefix_char = [:hashtag, :screen_name, :cashtag].include?(key)
+     value_start = has_prefix_char ? start_index + 1 : start_index
+
+     entity.merge(
+       :indices => [start_index, end_index],
+       key => text[value_start...end_index]
+     )
+   end.compact
+
+   standard = Extractor.extract_entities_with_indices(text, options)
+
+   Extractor.remove_overlapping_entities(special + standard)
+ end
+
def link_to_url(entity, options = {})
url = Addressable::URI.parse(entity[:url])
html_attrs = { target: '_blank', rel: 'nofollow noopener' }
end
+ # Each nested context feeds a URL with a query string in as `text` and
+ # expects the complete URL, unchanged, inside an href attribute of the
+ # subject's output (the subject itself is defined outside this excerpt).
context 'given a URL with a query string' do
- let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
+ # %E2%9C%93 is the percent-encoded UTF-8 form of "✓" (U+2713).
+ context 'with escaped unicode character' do
+ let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
- it 'matches the full URL' do
- is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"'
+ it 'matches the full URL' do
+ is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"'
+ end
+ end
+
+ # Raw "✓" in the middle of the query: the parameters after it must still
+ # be part of the linked URL.
+ context 'with unicode character' do
+ let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' }
+
+ it 'matches the full URL' do
+ is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"'
+ end
+ end
+
+ # Raw "✓" as the very last character of the URL.
+ context 'with unicode character at the end' do
+ let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' }
+
+ it 'matches the full URL' do
+ is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"'
+ end
+ end
+
+ # Mixed input: the already-escaped sequence must come out exactly as
+ # written, alongside the raw character.
+ context 'with escaped and not escaped unicode characters' do
+ let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' }
+
+ it 'preserves escaped unicode characters' do
+ is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"'
+ end
end
end