end
standard = Extractor.extract_entities_with_indices(text, options)
- xmpp = Extractor.extract_xmpp_uris_with_indices(text, options)
+ extra = Extractor.extract_extra_uris_with_indices(text, options)
- Extractor.remove_overlapping_entities(special + standard + xmpp)
+ Extractor.remove_overlapping_entities(special + standard + extra)
end
+ def html_friendly_extractor(html, options = {})
+ gaps = []
+ total_offset = 0
+
+ escaped = html.gsub(/<[^>]*>|&#[0-9]+;/) do |match|
+ total_offset += match.length - 1
+ end_offset = Regexp.last_match.end(0)
+ gaps << [end_offset - total_offset, total_offset]
+ "\u200b"
+ end
+
+ entities = Extractor.extract_hashtags_with_indices(escaped, :check_url_overlap => false) +
+ Extractor.extract_mentions_or_lists_with_indices(escaped)
+ Extractor.remove_overlapping_entities(entities).map do |extract|
+ pos = extract[:indices].first
+ offset_idx = gaps.rindex { |gap| gap.first <= pos }
+ offset = offset_idx.nil? ? 0 : gaps[offset_idx].last
+ next extract.merge(
+ :indices => [extract[:indices].first + offset, extract[:indices].last + offset]
+ )
+ end
+ end
+
def link_to_url(entity, options = {})
url = Addressable::URI.parse(entity[:url])
html_attrs = { target: '_blank', rel: 'nofollow noopener noreferrer' }