cat aescling's git repositories - mastodon.git/commitdiff
Add more accurate account search (#11537)
author Eugen Rochko <eugen@zeonfederated.com>
Thu, 15 Aug 2019 23:24:03 +0000 (01:24 +0200)
committer GitHub <noreply@github.com>
Thu, 15 Aug 2019 23:24:03 +0000 (01:24 +0200)
* Add more accurate account search

When ElasticSearch is available, a more accurate search is implemented:

- Using edge n-gram index for acct and display name
- Using asciifolding and cjk width normalization on display names
- Using Gaussian decay on account activity for additional scoring (recency)
- Using followers/friends ratio for additional scoring (spamminess)
- Using followers number for additional scoring (size)

The exact match precedence only takes effect when the input conforms
to the username format and the username part of it is complete, i.e.
when the user started typing the domain part.

* Support single-letter usernames

* Fix tests

* Fix not picking up account updates

* Add weights and normalization for scores, skip zero terms queries

* Use local counts for accounts index, adjust search parameters

* Fix mistakes

* Using updated_at of accounts is inadequate for remote accounts

app/chewy/accounts_index.rb [new file with mode: 0644]
app/models/account.rb
app/models/account_stat.rb
app/services/account_search_service.rb
spec/services/account_search_service_spec.rb

diff --git a/app/chewy/accounts_index.rb b/app/chewy/accounts_index.rb
new file mode 100644 (file)
index 0000000..e11b800
--- /dev/null
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+class AccountsIndex < Chewy::Index # Chewy index definition for accounts, queried by AccountSearchService
+  settings index: { refresh_interval: '5m' }, analysis: { # refresh interval of 5 minutes, plus the custom analysis chain below
+    analyzer: {
+      content: { # whitespace-tokenized analyzer; referenced below as the search_analyzer of both text fields
+        tokenizer: 'whitespace',
+        filter: %w(lowercase asciifolding cjk_width),
+      },
+
+      edge_ngram: { # analyzer used at index time for display_name and acct; same filters as `content`
+        tokenizer: 'edge_ngram',
+        filter: %w(lowercase asciifolding cjk_width),
+      },
+    },
+
+    tokenizer: {
+      edge_ngram: { # custom tokenizer of type edge_ngram with gram lengths from 1 to 15
+        type: 'edge_ngram',
+        min_gram: 1,
+        max_gram: 15,
+      },
+    },
+  }
+
+  define_type ::Account.searchable.includes(:account_stat), delete_if: ->(account) { account.destroyed? || !account.searchable? } do # source scope is Account.searchable; delete_if is true for destroyed or non-searchable accounts
+    root date_detection: false do
+      field :id, type: 'long'
+      field :display_name, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
+      field :acct, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') } # username alone when domain is nil, otherwise username@domain
+      field :following_count, type: 'long', value: ->(account) { account.active_relationships.count } # counted from the account's active_relationships
+      field :followers_count, type: 'long', value: ->(account) { account.passive_relationships.count } # counted from the account's passive_relationships
+      field :last_status_at, type: 'date', value: ->(account) { account.last_status_at || account.created_at } # falls back to created_at when last_status_at is nil
+    end
+  end
+end
index 60c06aaf0b41171ff56280c64fe07aa803dbe0a4..392cc625fcc3a8037ddeaef8e2a0817614ce96a4 100644 (file)
@@ -127,6 +127,8 @@ class Account < ApplicationRecord
 
   delegate :chosen_languages, to: :user, prefix: false, allow_nil: true
 
+  update_index('accounts#account', :self) if Chewy.enabled?
+
   def local?
     domain.nil?
   end
@@ -169,6 +171,10 @@ class Account < ApplicationRecord
     subscription_expires_at.present?
   end
 
+  def searchable?
+    !(suspended? || moved?)
+  end
+
   def possibly_stale?
     last_webfingered_at.nil? || last_webfingered_at <= 1.day.ago
   end
index 9813aa84ff105420b0629e5a9f501ba78296ff30..6d1097cec68967d7e81b47275c07e78bc5ee043d 100644 (file)
@@ -16,6 +16,8 @@
 class AccountStat < ApplicationRecord
   belongs_to :account, inverse_of: :account_stat
 
+  update_index('accounts#account', :account) if Chewy.enabled?
+
   def increment_count!(key)
     update(attributes_for_increment(key))
   end
index e1874d0450f7ed35677bee87e9ba82405d99018c..2d602a31deab8b622c520d5169257ad4fb95dc60 100644 (file)
@@ -4,105 +4,134 @@ class AccountSearchService < BaseService
   attr_reader :query, :limit, :offset, :options, :account
 
   def call(query, account = nil, options = {})
-    @query   = query.strip
-    @limit   = options[:limit].to_i
-    @offset  = options[:offset].to_i
-    @options = options
-    @account = account
+    @acct_hint = query.start_with?('@')
+    @query     = query.strip.gsub(/\A@/, '')
+    @limit     = options[:limit].to_i
+    @offset    = options[:offset].to_i
+    @options   = options
+    @account   = account
 
-    search_service_results
+    search_service_results.compact.uniq
   end
 
   private
 
   def search_service_results
-    return [] if query_blank_or_hashtag? || limit < 1
+    return [] if query.blank? || limit < 1
 
-    if resolving_non_matching_remote_account?
-      [ResolveAccountService.new.call("#{query_username}@#{query_domain}")].compact
-    else
-      search_results_and_exact_match.compact.uniq
-    end
+    [exact_match] + search_results
   end
 
-  def resolving_non_matching_remote_account?
-    offset.zero? && options[:resolve] && !exact_match? && !domain_is_local?
-  end
+  def exact_match
+    return unless offset.zero? && username_complete?
 
-  def search_results_and_exact_match
-    return search_results.to_a unless offset.zero?
+    return @exact_match if defined?(@exact_match)
 
-    results = [exact_match]
+    @exact_match = begin
+      if options[:resolve]
+        ResolveAccountService.new.call(query)
+      elsif domain_is_local?
+        Account.find_local(query_username)
+      else
+        Account.find_remote(query_username, query_domain)
+      end
+    end
+  end
 
-    return results if exact_match? && limit == 1
+  def search_results
+    return [] if limit_for_non_exact_results.zero?
 
-    results + search_results.to_a
+    @search_results ||= begin
+      if Chewy.enabled?
+        from_elasticsearch
+      else
+        from_database
+      end
+    end
   end
 
-  def query_blank_or_hashtag?
-    query.blank? || query.start_with?('#')
+  def from_database
+    if account
+      advanced_search_results
+    else
+      simple_search_results
+    end
   end
 
-  def split_query_string
-    @split_query_string ||= query.gsub(/\A@/, '').split('@')
+  def advanced_search_results
+    Account.advanced_search_for(terms_for_query, account, limit_for_non_exact_results, options[:following], offset)
   end
 
-  def query_username
-    @query_username ||= split_query_string.first || ''
+  def simple_search_results
+    Account.search_for(terms_for_query, limit_for_non_exact_results, offset)
   end
 
-  def query_domain
-    @query_domain ||= query_without_split? ? nil : split_query_string.last
-  end
+  def from_elasticsearch
+    must_clauses   = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct) : %w(acct^2 display_name), type: 'best_fields' } }]
+    should_clauses = []
 
-  def query_without_split?
-    split_query_string.size == 1
-  end
+    if account
+      return [] if options[:following] && following_ids.empty?
 
-  def domain_is_local?
-    @domain_is_local ||= TagManager.instance.local_domain?(query_domain)
-  end
+      if options[:following]
+        must_clauses << { terms: { id: following_ids } }
+      elsif following_ids.any?
+        should_clauses << { terms: { id: following_ids, boost: 100 } }
+      end
+    end
 
-  def search_from
-    options[:following] && account ? account.following : Account
-  end
+    query     = { bool: { must: must_clauses, should: should_clauses } }
+    functions = [reputation_score_function, followers_score_function, time_distance_function]
 
-  def exact_match?
-    exact_match.present?
-  end
+    records = AccountsIndex.query(function_score: { query: query, functions: functions, boost_mode: 'multiply', score_mode: 'avg' })
+                           .limit(limit_for_non_exact_results)
+                           .offset(offset)
+                           .objects
+                           .compact
 
-  def exact_match
-    return @exact_match if defined?(@exact_match)
+    ActiveRecord::Associations::Preloader.new.preload(records, :account_stat)
 
-    @exact_match = begin
-      if domain_is_local?
-        search_from.without_suspended.find_local(query_username)
-      else
-        search_from.without_suspended.find_remote(query_username, query_domain)
-      end
-    end
+    records
   end
 
-  def search_results
-    @search_results ||= begin
-      if account
-        advanced_search_results
-      else
-        simple_search_results
-      end
-    end
+  def reputation_score_function
+    {
+      script_score: {
+        script: {
+          source: "(doc['followers_count'].value + 0.0) / (doc['followers_count'].value + doc['following_count'].value + 1)",
+        },
+      },
+    }
   end
 
-  def advanced_search_results
-    Account.advanced_search_for(terms_for_query, account, limit_for_non_exact_results, options[:following], offset)
+  def followers_score_function
+    {
+      field_value_factor: {
+        field: 'followers_count',
+        modifier: 'log2p',
+        missing: 1,
+      },
+    }
   end
 
-  def simple_search_results
-    Account.search_for(terms_for_query, limit_for_non_exact_results, offset)
+  def time_distance_function
+    {
+      gauss: {
+        last_status_at: {
+          scale: '30d',
+          offset: '30d',
+          decay: 0.3,
+        },
+      },
+    }
+  end
+
+  def following_ids
+    @following_ids ||= account.active_relationships.pluck(:target_account_id)
   end
 
   def limit_for_non_exact_results
-    if offset.zero? && exact_match?
+    if exact_match?
       limit - 1
     else
       limit
@@ -113,7 +142,39 @@ class AccountSearchService < BaseService
     if domain_is_local?
       query_username
     else
-      "#{query_username} #{query_domain}"
+      query
     end
   end
+
+  def split_query_string # memoized segments of the query split on '@'
+    @split_query_string ||= query.split('@')
+  end
+
+  def query_username # first segment of the split query, or '' when the split yields nothing
+    @query_username ||= split_query_string.first || ''
+  end
+
+  def query_domain # last segment of the split query; nil when the query has only one segment
+    @query_domain ||= query_without_split? ? nil : split_query_string.last
+  end
+
+  def query_without_split?
+    split_query_string.size == 1
+  end
+
+  def domain_is_local? # asks TagManager whether query_domain is local; note ||= only caches a truthy answer, a false one is recomputed per call
+    @domain_is_local ||= TagManager.instance.local_domain?(query_domain)
+  end
+
+  def exact_match?
+    exact_match.present?
+  end
+
+  def username_complete? # truthy when the query contains '@' and, prefixed with '@', matches Account::MENTION_RE
+    query.include?('@') && "@#{query}" =~ Account::MENTION_RE
+  end
+
+  def likely_acct? # truthy when the raw query started with '@' (@acct_hint, set in #call) or username_complete? holds
+    @acct_hint || username_complete?
+  end
 end
index 7b071b378e04b06fe6894ad4090205982f85e7bf..5b7182586c5f2f4406eb180ffa0eadfb35f29814 100644 (file)
 require 'rails_helper'
 
 describe AccountSearchService, type: :service do
-  describe '.call' do
-    describe 'with a query to ignore' do
+  describe '#call' do
+    context 'with a query to ignore' do
       it 'returns empty array for missing query' do
         results = subject.call('', nil, limit: 10)
 
         expect(results).to eq []
       end
-      it 'returns empty array for hashtag query' do
-        results = subject.call('#tag', nil, limit: 10)
 
-        expect(results).to eq []
-      end
       it 'returns empty array for limit zero' do
         Fabricate(:account, username: 'match')
+
         results = subject.call('match', nil, limit: 0)
 
         expect(results).to eq []
       end
     end
 
-    describe 'searching for a simple term that is not an exact match' do
+    context 'searching for a simple term that is not an exact match' do
       it 'does not return a nil entry in the array for the exact match' do
-        match = Fabricate(:account, username: 'matchingusername')
-
+        account = Fabricate(:account, username: 'matchingusername')
         results = subject.call('match', nil, limit: 5)
-        expect(results).to eq [match]
-      end
-    end
 
-    describe 'searching local and remote users' do
-      describe "when only '@'" do
-        before do
-          allow(Account).to receive(:find_local)
-          allow(Account).to receive(:search_for)
-          subject.call('@', nil, limit: 10)
-        end
-
-        it 'uses find_local with empty query to look for local accounts' do
-          expect(Account).to have_received(:find_local).with('')
-        end
-      end
-
-      describe 'when no domain' do
-        before do
-          allow(Account).to receive(:find_local)
-          allow(Account).to receive(:search_for)
-          subject.call('one', nil, limit: 10)
-        end
-
-        it 'uses find_local to look for local accounts' do
-          expect(Account).to have_received(:find_local).with('one')
-        end
-
-        it 'uses search_for to find matches' do
-          expect(Account).to have_received(:search_for).with('one', 10, 0)
-        end
-      end
-
-      describe 'when there is a domain' do
-        before do
-          allow(Account).to receive(:find_remote)
-        end
-
-        it 'uses find_remote to look for remote accounts' do
-          subject.call('two@example.com', nil, limit: 10)
-          expect(Account).to have_received(:find_remote).with('two', 'example.com')
-        end
-
-        describe 'and there is no account provided' do
-          it 'uses search_for to find matches' do
-            allow(Account).to receive(:search_for)
-            subject.call('two@example.com', nil, limit: 10, resolve: false)
-
-            expect(Account).to have_received(:search_for).with('two example.com', 10, 0)
-          end
-        end
-
-        describe 'and there is an account provided' do
-          it 'uses advanced_search_for to find matches' do
-            account = Fabricate(:account)
-            allow(Account).to receive(:advanced_search_for)
-            subject.call('two@example.com', account, limit: 10, resolve: false)
-
-            expect(Account).to have_received(:advanced_search_for).with('two example.com', account, 10, nil, 0)
-          end
-        end
+        expect(results).to eq [account]
       end
     end
 
-    describe 'with an exact match' do
-      it 'returns exact match first, and does not return duplicates' do
-        partial = Fabricate(:account, username: 'exactness')
-        exact = Fabricate(:account, username: 'exact')
-
-        results = subject.call('exact', nil, limit: 10)
-        expect(results.size).to eq 2
-        expect(results).to eq [exact, partial]
-      end
-    end
-
-    describe 'when there is a local domain' do
+    context 'when there is a local domain' do
       around do |example|
         before = Rails.configuration.x.local_domain
+
         example.run
+
         Rails.configuration.x.local_domain = before
       end
 
       it 'returns exact match first' do
         remote     = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e')
         remote_too = Fabricate(:account, username: 'b', domain: 'remote', display_name: 'e')
-        exact = Fabricate(:account, username: 'e')
+        exact      = Fabricate(:account, username: 'e')
+
         Rails.configuration.x.local_domain = 'example.com'
 
         results = subject.call('e@example.com', nil, limit: 2)
+
         expect(results.size).to eq 2
         expect(results).to eq([exact, remote]).or eq([exact, remote_too])
       end
     end
 
-    describe 'when there is a domain but no exact match' do
+    context 'when there is a domain but no exact match' do
       it 'follows the remote account when resolve is true' do
         service = double(call: nil)
         allow(ResolveAccountService).to receive(:new).and_return(service)
@@ -138,23 +68,21 @@ describe AccountSearchService, type: :service do
       end
     end
 
-    describe 'should not include suspended accounts' do
-      it 'returns the fuzzy match first, and does not return suspended exacts' do
-        partial = Fabricate(:account, username: 'exactness')
-        exact = Fabricate(:account, username: 'exact', suspended: true)
+    it 'returns the fuzzy match first, and does not return suspended exacts' do
+      partial = Fabricate(:account, username: 'exactness')
+      exact   = Fabricate(:account, username: 'exact', suspended: true)
+      results = subject.call('exact', nil, limit: 10)
 
-        results = subject.call('exact', nil, limit: 10)
-        expect(results.size).to eq 1
-        expect(results).to eq [partial]
-      end
+      expect(results.size).to eq 1
+      expect(results).to eq [partial]
+    end
 
-      it "does not return suspended remote accounts" do
-        remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e', suspended: true)
+    it "does not return suspended remote accounts" do
+      remote  = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e', suspended: true)
+      results = subject.call('a@example.com', nil, limit: 2)
 
-        results = subject.call('a@example.com', nil, limit: 2)
-        expect(results.size).to eq 0
-        expect(results).to eq []
-      end
+      expect(results.size).to eq 0
+      expect(results).to eq []
     end
   end
 end