include Redisable
include ActionView::Helpers::TextHelper
+ # Nilsimsa comparison score at or above which two digests
+ # are considered to refer to the same text
NILSIMSA_COMPARE_THRESHOLD = 95
- NILSIMSA_MIN_SIZE = 10
- EXPIRE_SET_AFTER = 1.week.seconds
+
+ # Nilsimsa doesn't work well on small inputs, so below
+ # this size, we check only for exact matches with MD5
+ NILSIMSA_MIN_SIZE = 10
+
+ # How long to keep the trail of digests after its last update;
+ # there is no reason to store it forever
+ EXPIRE_SET_AFTER = 1.week.seconds
+
+ # How many digests to keep in an account's trail. If it's
+ # too small, spam could rotate around different message templates
+ MAX_TRAIL_SIZE = 10
+
+ # How many detected duplicates to allow through before
+ # considering the message as spam
+ THRESHOLD = 5
def initialize(status)
@account = status.account
if insufficient_data?
false
elsif nilsimsa?
- any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
+ digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
else
- any_other_digest?('md5') { |_, other_digest| other_digest == digest }
+ digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
end
end
# get the correct status ID back, we have to save it in the string value
redis.zadd(redis_key, @status.id, digest_with_algorithm)
- redis.zremrangebyrank(redis_key, '0', '-10')
+ redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
redis.expire(redis_key, EXPIRE_SET_AFTER)
end
end
end
+ class << self
+ def perform(status)
+ spam_check = new(status)
+
+ return if spam_check.skip?
+
+ if spam_check.spam?
+ spam_check.flag!
+ else
+ spam_check.remember!
+ end
+ end
+ end
+
private
def disabled?
redis.zrange(redis_key, 0, -1)
end
- def any_other_digest?(filter_algorithm)
- other_digests.any? do |record|
+ def digests_over_threshold?(filter_algorithm)
+ other_digests.select do |record|
algorithm, other_digest, status_id = record.split(':')
next unless algorithm == filter_algorithm
yield algorithm, other_digest, status_id
- end
+ end.size >= THRESHOLD
end
def matching_status_ids
end
it 'returns true for duplicate statuses to the same recipient' do
- status1 = status_with_html('@alice Hello')
- described_class.new(status1).remember!
+ described_class::THRESHOLD.times do
+ status1 = status_with_html('@alice Hello')
+ described_class.new(status1).remember!
+ end
+
status2 = status_with_html('@alice Hello')
expect(described_class.new(status2).spam?).to be true
end
it 'returns true for duplicate statuses to different recipients' do
- status1 = status_with_html('@alice Hello')
- described_class.new(status1).remember!
+ described_class::THRESHOLD.times do
+ status1 = status_with_html('@alice Hello')
+ described_class.new(status1).remember!
+ end
+
status2 = status_with_html('@bob Hello')
expect(described_class.new(status2).spam?).to be true
end
it 'returns true for nearly identical statuses with random numbers' do
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
- status1 = status_with_html('@alice ' + source_text + ' 1234')
- described_class.new(status1).remember!
+
+ described_class::THRESHOLD.times do
+ status1 = status_with_html('@alice ' + source_text + ' 1234')
+ described_class.new(status1).remember!
+ end
+
status2 = status_with_html('@bob ' + source_text + ' 9568')
expect(described_class.new(status2).spam?).to be true
end
let(:redis_key) { spam_check.send(:redis_key) }
it 'remembers' do
- expect do
- spam_check.remember!
- end.to change { Redis.current.exists(redis_key) }.from(false).to(true)
+ expect(Redis.current.exists(redis_key)).to be true # NOTE(review): key is expected to exist before remember! runs; presumably written while the status is built - confirm
+ spam_check.remember!
+ expect(Redis.current.exists(redis_key)).to be true # key is still expected to exist after remember!
end
end
end
it 'resets' do
- expect do
- spam_check.reset!
- end.to change { Redis.current.exists(redis_key) }.from(true).to(false)
+ expect(Redis.current.exists(redis_key)).to be true # precondition: the key exists before reset!
+ spam_check.reset!
+ expect(Redis.current.exists(redis_key)).to be false # the key is expected to be gone after reset!
end
end