Skip to content
Snippets Groups Projects
Unverified Commit 4f6af879 authored by Eugen Rochko's avatar Eugen Rochko Committed by GitHub
Browse files

Change spam check to apply to local accounts and add a threshold (#11806)

Instead of detecting spam on first duplicate message, add a
threshold of 5 such messages to reduce false positives
parent 57770698
No related branches found
No related tags found
No related merge requests found
......@@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
end
def check_for_spam
spam_check = SpamCheck.new(@status)
return if spam_check.skip?
if spam_check.spam?
spam_check.flag!
else
spam_check.remember!
end
SpamCheck.perform(@status)
end
def forward_for_reply
......
......@@ -4,9 +4,25 @@ class SpamCheck
include Redisable
include ActionView::Helpers::TextHelper
# Threshold over which two Nilsimsa values are considered
# to refer to the same text
NILSIMSA_COMPARE_THRESHOLD = 95
NILSIMSA_MIN_SIZE = 10
EXPIRE_SET_AFTER = 1.week.seconds
# Nilsimsa doesn't work well on small inputs, so below
# this size, we check only for exact matches with MD5
NILSIMSA_MIN_SIZE = 10
# How long to keep the trail of digests between updates,
# there is no reason to store it forever
EXPIRE_SET_AFTER = 1.week.seconds
# How many digests to keep in an account's trail. If it's
# too small, spam could rotate around different message templates
MAX_TRAIL_SIZE = 10
# How many detected duplicates to allow through before
# considering the message as spam
THRESHOLD = 5
def initialize(status)
@account = status.account
......@@ -21,9 +37,9 @@ class SpamCheck
if insufficient_data?
false
elsif nilsimsa?
any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
else
any_other_digest?('md5') { |_, other_digest| other_digest == digest }
digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
end
end
......@@ -38,7 +54,7 @@ class SpamCheck
# get the correct status ID back, we have to save it in the string value
redis.zadd(redis_key, @status.id, digest_with_algorithm)
redis.zremrangebyrank(redis_key, '0', '-10')
redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
redis.expire(redis_key, EXPIRE_SET_AFTER)
end
......@@ -78,6 +94,20 @@ class SpamCheck
end
end
class << self
def perform(status)
spam_check = new(status)
return if spam_check.skip?
if spam_check.spam?
spam_check.flag!
else
spam_check.remember!
end
end
end
private
def disabled?
......@@ -149,14 +179,14 @@ class SpamCheck
redis.zrange(redis_key, 0, -1)
end
def any_other_digest?(filter_algorithm)
other_digests.any? do |record|
def digests_over_threshold?(filter_algorithm)
other_digests.select do |record|
algorithm, other_digest, status_id = record.split(':')
next unless algorithm == filter_algorithm
yield algorithm, other_digest, status_id
end
end.size >= THRESHOLD
end
def matching_status_ids
......
......@@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService
end
status.save!
check_for_spam(status)
mentions.each { |mention| create_notification(mention) }
end
......@@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService
def resolve_account_service
ResolveAccountService.new
end
def check_for_spam(status)
SpamCheck.perform(status)
end
end
......@@ -86,23 +86,33 @@ RSpec.describe SpamCheck do
end
it 'returns true for duplicate statuses to the same recipient' do
status1 = status_with_html('@alice Hello')
described_class.new(status1).remember!
described_class::THRESHOLD.times do
status1 = status_with_html('@alice Hello')
described_class.new(status1).remember!
end
status2 = status_with_html('@alice Hello')
expect(described_class.new(status2).spam?).to be true
end
it 'returns true for duplicate statuses to different recipients' do
status1 = status_with_html('@alice Hello')
described_class.new(status1).remember!
described_class::THRESHOLD.times do
status1 = status_with_html('@alice Hello')
described_class.new(status1).remember!
end
status2 = status_with_html('@bob Hello')
expect(described_class.new(status2).spam?).to be true
end
it 'returns true for nearly identical statuses with random numbers' do
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
status1 = status_with_html('@alice ' + source_text + ' 1234')
described_class.new(status1).remember!
described_class::THRESHOLD.times do
status1 = status_with_html('@alice ' + source_text + ' 1234')
described_class.new(status1).remember!
end
status2 = status_with_html('@bob ' + source_text + ' 9568')
expect(described_class.new(status2).spam?).to be true
end
......@@ -140,9 +150,9 @@ RSpec.describe SpamCheck do
let(:redis_key) { spam_check.send(:redis_key) }
it 'remembers' do
expect do
spam_check.remember!
end.to change { Redis.current.exists(redis_key) }.from(false).to(true)
expect(Redis.current.exists(redis_key)).to be true
spam_check.remember!
expect(Redis.current.exists(redis_key)).to be true
end
end
......@@ -156,9 +166,9 @@ RSpec.describe SpamCheck do
end
it 'resets' do
expect do
spam_check.reset!
end.to change { Redis.current.exists(redis_key) }.from(true).to(false)
expect(Redis.current.exists(redis_key)).to be true
spam_check.reset!
expect(Redis.current.exists(redis_key)).to be false
end
end
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment