Skip to content

Commit

Permalink
Merge branch 'hotfix/23.16.6' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
mfraezz committed Dec 18, 2023
2 parents 561d81e + af35c28 commit 3ee8410
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 20 deletions.
21 changes: 15 additions & 6 deletions osf/external/spam/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,20 @@ def _check_resource_for_domains(guid, content):
resource = guid.referent
spammy_domains = []
referrer_content_type = ContentType.objects.get_for_model(resource)
for domain in _extract_domains(content):
notable_domain, _ = NotableDomain.objects.get_or_create(domain=domain)
for domain, note in _extract_domains(content):
notable_domain, _ = NotableDomain.objects.get_or_create(
domain=domain,
defaults={'note': note}
)
if notable_domain.note == NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT:
spammy_domains.append(notable_domain.domain)
DomainReference.objects.get_or_create(
domain=notable_domain,
referrer_object_id=resource.id,
referrer_content_type=referrer_content_type,
defaults={'is_triaged': notable_domain.note != NotableDomain.Note.UNKNOWN}
defaults={
'is_triaged': notable_domain.note not in (NotableDomain.Note.UNKNOWN, NotableDomain.Note.UNVERIFIED)
}
)
if spammy_domains:
resource.confirm_spam(save=True, domains=list(spammy_domains))
Expand All @@ -72,8 +77,11 @@ def check_resource_for_domains_async(guid, content):


def _extract_domains(content):
from osf.models import NotableDomain

extracted_domains = set()
for match in DOMAIN_REGEX.finditer(content):
note = NotableDomain.Note.UNKNOWN
domain = match.group('domain')
if not domain or domain in extracted_domains:
continue
Expand All @@ -85,10 +93,11 @@ def _extract_domains(content):

try:
response = requests.head(constructed_url, timeout=settings.DOMAIN_EXTRACTION_TIMEOUT)
except (requests.exceptions.ConnectionError, requests.exceptions.InvalidURL):
except requests.exceptions.InvalidURL:
# Likely false-positive from a filename.ext
continue
except requests.exceptions.RequestException:
pass
note = NotableDomain.Note.UNVERIFIED
else:
# Store the redirect location (to help catch link shorteners)
if response.status_code in REDIRECT_CODES and 'location' in response.headers:
Expand All @@ -99,7 +108,7 @@ def _extract_domains(content):
# Avoid returning a duplicate domain discovered via redirect
if domain not in extracted_domains:
extracted_domains.add(domain)
yield domain
yield domain, note


@run_postcommit(once_per_request=False, celery=True)
Expand Down
19 changes: 19 additions & 0 deletions osf/migrations/0017_alter_notabledomain_note.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 3.2.17 on 2023-12-12 19:02

from django.db import migrations, models
import osf.models.notable_domain


class Migration(migrations.Migration):
    """Alter the ``note`` field of the ``notabledomain`` model.

    The field is declared as an ``IntegerField`` whose choices run from
    ``EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT`` (0) through ``UNVERIFIED``
    (4), and whose default is the ``UNKNOWN`` member of ``NotableDomain.Note``.
    """

    # Must be applied after the preceding ``osf`` app migration.
    dependencies = [
        ('osf', '0016_auto_20230828_1810'),
    ]

    operations = [
        # The choices list is the serialized form of ``NotableDomain.Note``;
        # it includes the ``UNVERIFIED`` (4) value.
        migrations.AlterField(
            model_name='notabledomain',
            name='note',
            field=models.IntegerField(choices=[(0, 'EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT'), (1, 'ASSUME_HAM_UNTIL_REPORTED'), (2, 'UNKNOWN'), (3, 'IGNORED'), (4, 'UNVERIFIED')], default=osf.models.notable_domain.NotableDomain.Note['UNKNOWN']),
        ),
    ]
1 change: 1 addition & 0 deletions osf/models/notable_domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class Note(IntEnum):
ASSUME_HAM_UNTIL_REPORTED = 1
UNKNOWN = 2
IGNORED = 3
UNVERIFIED = 4  # Request timed out or failed; the domain could not be verified

@classmethod
def choices(cls):
Expand Down
31 changes: 17 additions & 14 deletions osf_tests/test_notable_domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,34 @@ def test_extract_domains__optional_components(self, protocol_component, www_comp
sample_text = f'This is a link: {test_url}'
with mock.patch.object(spam_tasks.requests, 'head'):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['osf.io']
assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__url_in_quotes(self):
sample_text = '"osf.io"'
with mock.patch.object(spam_tasks.requests, 'head'):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['osf.io']
assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__url_in_parens(self):
sample_text = '(osf.io)'
with mock.patch.object(spam_tasks.requests, 'head'):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['osf.io']
assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__captures_domain_with_multiple_subdomains(self):
sample_text = 'This is a link: https://api.test.osf.io'
with mock.patch.object(spam_tasks.requests, 'head'):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['api.test.osf.io']
assert domains == [('api.test.osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__captures_multiple_domains(self):
sample_text = 'This is a domain: http://osf.io. This is another domain: www.cos.io'
with mock.patch.object(spam_tasks.requests, 'head'):
domains = set(spam_tasks._extract_domains(sample_text))
assert domains == {'osf.io', 'cos.io'}
assert domains == {
('osf.io', NotableDomain.Note.UNKNOWN),
('cos.io', NotableDomain.Note.UNKNOWN),
}

def test_extract_domains__no_domains(self):
sample_text = 'http://fakeout!'
Expand All @@ -63,19 +66,19 @@ def test_extract_domains__no_domains(self):
assert not domains
mock_head.assert_not_called()

def test_extract_domains__ignored_if_does_not_resolve(self):
def test_extract_domains__unverfied_if_does_not_resolve(self):
sample_text = 'This.will.not.connect'
with mock.patch.object(spam_tasks.requests, 'head') as mock_head:
mock_head.side_effect = spam_tasks.requests.exceptions.ConnectionError
domains = set(spam_tasks._extract_domains(sample_text))
assert not domains
assert domains == {('This.will.not.connect', NotableDomain.Note.UNVERIFIED)}

def test_actract_domains__returned_on_error(self):
sample_text = 'This.will.timeout'
with mock.patch.object(spam_tasks.requests, 'head') as mock_head:
mock_head.side_effect = spam_tasks.requests.exceptions.Timeout
domains = set(spam_tasks._extract_domains(sample_text))
assert domains == {sample_text}
assert domains == {(sample_text, NotableDomain.Note.UNVERIFIED)}

@pytest.mark.parametrize('status_code', [301, 302, 303, 307, 308])
def test_extract_domains__follows_redirect(self, status_code):
Expand All @@ -85,7 +88,7 @@ def test_extract_domains__follows_redirect(self, status_code):
sample_text = 'redirect.me'
with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['redirected.com']
assert domains == [('redirected.com', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__redirect_code_no_location(self):
mock_response = SimpleNamespace()
Expand All @@ -94,7 +97,7 @@ def test_extract_domains__redirect_code_no_location(self):
sample_text = 'redirect.me'
with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['redirect.me']
assert domains == [('redirect.me', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__redirect_code_bad_location(self):
mock_response = SimpleNamespace()
Expand All @@ -103,7 +106,7 @@ def test_extract_domains__redirect_code_bad_location(self):
sample_text = 'redirect.me'
with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['redirect.me']
assert domains == [('redirect.me', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__redirect_with_full_url_no_protocol(self):
mock_response = SimpleNamespace()
Expand All @@ -114,7 +117,7 @@ def test_extract_domains__redirect_with_full_url_no_protocol(self):
with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response) as mock_object:
domains = list(spam_tasks._extract_domains(sample_text))
mock_object.assert_called_once_with(f'https://{target_url}', timeout=60)
assert domains == ['osf.io']
assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__redirect_with_full_url_and_protocol(self):
mock_response = SimpleNamespace()
Expand All @@ -125,13 +128,13 @@ def test_extract_domains__redirect_with_full_url_and_protocol(self):
with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response) as mock_object:
domains = list(spam_tasks._extract_domains(sample_text))
mock_object.assert_called_once_with(target_url, timeout=60)
assert domains == ['osf.io']
assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__deduplicates(self):
sample_text = 'osf.io osf.io osf.io and, oh, yeah, osf.io'
with mock.patch.object(spam_tasks.requests, 'head'):
domains = list(spam_tasks._extract_domains(sample_text))
assert domains == ['osf.io']
assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)]

def test_extract_domains__ignores_floats(self):
sample_text = 'this is a number 3.1415 not a domain'
Expand Down

0 comments on commit 3ee8410

Please sign in to comment.