From af35c28a493c29161424fc4f9c7df9d662c0e3c1 Mon Sep 17 00:00:00 2001 From: John Tordoff <> Date: Fri, 15 Dec 2023 09:59:09 -0500 Subject: [PATCH] Add UNVERIFIED Domain classification Squashed commit of the following: commit 5615a3adf65b941f3929f4f8201a0eb0d1fe1c88 Author: John Tordoff <> Date: Tue Dec 12 16:42:54 2023 -0500 update is_triaged behavior commit a9a49f281c993fef425fa0db24ec2c33924e34c8 Author: John Tordoff <> Date: Tue Dec 12 15:53:49 2023 -0500 change test case to account for new exception handing for domain sniffer commit 096e1ab68b5e7d5c2615bdd0f064bcddaad85d16 Author: John Tordoff <> Date: Tue Dec 12 14:03:25 2023 -0500 redo exception handling and add migration file commit 89b37f32ce7c22076e0ef7fa7c2feceb81e24b86 Author: John Tordoff <> Date: Mon Dec 11 12:13:31 2023 -0500 make timeouts classify notable domains as unverified --- osf/external/spam/tasks.py | 21 +++++++++---- .../0017_alter_notabledomain_note.py | 19 ++++++++++++ osf/models/notable_domain.py | 1 + osf_tests/test_notable_domains.py | 31 ++++++++++--------- 4 files changed, 52 insertions(+), 20 deletions(-) create mode 100644 osf/migrations/0017_alter_notabledomain_note.py diff --git a/osf/external/spam/tasks.py b/osf/external/spam/tasks.py index cc3f9e16a16..fabb7dfb935 100644 --- a/osf/external/spam/tasks.py +++ b/osf/external/spam/tasks.py @@ -46,15 +46,20 @@ def _check_resource_for_domains(guid, content): resource = guid.referent spammy_domains = [] referrer_content_type = ContentType.objects.get_for_model(resource) - for domain in _extract_domains(content): - notable_domain, _ = NotableDomain.objects.get_or_create(domain=domain) + for domain, note in _extract_domains(content): + notable_domain, _ = NotableDomain.objects.get_or_create( + domain=domain, + defaults={'note': note} + ) if notable_domain.note == NotableDomain.Note.EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT: spammy_domains.append(notable_domain.domain) DomainReference.objects.get_or_create( domain=notable_domain, referrer_object_id=resource.id, referrer_content_type=referrer_content_type, - defaults={'is_triaged': notable_domain.note != NotableDomain.Note.UNKNOWN} + defaults={ + 'is_triaged': notable_domain.note not in (NotableDomain.Note.UNKNOWN, NotableDomain.Note.UNVERIFIED) + } ) if spammy_domains: resource.confirm_spam(save=True, domains=list(spammy_domains)) @@ -72,8 +77,11 @@ def check_resource_for_domains_async(guid, content): def _extract_domains(content): + from osf.models import NotableDomain + extracted_domains = set() for match in DOMAIN_REGEX.finditer(content): + note = NotableDomain.Note.UNKNOWN domain = match.group('domain') if not domain or domain in extracted_domains: continue @@ -85,10 +93,11 @@ def _extract_domains(content): try: response = requests.head(constructed_url, timeout=settings.DOMAIN_EXTRACTION_TIMEOUT) - except (requests.exceptions.ConnectionError, requests.exceptions.InvalidURL): + except requests.exceptions.InvalidURL: + # Likely false-positive from a filename.ext continue except requests.exceptions.RequestException: - pass + note = NotableDomain.Note.UNVERIFIED else: # Store the redirect location (to help catch link shorteners) if response.status_code in REDIRECT_CODES and 'location' in response.headers: @@ -99,7 +108,7 @@ def _extract_domains(content): # Avoid returning a duplicate domain discovered via redirect if domain not in extracted_domains: extracted_domains.add(domain) - yield domain + yield domain, note @run_postcommit(once_per_request=False, celery=True) diff --git a/osf/migrations/0017_alter_notabledomain_note.py b/osf/migrations/0017_alter_notabledomain_note.py new file mode 100644 index 00000000000..056568cffbe --- /dev/null +++ b/osf/migrations/0017_alter_notabledomain_note.py @@ -0,0 +1,19 @@ +# Generated by Django 3.2.17 on 2023-12-12 19:02 + +from django.db import migrations, models +import osf.models.notable_domain + + +class Migration(migrations.Migration): + + dependencies = [ + ('osf', '0016_auto_20230828_1810'), + ] + + operations = [ + migrations.AlterField( + model_name='notabledomain', + name='note', + field=models.IntegerField(choices=[(0, 'EXCLUDE_FROM_ACCOUNT_CREATION_AND_CONTENT'), (1, 'ASSUME_HAM_UNTIL_REPORTED'), (2, 'UNKNOWN'), (3, 'IGNORED'), (4, 'UNVERIFIED')], default=osf.models.notable_domain.NotableDomain.Note['UNKNOWN']), + ), + ] diff --git a/osf/models/notable_domain.py b/osf/models/notable_domain.py index 5b960718ed9..03ebcfd6e40 100644 --- a/osf/models/notable_domain.py +++ b/osf/models/notable_domain.py @@ -14,6 +14,7 @@ class Note(IntEnum): ASSUME_HAM_UNTIL_REPORTED = 1 UNKNOWN = 2 IGNORED = 3 + UNVERIFIED = 4 # Timedout couldn't determine @classmethod def choices(cls): diff --git a/osf_tests/test_notable_domains.py b/osf_tests/test_notable_domains.py index 4c9e39908dd..78edd11e967 100644 --- a/osf_tests/test_notable_domains.py +++ b/osf_tests/test_notable_domains.py @@ -30,31 +30,34 @@ def test_extract_domains__optional_components(self, protocol_component, www_comp sample_text = f'This is a link: {test_url}' with mock.patch.object(spam_tasks.requests, 'head'): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['osf.io'] + assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__url_in_quotes(self): sample_text = '"osf.io"' with mock.patch.object(spam_tasks.requests, 'head'): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['osf.io'] + assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__url_in_parens(self): sample_text = '(osf.io)' with mock.patch.object(spam_tasks.requests, 'head'): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['osf.io'] + assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__captures_domain_with_multiple_subdomains(self): sample_text = 'This is a link: https://api.test.osf.io' with mock.patch.object(spam_tasks.requests, 'head'): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['api.test.osf.io'] + assert domains == [('api.test.osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__captures_multiple_domains(self): sample_text = 'This is a domain: http://osf.io. This is another domain: www.cos.io' with mock.patch.object(spam_tasks.requests, 'head'): domains = set(spam_tasks._extract_domains(sample_text)) - assert domains == {'osf.io', 'cos.io'} + assert domains == { + ('osf.io', NotableDomain.Note.UNKNOWN), + ('cos.io', NotableDomain.Note.UNKNOWN), + } def test_extract_domains__no_domains(self): sample_text = 'http://fakeout!' @@ -63,19 +66,19 @@ def test_extract_domains__no_domains(self): assert not domains mock_head.assert_not_called() - def test_extract_domains__ignored_if_does_not_resolve(self): + def test_extract_domains__unverfied_if_does_not_resolve(self): sample_text = 'This.will.not.connect' with mock.patch.object(spam_tasks.requests, 'head') as mock_head: mock_head.side_effect = spam_tasks.requests.exceptions.ConnectionError domains = set(spam_tasks._extract_domains(sample_text)) - assert not domains + assert domains == {('This.will.not.connect', NotableDomain.Note.UNVERIFIED)} def test_actract_domains__returned_on_error(self): sample_text = 'This.will.timeout' with mock.patch.object(spam_tasks.requests, 'head') as mock_head: mock_head.side_effect = spam_tasks.requests.exceptions.Timeout domains = set(spam_tasks._extract_domains(sample_text)) - assert domains == {sample_text} + assert domains == {(sample_text, NotableDomain.Note.UNVERIFIED)} @pytest.mark.parametrize('status_code', [301, 302, 303, 307, 308]) def test_extract_domains__follows_redirect(self, status_code): @@ -85,7 +88,7 @@ def test_extract_domains__follows_redirect(self, status_code): sample_text = 'redirect.me' with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['redirected.com'] + assert domains == [('redirected.com', NotableDomain.Note.UNKNOWN)] def test_extract_domains__redirect_code_no_location(self): mock_response = SimpleNamespace() @@ -94,7 +97,7 @@ def test_extract_domains__redirect_code_no_location(self): sample_text = 'redirect.me' with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['redirect.me'] + assert domains == [('redirect.me', NotableDomain.Note.UNKNOWN)] def test_extract_domains__redirect_code_bad_location(self): mock_response = SimpleNamespace() @@ -103,7 +106,7 @@ def test_extract_domains__redirect_code_bad_location(self): sample_text = 'redirect.me' with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['redirect.me'] + assert domains == [('redirect.me', NotableDomain.Note.UNKNOWN)] def test_extract_domains__redirect_with_full_url_no_protocol(self): mock_response = SimpleNamespace() @@ -114,7 +117,7 @@ def test_extract_domains__redirect_with_full_url_no_protocol(self): with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response) as mock_object: domains = list(spam_tasks._extract_domains(sample_text)) mock_object.assert_called_once_with(f'https://{target_url}', timeout=60) - assert domains == ['osf.io'] + assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__redirect_with_full_url_and_protocol(self): mock_response = SimpleNamespace() @@ -125,13 +128,13 @@ def test_extract_domains__redirect_with_full_url_and_protocol(self): with mock.patch.object(spam_tasks.requests, 'head', return_value=mock_response) as mock_object: domains = list(spam_tasks._extract_domains(sample_text)) mock_object.assert_called_once_with(target_url, timeout=60) - assert domains == ['osf.io'] + assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__deduplicates(self): sample_text = 'osf.io osf.io osf.io and, oh, yeah, osf.io' with mock.patch.object(spam_tasks.requests, 'head'): domains = list(spam_tasks._extract_domains(sample_text)) - assert domains == ['osf.io'] + assert domains == [('osf.io', NotableDomain.Note.UNKNOWN)] def test_extract_domains__ignores_floats(self): sample_text = 'this is a number 3.1415 not a domain'