Merge pull request InternetHealthReport#123 from m-appel/dns-dependency
Add OpenINTEL DNS dependency crawler
romain-fontugne authored Feb 7, 2024
2 parents 838c8be + c672131 commit c9ce0ed
Showing 4 changed files with 213 additions and 24 deletions.
2 changes: 2 additions & 0 deletions config.json.example
@@ -81,6 +81,8 @@
"iyp.crawlers.alice_lg.linx",
"iyp.crawlers.alice_lg.megaport",
"iyp.crawlers.alice_lg.netnod",
"iyp.crawlers.openintel.dns_dependency_nl",
"iyp.crawlers.openintel.dns_dependency_jp",
"iyp.crawlers.cloudflare.dns_top_locations",
"iyp.crawlers.cloudflare.dns_top_ases"
],
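The two new entries follow the same dotted module path convention as the surrounding list. As an illustrative sketch only (the actual IYP runner's loading logic is not shown in this diff), such an entry could be resolved and executed with importlib:

import importlib

# Illustrative sketch: resolve one config.json entry to its module and run it.
# ORG, URL, NAME and the Crawler class are defined in the new files below;
# this surrounding runner logic is an assumption, not part of this commit.
module = importlib.import_module('iyp.crawlers.openintel.dns_dependency_nl')
crawler = module.Crawler(module.ORG, module.URL, module.NAME)
crawler.run()
crawler.close()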
145 changes: 121 additions & 24 deletions iyp/crawlers/openintel/__init__.py
@@ -1,12 +1,12 @@
# Simple Python script to fetch domain name to IP address mappings from OpenINTEL data
# Based on code from Mattijs Jonker <[email protected]>
# OpenIntelCrawler is based on code from Mattijs Jonker <[email protected]>

import argparse
import datetime
import json
import logging
import os
import tempfile
from datetime import datetime, timedelta, timezone
from ipaddress import IPv6Address

import arrow
@@ -15,15 +15,11 @@
import pandas as pd
import requests

from iyp import BaseCrawler
from iyp import BaseCrawler, RequestStatusError

TMP_DIR = './tmp'
os.makedirs(TMP_DIR, exist_ok=True)

URL = 'https://data.openintel.nl/data/'
ORG = 'OpenINTEL'
NAME = 'openintel.*'

# credentials
OPENINTEL_ACCESS_KEY = ''
OPENINTEL_SECRET_KEY = ''
@@ -36,7 +32,7 @@

def valid_date(s):
try:
return datetime.datetime.strptime(s, '%Y-%m-%d')
return datetime.strptime(s, '%Y-%m-%d')
except ValueError:
msg = 'not a valid ISO 8601 date: {0!r}'.format(s)
raise argparse.ArgumentTypeError(msg)
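Since the import now binds the datetime class directly, strptime is called on the class rather than on the datetime module. A minimal usage sketch of the validator, reusing the argparse import and valid_date defined above (the --date flag name is an assumption for illustration):

parser = argparse.ArgumentParser()
# Hypothetical flag name; valid_date rejects anything that is not YYYY-MM-DD.
parser.add_argument('--date', type=valid_date)
args = parser.parse_args(['--date', '2024-02-07'])
# args.date is datetime(2024, 2, 7, 0, 0); '2024/02/07' would raise ArgumentTypeError.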
@@ -78,22 +74,24 @@ def get_parquet(self):

# check on the website if yesterday's data is available
yesterday = arrow.utcnow().shift(days=-1)
url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day)
try:
req = requests.head(url)

attempt = 3
while req.status_code != 200 and attempt > 0:
print(req.status_code)
attempt -= 1
yesterday = yesterday.shift(days=-1)
url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day)
req = requests.head(url)

except requests.exceptions.ConnectionError:
logging.warning("Cannot reach OpenINTEL website, try yesterday's data")
yesterday = arrow.utcnow().shift(days=-1)
url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day)
# FIXME Check at the proper place. Remove flake8 exception afterwards.
# flake8: noqa
# url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day)
# try:
# req = requests.head(url)

# attempt = 3
# while req.status_code != 200 and attempt > 0:
# print(req.status_code)
# attempt -= 1
# yesterday = yesterday.shift(days=-1)
# url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day)
# req = requests.head(url)

# except requests.exceptions.ConnectionError:
# logging.warning("Cannot reach OpenINTEL website, try yesterday's data")
# yesterday = arrow.utcnow().shift(days=-1)
# url = URL.format(year=yesterday.year, month=yesterday.month, day=yesterday.day)

logging.warning(f'Fetching data for {yesterday}')

@@ -245,3 +243,102 @@ def run(self):
self.iyp.batch_add_links('RESOLVES_TO', res_links)
self.iyp.batch_add_links('MANAGED_BY', mng_links)
self.iyp.batch_add_links('PART_OF', partof_links)


class DnsDependencyCrawler(BaseCrawler):

def __init__(self, organization, url, name):
super().__init__(organization, url, name)

def run(self):
# Extract current date for partitioning
logging.info('Probing available data')
max_lookback_in_weeks = 1
for lookback in range(0, max_lookback_in_weeks + 1):
current_date = datetime.now(tz=timezone.utc) - timedelta(weeks=lookback)
year = current_date.strftime('%Y')
week = current_date.strftime('%U')
base_url = f'{self.reference["reference_url"]}/year={year}/week={week}'
probe_url = f'{base_url}/domain_nodes.json.gz'
if requests.head(probe_url).ok:
logging.info(f'Using year={year}/week={week} ({current_date.strftime("%Y-%m-%d")})')
break
else:
logging.error('Failed to find data within the specified lookback interval.')
raise RequestStatusError('Failed to find data within the specified lookback interval.')

logging.info('Reading domain names')
domains = pd.read_json(f'{base_url}/domain_nodes.json.gz', lines=True)
logging.info('Reading host names')
hosts = pd.read_json(f'{base_url}/host_nodes.json.gz', lines=True)
logging.info('Reading IPs')
ips = pd.read_json(f'{base_url}/ip_nodes.json.gz', lines=True)
logging.info('Reading connections')
connections = pd.read_json(f'{base_url}/connections.json.gz', lines=True)

unique_domain_names = set(domains['name'])
unique_host_names = set(hosts['name'])
unique_ips = set(ips['address'])
logging.info(f'Pushing/getting {len(unique_domain_names)} DomainName {len(unique_host_names)} HostName '
f'{len(unique_ips)} IP nodes...')
domains_id = self.iyp.batch_get_nodes_by_single_prop('DomainName', 'name', unique_domain_names)
hosts_id = self.iyp.batch_get_nodes_by_single_prop('HostName', 'name', unique_host_names)
ips_id = self.iyp.batch_get_nodes_by_single_prop('IP', 'ip', unique_ips)

links_parent = list()
links_part_of = list()
links_alias_of = list()
links_managed_by = list()
links_resolves_to = list()

logging.info('Computing relationships...')
start_ts = datetime.now().timestamp()
for index, connection in connections.iterrows():
if connection['relation_name'] == 'PARENT':
links_parent.append({
'src_id': domains_id[connection['from_nodeKey']],
'dst_id': domains_id[connection['to_nodeKey']],
'props': [self.reference, connection['properties']],
})
elif connection['relation_name'] == 'MANAGED_BY':
links_managed_by.append({
'src_id': domains_id[connection['from_nodeKey']],
'dst_id': hosts_id[connection['to_nodeKey']],
'props': [self.reference, connection['properties']],
})
elif connection['relation_name'] == 'PART_OF':
links_part_of.append({
'src_id': hosts_id[connection['from_nodeKey']],
'dst_id': domains_id[connection['to_nodeKey']],
'props': [self.reference, connection['properties']],
})
elif connection['relation_name'] == 'ALIAS_OF':
links_alias_of.append({
'src_id': hosts_id[connection['from_nodeKey']],
'dst_id': hosts_id[connection['to_nodeKey']],
'props': [self.reference, connection['properties']],
})
elif connection['relation_name'] == 'RESOLVES_TO':
links_resolves_to.append({
'src_id': hosts_id[connection['from_nodeKey']],
'dst_id': ips_id[connection['to_nodeKey']],
'props': [self.reference, connection['properties']],
})
else:
logging.error(f'Unknown relationship type: {connection["relation_name"]}')
stop_ts = datetime.now().timestamp()
logging.info(f'{stop_ts - start_ts:.2f}s elapsed')

# Push all links to IYP
logging.info(f'Pushing {len(links_parent)} PARENT {len(links_part_of)} PART_OF {len(links_alias_of)} ALIAS_OF '
f'{len(links_managed_by)} MANAGED_BY {len(links_resolves_to)} RESOLVES_TO relationships...')
self.iyp.batch_add_links('PARENT', links_parent)
self.iyp.batch_add_links('PART_OF', links_part_of)
self.iyp.batch_add_links('ALIAS_OF', links_alias_of)
self.iyp.batch_add_links('MANAGED_BY', links_managed_by)
self.iyp.batch_add_links('RESOLVES_TO', links_resolves_to)

# Push the Authoritative NS Label
ns_id = [link['dst_id'] for link in links_managed_by]
logging.info(f'Adding AuthoritativeNameServer label to {len(ns_id)} nodes')
self.iyp.batch_add_node_label(ns_id, 'AuthoritativeNameServer')
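The snapshots are partitioned by year and zero-padded week number (strftime '%U'), and the crawler probes domain_nodes.json.gz to decide which week to load. A standalone sketch of that probe, using the NL base URL defined in the new dns_dependency_nl.py below (the two-week lookback range is illustrative):

from datetime import datetime, timedelta, timezone

import requests

base = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/NL'
for lookback in range(2):  # current week, then one week back
    date = datetime.now(tz=timezone.utc) - timedelta(weeks=lookback)
    probe = f'{base}/year={date.strftime("%Y")}/week={date.strftime("%U")}/domain_nodes.json.gz'
    if requests.head(probe).ok:
        print(f'snapshot found: {probe}')
        break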
45 changes: 45 additions & 0 deletions iyp/crawlers/openintel/dns_dependency_jp.py
@@ -0,0 +1,45 @@
import argparse
import logging
import os
import sys

from iyp.crawlers.openintel import DnsDependencyCrawler

URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/JP'
ORG = 'OpenINTEL'
NAME = 'openintel.dns_dependency_jp'


class Crawler(DnsDependencyCrawler):
def __init__(self, organization, url, name):
super().__init__(organization, url, name)


def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('--unit-test', action='store_true')
args = parser.parse_args()

scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3]
FORMAT = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(
format=FORMAT,
filename='log/' + scriptname + '.log',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S'
)

logging.info(f'Started: {sys.argv}')

crawler = Crawler(ORG, URL, NAME)
if args.unit_test:
crawler.unit_test(logging)
else:
crawler.run()
crawler.close()
logging.info(f'Finished: {sys.argv}')


if __name__ == '__main__':
main()
sys.exit(0)
45 changes: 45 additions & 0 deletions iyp/crawlers/openintel/dns_dependency_nl.py
@@ -0,0 +1,45 @@
import argparse
import logging
import os
import sys

from iyp.crawlers.openintel import DnsDependencyCrawler

URL = 'https://storage.dacs.utwente.nl/sommeser-dnsdep/NL'
ORG = 'OpenINTEL'
NAME = 'openintel.dns_dependency_nl'


class Crawler(DnsDependencyCrawler):
def __init__(self, organization, url, name):
super().__init__(organization, url, name)


def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('--unit-test', action='store_true')
args = parser.parse_args()

scriptname = os.path.basename(sys.argv[0]).replace('/', '_')[0:-3]
FORMAT = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(
format=FORMAT,
filename='log/' + scriptname + '.log',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S'
)

logging.info(f'Started: {sys.argv}')

crawler = Crawler(ORG, URL, NAME)
if args.unit_test:
crawler.unit_test(logging)
else:
crawler.run()
crawler.close()
logging.info(f'Finished: {sys.argv}')


if __name__ == '__main__':
main()
sys.exit(0)
