diff --git a/iyp/crawlers/virginiatech/rovista.py b/iyp/crawlers/virginiatech/rovista.py index a11b4a9..b72e734 100644 --- a/iyp/crawlers/virginiatech/rovista.py +++ b/iyp/crawlers/virginiatech/rovista.py @@ -2,6 +2,7 @@ import logging import os import sys +from datetime import datetime, timezone import requests @@ -13,10 +14,21 @@ class Crawler(BaseCrawler): + def __init__(self, organization, url, name): + super().__init__(organization, url, name) + self.reference['reference_url_info'] = 'https://rovista.netsecurelab.org/' + + def __set_modification_time(self, entry): + try: + date_str = entry['lastUpdatedDate'] + date = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc) + self.reference['reference_time_modification'] = date + except (KeyError, ValueError) as e: + logging.warning(f'Failed to set modification time: {e}') def run(self): """Get RoVista data from their API.""" - batch_size = 1000 # Adjust batch size as needed + batch_size = 1000 offset = 0 entries = [] asns = set() @@ -25,26 +37,29 @@ def run(self): # Make a request with the current offset response = requests.get(URL, params={'offset': offset, 'count': batch_size}) if response.status_code != 200: - raise RequestStatusError('Error while fetching RoVista data') + raise RequestStatusError(f'Error while fetching RoVista data: {response.status_code}') data = response.json().get('data', []) for entry in data: + if not self.reference['reference_time_modification']: + self.__set_modification_time(entry) asns.add(entry['asn']) if entry['ratio'] > 0.5: - entries.append({'asn': entry['asn'], 'ratio': entry['ratio'], 'label': 'Validating RPKI ROV'}) + entries.append({'asn': entry['asn'], 'ratio': entry['ratio']}) else: - entries.append({'asn': entry['asn'], 'ratio': entry['ratio'], 'label': 'Not Validating RPKI ROV'}) + entries.append({'asn': entry['asn'], 'ratio': entry['ratio']}) # Move to the next page offset += 1 # Break the loop if there's no more data if len(data) < batch_size: break + logging.info('Pushing nodes to neo4j...') # get ASNs and prefixes IDs self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns) - tag_id_not_valid = self.iyp.get_node('Tag', {'label': 'Not Validating RPKI ROV'}, create=True) - tag_id_valid = self.iyp.get_node('Tag', {'label': 'Validating RPKI ROV'}, create=True) + tag_id_not_valid = self.iyp.get_node('Tag', {'label': 'Not Validating RPKI ROV'}) + tag_id_valid = self.iyp.get_node('Tag', {'label': 'Validating RPKI ROV'}) # Compute links links = [] for entry in entries: