Add info URL and modification timestamp for Virginia Tech
m-appel committed Feb 16, 2024
1 parent 51f411e · commit 9d79c11
Showing 1 changed file with 21 additions and 6 deletions.
27 changes: 21 additions & 6 deletions in iyp/crawlers/virginiatech/rovista.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import sys
+from datetime import datetime, timezone
 
 import requests
 
@@ -13,10 +14,21 @@
 
 
 class Crawler(BaseCrawler):
+    def __init__(self, organization, url, name):
+        super().__init__(organization, url, name)
+        self.reference['reference_url_info'] = 'https://rovista.netsecurelab.org/'
+
+    def __set_modification_time(self, entry):
+        try:
+            date_str = entry['lastUpdatedDate']
+            date = datetime.strptime(date_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
+            self.reference['reference_time_modification'] = date
+        except (KeyError, ValueError) as e:
+            logging.warning(f'Failed to set modification time: {e}')
+
     def run(self):
         """Get RoVista data from their API."""
-        batch_size = 1000  # Adjust batch size as needed
+        batch_size = 1000
         offset = 0
         entries = []
         asns = set()
@@ -25,26 +37,29 @@ def run(self):
             # Make a request with the current offset
             response = requests.get(URL, params={'offset': offset, 'count': batch_size})
             if response.status_code != 200:
-                raise RequestStatusError('Error while fetching RoVista data')
+                raise RequestStatusError(f'Error while fetching RoVista data: {response.status_code}')
 
             data = response.json().get('data', [])
             for entry in data:
+                if not self.reference['reference_time_modification']:
+                    self.__set_modification_time(entry)
                 asns.add(entry['asn'])
                 if entry['ratio'] > 0.5:
-                    entries.append({'asn': entry['asn'], 'ratio': entry['ratio'], 'label': 'Validating RPKI ROV'})
+                    entries.append({'asn': entry['asn'], 'ratio': entry['ratio']})
                 else:
-                    entries.append({'asn': entry['asn'], 'ratio': entry['ratio'], 'label': 'Not Validating RPKI ROV'})
+                    entries.append({'asn': entry['asn'], 'ratio': entry['ratio']})
 
             # Move to the next page
             offset += 1
             # Break the loop if there's no more data
             if len(data) < batch_size:
                 break
 
         logging.info('Pushing nodes to neo4j...')
         # get ASNs and prefixes IDs
         self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
-        tag_id_not_valid = self.iyp.get_node('Tag', {'label': 'Not Validating RPKI ROV'}, create=True)
-        tag_id_valid = self.iyp.get_node('Tag', {'label': 'Validating RPKI ROV'}, create=True)
+        tag_id_not_valid = self.iyp.get_node('Tag', {'label': 'Not Validating RPKI ROV'})
+        tag_id_valid = self.iyp.get_node('Tag', {'label': 'Validating RPKI ROV'})
         # Compute links
         links = []
         for entry in entries:
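
As a self-contained illustration of the date handling this commit introduces, here is a minimal sketch of what the new __set_modification_time helper does with one API record. The sample entry (ASN 64496 from the documentation range, ratio 0.82, date 2024-02-15) is hypothetical; the parsing line mirrors the added code.

from datetime import datetime, timezone

# Hypothetical sample record, shaped like the RoVista API entries in the diff.
entry = {'asn': 64496, 'ratio': 0.82, 'lastUpdatedDate': '2024-02-15'}

# Same parsing as the new helper: read the date-only string, then pin it
# to UTC so the stored modification timestamp is timezone-aware.
date = datetime.strptime(entry['lastUpdatedDate'], '%Y-%m-%d').replace(tzinfo=timezone.utc)
print(date.isoformat())  # 2024-02-15T00:00:00+00:00

Because the run() loop guards the call with `if not self.reference['reference_time_modification']`, the timestamp is taken from the first record whose lastUpdatedDate parses successfully; later records are not consulted.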
