Skip to content

Commit

Permalink
Add info URL and modification timestamp for Tranco
Browse files Browse the repository at this point in the history
  • Loading branch information
m-appel committed Feb 16, 2024
1 parent b5b3bf4 commit 51f411e
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion iyp/crawlers/tranco/top1M.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

import requests

from iyp import BaseCrawler, RequestStatusError
from iyp import (BaseCrawler, RequestStatusError,
set_modification_time_from_last_modified_header)

# URL to Tranco top 1M
URL = 'https://tranco-list.eu/top-1m.csv.zip'
Expand All @@ -16,6 +17,19 @@


class Crawler(BaseCrawler):
def __init__(self, organization, url, name):
    """Initialize the crawler and record the info URL in the reference.

    The three arguments are forwarded unchanged to the parent constructor.
    """
    super().__init__(organization, url, name)
    # Stored under the 'reference_url_info' key of the reference.
    info_url = 'https://tranco-list.eu/methodology'
    self.reference['reference_url_info'] = info_url

def __set_data_url(self):
    """Set the data URL using the permanent ID of the current list, which stays
    valid once the permalink is updated.

    The ID is fetched from https://tranco-list.eu/top-1m-id. This is a
    best-effort update: if the request fails or the returned ID is empty, a
    warning is logged and self.reference['reference_url_data'] is left
    untouched.
    """
    try:
        # A timeout keeps an unresponsive server from blocking the crawler
        # indefinitely.
        res = requests.get('https://tranco-list.eu/top-1m-id', timeout=60)
        res.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError, so HTTP error
        # statuses are still handled; connection errors and timeouts are now
        # handled the same way instead of propagating.
        logging.warning(f'Failed to update data URL: {e}')
        return

    # Strip surrounding whitespace (e.g., a trailing newline) so it does not end
    # up inside the URL.
    list_id = res.text.strip()
    if not list_id:
        logging.warning('Failed to update data URL: received empty list ID')
        return
    self.reference['reference_url_data'] = f'https://tranco-list.eu/download_daily/{list_id}'

def run(self):
"""Fetch Tranco top 1M and push to IYP."""
Expand All @@ -27,6 +41,9 @@ def run(self):
if req.status_code != 200:
raise RequestStatusError('Error while fetching Tranco csv file')

set_modification_time_from_last_modified_header(self.reference, req)
self.__set_data_url()

links = []
domains = set()
# open zip file and read top list
Expand Down

0 comments on commit 51f411e

Please sign in to comment.