Skip to content

Commit

Permalink
Factor out the address resolving and location functions
Browse files Browse the repository at this point in the history
Also, follow the [line of sight](https://medium.com/@matryer/line-of-sight-in-code-186dd7cdea88)
code style.

Signed-off-by: Aurélien Bompard <[email protected]>
  • Loading branch information
abompard committed Aug 19, 2024
1 parent 35523b9 commit f68c665
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 108 deletions.
54 changes: 5 additions & 49 deletions mirrormanager2/crawler/continents.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import collections
import csv
import functools
import logging
import os
import socket
from functools import cache
from importlib import resources
from urllib.parse import urlparse

import geoip2

import mirrormanager2.lib
from mirrormanager2.lib import geo, get_country_continent_redirect

from .constants import CONTINENTS

Expand Down Expand Up @@ -50,7 +48,7 @@ def get_country_continents(session):
with country_continent_csv.open("r") as infile:
reader = csv.reader(infile)
new_country_continents = {rows[0]: rows[1] for rows in reader}
for c in mirrormanager2.lib.get_country_continent_redirect(session):
for c in get_country_continent_redirect(session):
new_country_continents[c.country] = c.continent
return new_country_continents

Expand Down Expand Up @@ -81,57 +79,15 @@ def check_continent(config, options, session, categoryUrl):
hostname = hostname.split(":")[0]

try:
addrinfo = socket.getaddrinfo(hostname, None)
except socket.gaierror as e:
addresses = geo.get_host_addresses(hostname)
except geo.HostUnreachable as e:
# Name resolution failed. This means
# that the base URL is broken.
raise BrokenBaseUrl() from e

# Extract the IPv4 and IPv6 address from the tuples returned by getaddrinfo.
addresses = set()
for family, _socktype, _proto, _canonname, sockaddr in addrinfo:
# The GeoIP2 databases contain only information for IPv4 and IPv6
# addresses. Therefore, other, unusual address families are ignored.
if family == socket.AF_INET:
address, port = sockaddr
addresses.add(address)
elif family == socket.AF_INET6:
address, port, flowinfo, scope_id = sockaddr
addresses.add(address)
# Retrieve the ISO 3166-1 code for each address.
countries = []
for address in addresses:
try:
country = gi.country(address)
except geoip2.errors.AddressNotFoundError:
# If no country object is found for an IPv4 or IPv6 address,
# the address is ignored.
pass
else:
iso_code = country.country.iso_code
# If the ISO 3166-1 code is not available, the country cannot be
# matched to continent. Therefore, the country object is ignored.
if iso_code is not None:
countries.append(iso_code)
# The GeoIP2 databases are not perfect and fully accurate. Therefore,
# multiple countries might be returned for hosts with multiple addresses. It
# seems best to use the most frequently occuring country if a host has
# multiple addresses.
country_counter = collections.Counter(countries)
if country_counter:
# most_common(1) returns a list with one element that is tuple that
# consists of the item and its count.
country = country_counter.most_common(1)[0][0]
else:
# For hosts with no country in the GeoIP database
# the default is 'US' as that is where most of
# Fedora infrastructure systems are running
country = "US"
country = geo.get_country(addresses, geoip_db=gi)
if country in config["EMBARGOED_COUNTRIES"]:
raise EmbargoedCountry(country)
if country_continents[country] in continents:
return
# And another return value. '8' is used for mirrors on
# the wrong continent. The crawl should not be listed in
# the database at all.
raise WrongContinent
94 changes: 94 additions & 0 deletions mirrormanager2/lib/geo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import collections
import socket

import geoip2
import geoip2.errors


class HostUnreachable(Exception):
    """Raised when a hostname cannot be resolved to any address."""


def get_host_addresses(hostname):
    """Return the set of IPv4/IPv6 addresses that ``hostname`` resolves to.

    Args:
        hostname: The host name (or literal IP address) to resolve.

    Returns:
        A set of address strings. Entries for address families other than
        ``AF_INET`` and ``AF_INET6`` are dropped, so the set may be empty.

    Raises:
        HostUnreachable: If name resolution fails or ``hostname`` is not a
            syntactically valid host name. The original error is chained.
    """
    try:
        addrinfo = socket.getaddrinfo(hostname, None)
    except (socket.gaierror, UnicodeError) as e:
        # gaierror: the resolver could not resolve the name.
        # UnicodeError: getaddrinfo() IDNA-encodes the name first and rejects
        # empty or over-long (> 63 chars) labels before any lookup happens.
        # Either way the host cannot be reached under that name.
        raise HostUnreachable(hostname) from e

    # The GeoIP2 databases contain only information for IPv4 and IPv6
    # addresses. Therefore, other, unusual address families are ignored.
    # The first element of sockaddr is the address for both families.
    return {
        sockaddr[0]
        for family, _socktype, _proto, _canonname, sockaddr in addrinfo
        if family in (socket.AF_INET, socket.AF_INET6)
    }


def get_country(addresses, geoip_db):
    """Return the ISO 3166-1 country code that best matches ``addresses``.

    Each address is looked up with ``geoip_db.country()``. Addresses that are
    not in the database, or whose record carries no ISO code, are skipped.
    The GeoIP2 databases are not fully accurate, so a host with several
    addresses may map to several countries; the most frequent code wins.
    When no address yields a code, "US" is returned because that is where
    most of the Fedora infrastructure systems are running.
    """
    tally = collections.Counter()
    for ip in addresses:
        try:
            record = geoip_db.country(ip)
        except geoip2.errors.AddressNotFoundError:
            # Unknown address: nothing to count.
            continue
        code = record.country.iso_code
        # Without an ISO code the country cannot be matched to a continent.
        if code is not None:
            tally[code] += 1
    if not tally:
        return "US"
    # most_common(1) yields a single (code, count) pair.
    [(winner, _hits)] = tally.most_common(1)
    return winner


def get_city(addresses, geoip_db):
    """Return the GeoIP city record that best matches ``addresses``.

    Each address is looked up with ``geoip_db.city()``. Addresses that are
    not in the database, or whose record has no city name, are skipped.

    Args:
        addresses: Iterable of IPv4/IPv6 address strings.
        geoip_db: Object exposing a ``city(address)`` lookup method.

    Returns:
        The first record whose city name is the most frequent one among the
        addresses, or ``None`` when no address could be located.
    """
    records = []
    for address in addresses:
        try:
            record = geoip_db.city(address)
        except geoip2.errors.AddressNotFoundError:
            # If no city object was found for an IPv4 or IPv6
            # address, the address is ignored.
            continue
        # It seems that an empty city record is returned when no
        # city was found; such addresses are ignored as well.
        if record.city.name is None:
            continue
        records.append(record)
    # If no city objects were found, the location of a host cannot
    # be determined. Callers test the result with ``is None``.
    if not records:
        return None
    # Only the GeoIP2 Enterprise database has a confidence score for
    # each city record. Therefore, it seems best to use the most
    # frequently occurring city if a host has multiple addresses.
    name_counter = collections.Counter(record.city.name for record in records)
    # most_common(1) returns a one-element list holding an (item, count) tuple.
    most_common_name = name_counter.most_common(1)[0][0]
    # Any city object should be equivalent for a given city name, so the
    # first match is returned. A match always exists because the name was
    # taken from ``records`` itself.
    return next(record for record in records if record.city.name == most_common_name)


# Backward-compatible alias: the function was first published under this name.
get_cities = get_city
72 changes: 13 additions & 59 deletions mirrormanager2/utility/generate_worldmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
# while the rest of MirrorManager is licensed MIT/X11


import collections
import os
import socket
from urllib.parse import urlsplit

import click
import geoip2.database

import mirrormanager2.lib
from mirrormanager2.lib import geo, get_host_category_url, read_config
from mirrormanager2.lib.database import get_db_manager

from .common import config_option
Expand All @@ -25,81 +23,37 @@
@config_option
@click.option("--verbose", is_flag=True, default=False, help="show more details")
def main(config, verbose):
config = mirrormanager2.lib.read_config(config)
config = read_config(config)
gi = geoip2.database.Reader(os.path.join(config["GEOIP_BASE"], "GeoLite2-City.mmdb"))
db_manager = get_db_manager(config)
with db_manager.Session() as session:
embargoed_countries = set(x.upper() for x in config["EMBARGOED_COUNTRIES"])
tracking = set()
for hcurl in mirrormanager2.lib.get_host_category_url(session):
for hcurl in get_host_category_url(session):
host = hcurl.host_category.host
if host.private or host.site.private:
continue
hostname = urlsplit(hcurl.url)[1]
if host.id in tracking:
continue
gir = None
try:
addrinfo = socket.getaddrinfo(hostname, None)
# Extract the IPv4 and IPv6 address from the tuples returned by
# getaddrinfo.
addresses = set()
for family, _socktype, _proto, _canonname, sockaddr in addrinfo:
# The GeoIP2 databases contain only information for IPv4 and
# IPv6 addresses. Therefore, other, unusual address families
# are ignored.
if family == socket.AF_INET:
address, port = sockaddr
addresses.add(address)
elif family == socket.AF_INET6:
address, port, flowinfo, scope_id = sockaddr
addresses.add(address)
# Retrieve the city object for each address.
cities = []
for address in addresses:
try:
city = gi.city(address)
except geoip2.errors.AddressNotFoundError:
# If no city object was found for an IPv4 or IPv6
# address, the address is ignored.
pass
else:
# It seems that an empty city record is returned when no
# city was found. If no city has been found for an IPv4
# or IPv6 address, the address is ignored.
if city.city.name is not None:
cities.append(city)
# If no city objects were found, the location of a host cannot
# be determined.
if not cities:
continue
city_names = (city.city.name for city in cities)
# Only the GeoIP2 Enterprise database has a confidence score for
# each city record. Therefore, it seems best to use the most
# frequently occuring city if a host has multiple addresses.
city_name_counter = collections.Counter(city_names)
# most_common(1) returns a list with one element that is tuple
# that consists of the item and its count.
most_common_city_name = city_name_counter.most_common(1)[0][0]
# Find a city object for the most common city name. Any city
# object should equivalent for a given city name.
for city in cities:
if most_common_city_name == city.city.name:
gir = city
break
except Exception:
addresses = geo.get_host_addresses(hostname)
except geo.HostUnreachable:
click.echo(f"Unreachable host: {hostname}. Skipping.", err=True)
continue
if gir is None:

city = geo.get_city(addresses, geoip_db=gi)
if city is None:
continue
if gir.country.iso_code in embargoed_countries:
if city.country.iso_code in embargoed_countries:
click.echo(
f"WARNING: host {host.id} ({hostname}) seems to be from an embargoed "
f"country: {gir.country.iso_code}",
f"country: {city.country.iso_code}",
err=True,
)
continue
host.latitude = gir.location.latitude
host.longitude = gir.location.longitude
host.latitude = city.location.latitude
host.longitude = city.location.longitude
tracking.add(host.id)
if verbose:
click.echo(f"{host.name} ({host.id}): {host.latitude} {host.longitude}")
Expand Down

0 comments on commit f68c665

Please sign in to comment.