From 275ea025ff225903a210a57fd7edc57be700b8d1 Mon Sep 17 00:00:00 2001 From: andrewgryan Date: Fri, 3 Jul 2020 15:35:30 +0100 Subject: [PATCH 1/3] add an extra table to record OSError events --- forest/db/health.py | 56 +++++++++++++++++++++++++++++++++ forest/drivers/unified_model.py | 12 +++---- test/test_db_health.py | 25 +++++++++++++++ 3 files changed, 87 insertions(+), 6 deletions(-) create mode 100644 forest/db/health.py create mode 100644 test/test_db_health.py diff --git a/forest/db/health.py b/forest/db/health.py new file mode 100644 index 000000000..61ccd86e7 --- /dev/null +++ b/forest/db/health.py @@ -0,0 +1,56 @@ +""" +S3 object health status +""" +import sqlite3 + + +class HealthDB: + """Maintain meta-data related to S3 objects""" + def __init__(self, connection): + self.connection = connection + self.cursor = self.connection.cursor() + self.cursor.execute(""" + CREATE TABLE + IF NOT EXISTS health ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + errno INTEGER, + strerror TEXT, + time TEXT, + UNIQUE(name)) + """) + + @classmethod + def connect(cls, path_or_memory): + """Connect to sqlite3 database""" + return cls(sqlite3.connect(path_or_memory)) + + def checked_files(self, pattern): + """Files that are in the database""" + return sorted(set(self.files(pattern)) | + set(self.error_files(pattern))) + + def files(self, pattern): + query = "SELECT name FROM file WHERE name GLOB :pattern;" + params = {"pattern": pattern} + return [path for path, in self.cursor.execute(query, params)] + + def error_files(self, pattern): + query = "SELECT name FROM health WHERE name GLOB :pattern;" + params = {"pattern": pattern} + return [path for path, in self.cursor.execute(query, params)] + + def insert_error(self, path, error, check_time): + """Insert OSError into table""" + query = """ + INSERT OR IGNORE + INTO health (name, errno, strerror, time) + VALUES (:path, :errno, :strerror, :time); + """ + params = { + "path": path, + "errno": error.errno, + "strerror": error.strerror, + "time": check_time.isoformat() + } + self.cursor.execute(query, params) diff --git a/forest/drivers/unified_model.py b/forest/drivers/unified_model.py index eae293e1c..5f211d444 100644 --- a/forest/drivers/unified_model.py +++ b/forest/drivers/unified_model.py @@ -9,6 +9,7 @@ import netCDF4 import sqlite3 import forest.db +import forest.db.health import forest.util import forest.map_view from forest import ( @@ -45,12 +46,9 @@ def __call__(self): # Find names in database connection = sqlite3.connect(self.database_path) - cursor = connection.cursor() - query = "SELECT name FROM file WHERE name GLOB :pattern;" - sql_names = [] - for row in cursor.execute(query, {"pattern": self.pattern}): - path, = row - sql_names.append(os.path.basename(path)) + health_db = forest.db.health.HealthDB(connection) + sql_names = [os.path.basename(path) + for path in health_db.checked_files(self.pattern)] connection.close() # Find extra files @@ -61,12 +59,14 @@ def __call__(self): if len(extra_paths) > 0: print("connecting to: {}".format(self.database_path)) with forest.db.Database.connect(self.database_path) as database: + health_db = forest.db.health.HealthDB(database.connection) for path in extra_paths: print("inserting: '{}'".format(path)) try: database.insert_netcdf(path) except OSError as e: # S3 Glacier objects inaccessible via goofys + health_db.insert_error(path, e, dt.datetime.now()) print(e) print(f"skip file: {path}") continue diff --git a/test/test_db_health.py b/test/test_db_health.py new file mode 100644 index 000000000..e4ebaba63 --- /dev/null +++ b/test/test_db_health.py @@ -0,0 +1,25 @@ +import sqlite3 +import datetime as dt +import forest.db +import forest.db.health + + +def test_db_health_check(): + """Database tables to monitor S3 object availability""" + database = forest.db.Database.connect(":memory:") + database.insert_file_name("file.nc") + pattern = "*.nc" + health_db = forest.db.health.HealthDB(database.connection) + assert health_db.checked_files(pattern) == ["file.nc"] + + +def test_db_health_check_mark_oserror(): + """Database tables to monitor S3 object availability""" + database = forest.db.Database.connect(":memory:") + database.insert_file_name("file-0.nc") + health_db = forest.db.health.HealthDB(database.connection) + health_db.insert_error("file-1.nc", + OSError("Error message"), + dt.datetime(2020, 1, 1)) + pattern = "*.nc" + assert health_db.checked_files(pattern) == ["file-0.nc", "file-1.nc"] From b6ba6ee60f04a272836e6c93f2403cbf04579041 Mon Sep 17 00:00:00 2001 From: andrewgryan Date: Fri, 3 Jul 2020 15:39:09 +0100 Subject: [PATCH 2/3] better docstring --- forest/db/health.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/forest/db/health.py b/forest/db/health.py index 61ccd86e7..288b2bf9e 100644 --- a/forest/db/health.py +++ b/forest/db/health.py @@ -26,7 +26,10 @@ def connect(cls, path_or_memory): return cls(sqlite3.connect(path_or_memory)) def checked_files(self, pattern): - """Files that are in the database""" + """Files that are in the database + + :returns files: either successfully processed or marked as OSError + """ return sorted(set(self.files(pattern)) | set(self.error_files(pattern))) From ecd5adff5aba9091697d661a47565d04e65359ef Mon Sep 17 00:00:00 2001 From: andrewgryan Date: Fri, 3 Jul 2020 15:44:38 +0100 Subject: [PATCH 3/3] bump version to v0.20.7 --- forest/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/forest/__init__.py b/forest/__init__.py index f03cf9717..e0a665a52 100644 --- a/forest/__init__.py +++ b/forest/__init__.py @@ -28,7 +28,7 @@ .. automodule:: forest.services """ -__version__ = '0.20.6' +__version__ = '0.20.7' from .config import * from . import (