From 593e129d4af37b75f4199d4de148fdccc4fa4a6d Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 8 Nov 2023 12:24:07 -0500 Subject: [PATCH 01/22] Add latest Python to the test matrix to detrmine GH Actions support Closes #814 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f4786005..ce5f7bb0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,10 +15,10 @@ jobs: - macos-latest # - windows-latest python: - - "3.8" - "3.9" - "3.10" - "3.11" + - "3.12" ipfs: - "0.22" - "0.23" From 12e7dda6bc82652d065c73c8bcf367e88766f626 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 8 Nov 2023 12:31:44 -0500 Subject: [PATCH 02/22] Add explicit installation of setuptools while testing for #814 --- test-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test-requirements.txt b/test-requirements.txt index 4da99efc..d0f1b941 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -2,3 +2,4 @@ flake8>=3.7.9 pytest>=5.3.5 pytest-cov pytest-flake8 +setuptools From 76ef7bd8d179c29ac1f251ef4f89fc0e87c84839 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 8 Nov 2023 12:40:32 -0500 Subject: [PATCH 03/22] Add six to the test requirements for Py 3.12 GH Action for #814 --- test-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test-requirements.txt b/test-requirements.txt index d0f1b941..369c6128 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,3 +3,4 @@ pytest>=5.3.5 pytest-cov pytest-flake8 setuptools +six From c01462a3650b2523f9d742f99d6dfccd6b021c06 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 8 Nov 2023 13:55:50 -0500 Subject: [PATCH 04/22] Rm six from requirements for #814 --- test-requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/test-requirements.txt b/test-requirements.txt index 369c6128..d0f1b941 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,4 +3,3 @@ pytest>=5.3.5 pytest-cov pytest-flake8 setuptools -six From 8bb2931eb1575dabb38a2e79519efce547b3f120 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 8 Nov 2023 15:16:48 -0500 Subject: [PATCH 05/22] Use packaging module to compare versions for #814 --- ipwb/util.py | 2 +- requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ipwb/util.py b/ipwb/util.py index 02e9cea0..1ebf6d96 100644 --- a/ipwb/util.py +++ b/ipwb/util.py @@ -24,7 +24,7 @@ from ipfshttpclient.exceptions import ConnectionError, AddressError from multiaddr.exceptions import StringParseError -from pkg_resources import parse_version +from packaging.version import parse as parse_version from .exceptions import IPFSDaemonNotAvailable diff --git a/requirements.txt b/requirements.txt index ae88d1df..e2cad882 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ requests>=2.19.1 beautifulsoup4>=4.6.3 surt>=0.3.0 multiaddr >= 0.0.9 +packaging==23.0 From e88627aa2e49decf6e20f424b6e1550a48092c69 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 24 Apr 2024 14:21:09 -0400 Subject: [PATCH 06/22] Rm usage of deprecated pkg_resources for path resolution Re:#814 --- ipwb/replay.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ipwb/replay.py b/ipwb/replay.py index 8608d861..6d83b0b2 100755 --- a/ipwb/replay.py +++ b/ipwb/replay.py @@ -11,6 +11,7 @@ import sys import os +import importlib.resources import ipfshttpclient as ipfsapi import json import subprocess @@ -1018,8 +1019,10 @@ def get_index_file_full_path(cdxj_file_path=INDEX_FILE): if os.path.isfile(cdxj_file_path): return cdxj_file_path - index_file_name = pkg_resources.resource_filename( - __name__, index_file_path) + # index_file_name = pkg_resources.resource_filename( + # __name__, index_file_path) + index_file_name = importlib.resources.files( + __name__).joinpath(index_file_path) return index_file_name From cf5187b842da427761b4a8bc71bab678f4913b24 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 24 Apr 2024 14:31:00 -0400 Subject: [PATCH 07/22] Update mock assertion method for Py 3.12 for #814 --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 573af3ee..d56affee 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.called_once_with(('boo', )) + assert mock_logger.assert_called_once_with(('boo', )) From 7cf144b9549580acd91327e8fe4b6d36d1cd6535 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 24 Apr 2024 15:05:48 -0400 Subject: [PATCH 08/22] Tweak test params for #814 --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index d56affee..70b97796 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_once_with(('boo', )) + assert mock_logger.assert_called_once_with('boo') From c166c77f7359fb3bd65f491c7216016087c95fb7 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 24 Apr 2024 15:23:01 -0400 Subject: [PATCH 09/22] Rv test param for #814 --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 70b97796..444ea39c 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_once_with('boo') + assert mock_logger.called_once_with('boo') From a20def210a18ac901945b852a56cd684ed2f318b Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 14:11:20 -0400 Subject: [PATCH 10/22] Fix clarity in README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 14361d9f..a3edf11a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![pypi](https://img.shields.io/pypi/v/ipwb.svg)](https://pypi.org/project/ipwb) [![codecov](https://codecov.io/gh/oduwsdl/ipwb/branch/master/graph/badge.svg)](https://codecov.io/gh/oduwsdl/ipwb) -InterPlanetary Wayback (ipwb) facilitates permanence and collaboration in web archives by disseminating the contents of [WARC](http://www.iso.org/iso/catalogue_detail.htm?csnumber=44717) files into the IPFS network. [IPFS](https://ipfs.io/) is a peer-to-peer content-addressable file system that inherently allows deduplication and facilitates opt-in replication. ipwb splits the header and payload of WARC response records before disseminating into IPFS to leverage the deduplication, builds a [CDXJ index](https://github.com/oduwsdl/ORS/wiki/CDXJ) with references to the IPFS hashes returned, and combines the header and payload from IPFS at the time of replay. +InterPlanetary Wayback (ipwb) facilitates permanence and collaboration in web archives by disseminating the contents of [WARC](http://www.iso.org/iso/catalogue_detail.htm?csnumber=44717) files into the IPFS network. [IPFS](https://ipfs.io/) is a peer-to-peer content-addressable file system that inherently allows deduplication and facilitates opt-in replication. ipwb splits the header and payload of WARC response records before disseminating into IPFS to leverage the deduplication, builds a [CDXJ index](https://github.com/oduwsdl/ORS/wiki/CDXJ) with references to the IPFS hashes that are returned, and combines the header and payload from IPFS at the time of replay. InterPlanetary Wayback primarily consists of two scripts: @@ -90,7 +90,7 @@ $ ipwb replay QmYwAPJzv5CZsnANOTaREALhashYgPpHdWEz79ojWnPbdG Once started, the replay system's web interface can be accessed through a web browser, e.g., by default. -To run it under a domain name other than `localhost`, the easiest approach is to use a reverse proxy that supports HTTPS. The replay system utilizes [Service Worker](https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API) for URL rerouting/rewriting to prevent [live leakage (zombies)](http://ws-dl.blogspot.com/2012/10/2012-10-10-zombies-in-archives.html). However, for security reason many web browsers have mandated HTTPS for the Service Worker API with only exception if the domain is `localhost`. [Caddy Server](https://caddyserver.com/) and [Traefik](https://traefik.io/) can be used as a reverse-proxy server and are very easy to setup. They come with built-in HTTPS support and manage (install and update) TLS certificates transparently and automatically from [Let's Encrypt](https://letsencrypt.org/). However, any web server proxy that has HTTPS support on the front-end will work. To make ipwb replay aware of the proxy, use `--proxy` or `-P` flag to supply the proxy URL. This way the replay will yield the supplied proxy URL as a prefix when generating various fully qualified domain name (FQDN) URIs or absolute URIs (for example, those in the TimeMap or Link header) instead of the default `http://localhost:2016`. This can be necessary when the service is running in a private network or a container and only exposed via a reverse-proxy. Suppose a reverse-proxy server is running and ready to forward all traffic on the `https://ipwb.example.com` to the ipwb replay server then the replay can be started as following: +To run it under a domain name other than `localhost`, the easiest approach is to use a reverse proxy that supports HTTPS. The replay system utilizes [Service Worker](https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API) for URL rerouting/rewriting to prevent [live leakage (zombies)](http://ws-dl.blogspot.com/2012/10/2012-10-10-zombies-in-archives.html). However, for security reason many web browsers have mandated HTTPS for the Service Worker API with only exception if the domain is `localhost`. [Caddy Server](https://caddyserver.com/) and [Traefik](https://traefik.io/) can be used as a reverse-proxy server and are very easy to setup. They come with built-in HTTPS support and manage (install and update) TLS certificates transparently and automatically from [Let's Encrypt](https://letsencrypt.org/). However, any web server proxy that has HTTPS support on the front-end will work. To make ipwb replay aware of the proxy, use `--proxy` or `-P` flag to supply the proxy URL. This way the replay will yield the supplied proxy URL as a prefix when generating various fully qualified domain name (FQDN) URIs or absolute URIs (for example, those in the TimeMap or Link header) instead of the default `http://localhost:2016`. This can be necessary when the service is running in a private network or a container, and only exposed via a reverse-proxy. Suppose a reverse-proxy server is running and ready to forward all traffic on the `https://ipwb.example.com` to the ipwb replay server then the replay can be started as following: ``` $ ipwb replay --proxy=https://ipwb.example.com @@ -121,7 +121,7 @@ To build an image from the source, run the following command from the directory $ docker image build -t oduwsdl/ipwb . ``` -By default, the image building process also performs tests, so it might take a while to build the image. It ensures that an image will not be created with failing tests. However, it is possible to skip tests by supplying a build-arg `--build-arg SKIPTEST=true` as illustrated below: +By default, the image building process also performs tests, so it might take a while to build the image. It ensures that an image will not be created with failing tests. However, it is possible to skip tests by supplying a build-arg `--build-arg SKIPTEST=true` as shown below: ``` $ docker image build --build-arg SKIPTEST=true -t oduwsdl/ipwb . @@ -201,7 +201,7 @@ This repo contains the code for integrating [WARC](http://www.iso.org/iso/catalo ### Citing Project -We have numerous publications related to this project, but the most significant and primary one was published in TPDL 2016. ([Read the PDF](https://matkelly.com/papers/2016_tpdl_ipwb.pdf)) +There are numerous publications related to this project, but the most significant and primary one was published in TPDL 2016. ([Read the PDF](https://matkelly.com/papers/2016_tpdl_ipwb.pdf)) > Mat Kelly, Sawood Alam, Michael L. Nelson, and Michele C. Weigle. __InterPlanetary Wayback: Peer-To-Peer Permanence of Web Archives__. In _Proceedings of the 20th International Conference on Theory and Practice of Digital Libraries_, pages 411–416, Hamburg, Germany, June 2016. From d11339c100d9087329af133ebe44d0a2adc6c902 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 14:19:34 -0400 Subject: [PATCH 11/22] Add space to invoke GH testing --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0118ab5b..36b6566b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ jobs: os: - ubuntu-latest - macos-latest - # - windows-latest + # # - windows-latest python: - "3.9" - "3.10" From 17ed3f8b519ffd0c938d701121ddf90f2f1113a4 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 14:20:41 -0400 Subject: [PATCH 12/22] Rm space --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 36b6566b..0118ab5b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ jobs: os: - ubuntu-latest - macos-latest - # # - windows-latest + # - windows-latest python: - "3.9" - "3.10" From d0462e9776b586dced753ab5a30b388d5e13df5e Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 14:49:58 -0400 Subject: [PATCH 13/22] Update mock assertion --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 444ea39c..70b97796 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.called_once_with('boo') + assert mock_logger.assert_called_once_with('boo') From 45bc18af3d99c1925a9809663432893c2cc63d1e Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 15:08:38 -0400 Subject: [PATCH 14/22] Update parameters for mock logged assertion --- ipwb/error_handler.py | 6 +++--- tests/test_error_handler.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ipwb/error_handler.py b/ipwb/error_handler.py index a00e0138..06242fbd 100644 --- a/ipwb/error_handler.py +++ b/ipwb/error_handler.py @@ -7,7 +7,7 @@ def exception_logger(catch=True, exception_class=Exception): """ - Decorator which catches exceptions in the function and logs them. + Decorator that catches exceptions in the function and logs them. Usage: @@ -17,11 +17,11 @@ def decorated_function(foo, bar): do_something ``` - `exception_logger()` will catch any exception which happens in + `exception_logger()` will catch any exception that happens in `decorated_function()` while it is being executed, and log an error using Python built in `logging` library. - Unless `catch` argument is `False` - in which case the exception will be + Unless `catch` argument is `False` - in which case, the exception will be reraised. """ def decorator(f: Callable): diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 70b97796..0544ac16 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_once_with('boo') + assert mock_logger.assert_called_once_with('*','boo') From ccdb7455331614ab5d1a4a6361fd8d806ee15013 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 15:20:28 -0400 Subject: [PATCH 15/22] Rm extra param for assertion --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 0544ac16..70b97796 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_once_with('*','boo') + assert mock_logger.assert_called_once_with('boo') From 85ae7570e034f89f401eee3fa2db5ff6b3f3dc75 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 15:31:36 -0400 Subject: [PATCH 16/22] Rm package_resources from replay due to lack of Py 3.12 support --- ipwb/replay.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ipwb/replay.py b/ipwb/replay.py index 6d83b0b2..cedbaaeb 100755 --- a/ipwb/replay.py +++ b/ipwb/replay.py @@ -15,7 +15,6 @@ import ipfshttpclient as ipfsapi import json import subprocess -import pkg_resources import surt import re import traceback @@ -1019,8 +1018,6 @@ def get_index_file_full_path(cdxj_file_path=INDEX_FILE): if os.path.isfile(cdxj_file_path): return cdxj_file_path - # index_file_name = pkg_resources.resource_filename( - # __name__, index_file_path) index_file_name = importlib.resources.files( __name__).joinpath(index_file_path) return index_file_name From b6a288ec27f39050026244f94a59e84090a7faf9 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 15:41:01 -0400 Subject: [PATCH 17/22] Replace placeholder string in assertion --- tests/test_error_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 70b97796..0707377b 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -1,5 +1,5 @@ import pytest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, patch, ANY from ipwb.error_handler import exception_logger @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_once_with('boo') + assert mock_logger.assert_called_once_with(ANY) From 48c1ba1fe6f43b0bd2c665135bd00db7fdf4fc07 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 16:04:34 -0400 Subject: [PATCH 18/22] Not once, with --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 0707377b..1285e994 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_once_with(ANY) + assert mock_logger.assert_called_with(ANY) From d9044135ce38ec6da25d59ac1f92bf8adc52d8cb Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 16:10:46 -0400 Subject: [PATCH 19/22] Rm assert --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 1285e994..c628d20f 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - assert mock_logger.assert_called_with(ANY) + mock_logger.assert_called_with(ANY) From 68fabe7e0f96b6e27ee50f3573408c7a54323e43 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 16:11:17 -0400 Subject: [PATCH 20/22] Restore boo --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index c628d20f..9a8773b2 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - mock_logger.assert_called_with(ANY) + mock_logger.assert_called_with('boo') From b8273d6d4eca328c99ca5a27b23855574e06d3cb Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 16:15:32 -0400 Subject: [PATCH 21/22] Restore called once assertion --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index 9a8773b2..c2ff7ba4 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - mock_logger.assert_called_with('boo') + mock_logger.assert_called_once('boo') From 33abe92f0801ba31beb8a5b0ff1e999b3922ae53 Mon Sep 17 00:00:00 2001 From: Mat Kelly Date: Wed, 16 Oct 2024 16:29:23 -0400 Subject: [PATCH 22/22] Need with 'with' variant of the assert method if passing the param, for #814 --- tests/test_error_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_error_handler.py b/tests/test_error_handler.py index c2ff7ba4..74609011 100644 --- a/tests/test_error_handler.py +++ b/tests/test_error_handler.py @@ -24,4 +24,4 @@ def test_catch(): with patch('ipwb.error_handler.logger.critical', mock_logger): caught_error('boo') - mock_logger.assert_called_once('boo') + mock_logger.assert_called_once_with('boo')