diff --git a/src/ralph/backends/data/async_es.py b/src/ralph/backends/data/async_es.py index 6ff2442fa..7339c5716 100644 --- a/src/ralph/backends/data/async_es.py +++ b/src/ralph/backends/data/async_es.py @@ -114,6 +114,7 @@ async def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[AsyncIterator[bytes], AsyncIterator[dict]]: """Read documents matching the query in the target index and yield them. @@ -130,6 +131,10 @@ async def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the encoding operation will be ignored and logged. If `False` (default), a `BackendException` will be raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yield: @@ -140,7 +145,7 @@ async def read( # noqa: PLR0913 BackendException: If a failure occurs during Elasticsearch connection. """ statements = super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) async for statement in statements: yield statement diff --git a/src/ralph/backends/data/async_mongo.py b/src/ralph/backends/data/async_mongo.py index a67fa4be7..c743b4dd8 100644 --- a/src/ralph/backends/data/async_mongo.py +++ b/src/ralph/backends/data/async_mongo.py @@ -122,6 +122,7 @@ async def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[AsyncIterator[bytes], AsyncIterator[dict]]: """Read documents matching the `query` from `target` collection and yield them. @@ -134,6 +135,10 @@ async def read( # noqa: PLR0913 If `chunk_size` is `None` it defaults to `READ_CHUNK_SIZE`. raw_output (bool): Whether to yield dictionaries or bytes. ignore_errors (bool): Whether to ignore errors when reading documents. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yield: @@ -145,7 +150,7 @@ async def read( # noqa: PLR0913 BackendParameterException: If a failure occurs with MongoDB collection. """ statements = super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) async for statement in statements: yield statement diff --git a/src/ralph/backends/data/base.py b/src/ralph/backends/data/base.py index 936b3a4df..a4dbd57e4 100644 --- a/src/ralph/backends/data/base.py +++ b/src/ralph/backends/data/base.py @@ -280,6 +280,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read records matching the `query` in the `target` container and yield them. @@ -299,6 +300,10 @@ def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the read operation are be ignored and logged. If `False` (default), a `BackendException` is raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yield: @@ -310,6 +315,19 @@ def read( # noqa: PLR0913 `ignore_errors` is set to `False`. BackendParameterException: If a backend argument value is not valid. """ + if greedy: + yield from list( + self.read( + query, + target, + chunk_size, + raw_output, + ignore_errors, + False, + max_statements, + ) + ) + return chunk_size = chunk_size if chunk_size else self.settings.READ_CHUNK_SIZE query = validate_backend_query(query, self.query_class, self.logger) reader = self._read_bytes if raw_output else self._read_dicts @@ -318,10 +336,14 @@ def read( # noqa: PLR0913 yield from statements return + if not max_statements: + return + + max_statements -= 1 for i, statement in enumerate(statements): + yield statement if i >= max_statements: return - yield statement @abstractmethod def _read_bytes( @@ -515,6 +537,7 @@ async def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[AsyncIterator[bytes], AsyncIterator[dict]]: """Read records matching the `query` in the `target` container and yield them. @@ -534,6 +557,10 @@ async def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the read operation are be ignored and logged. If `False` (default), a `BackendException` is raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yield: @@ -545,6 +572,25 @@ async def read( # noqa: PLR0913 `ignore_errors` is set to `False`. BackendParameterException: If a backend argument value is not valid. """ + if greedy: + greedy_statements = [ + statement + async for statement in self.read( + query, + target, + chunk_size, + raw_output, + ignore_errors, + False, + max_statements, + ) + ] + + for greedy_statement in greedy_statements: + yield greedy_statement + + return + chunk_size = chunk_size if chunk_size else self.settings.READ_CHUNK_SIZE query = validate_backend_query(query, self.query_class, self.logger) reader = self._read_bytes if raw_output else self._read_dicts @@ -553,12 +599,16 @@ async def read( # noqa: PLR0913 async for statement in statements: yield statement return + + if not max_statements: + return + i = 0 async for statement in statements: - if i >= max_statements: - return yield statement i += 1 + if i >= max_statements: + return @abstractmethod async def _read_bytes( diff --git a/src/ralph/backends/data/clickhouse.py b/src/ralph/backends/data/clickhouse.py index c012aaaaa..8dbcdb031 100755 --- a/src/ralph/backends/data/clickhouse.py +++ b/src/ralph/backends/data/clickhouse.py @@ -211,6 +211,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read documents matching the query in the target table and yield them. @@ -225,6 +226,10 @@ def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the encoding operation will be ignored and logged. If `False` (default), a `BackendException` will be raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yield: @@ -235,7 +240,7 @@ def read( # noqa: PLR0913 BackendException: If a failure occurs during ClickHouse connection. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_bytes( diff --git a/src/ralph/backends/data/es.py b/src/ralph/backends/data/es.py index 5093167a6..3d73f67c8 100644 --- a/src/ralph/backends/data/es.py +++ b/src/ralph/backends/data/es.py @@ -199,6 +199,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read documents matching the query in the target index and yield them. @@ -215,6 +216,10 @@ def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the encoding operation will be ignored and logged. If `False` (default), a `BackendException` will be raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yield: @@ -225,7 +230,7 @@ def read( # noqa: PLR0913 BackendException: If a failure occurs during Elasticsearch connection. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_bytes( diff --git a/src/ralph/backends/data/fs.py b/src/ralph/backends/data/fs.py index bc5d37e6c..b14e1e30f 100644 --- a/src/ralph/backends/data/fs.py +++ b/src/ralph/backends/data/fs.py @@ -152,6 +152,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read files matching the query in the target folder and yield them. @@ -169,6 +170,10 @@ def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the read operation will be ignored and logged. If `False` (default), a `BackendException` will be raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yields: @@ -180,7 +185,7 @@ def read( # noqa: PLR0913 `ignore_errors` is set to `False`. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_bytes( diff --git a/src/ralph/backends/data/ldp.py b/src/ralph/backends/data/ldp.py index 7d845e98b..e4ba400fe 100644 --- a/src/ralph/backends/data/ldp.py +++ b/src/ralph/backends/data/ldp.py @@ -156,6 +156,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = True, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read an archive matching the query in the target stream_id and yield it. @@ -168,6 +169,10 @@ def read( # noqa: PLR0913 If `chunk_size` is `None` it defaults to `READ_CHUNK_SIZE`. raw_output (bool): Should always be set to `True`. ignore_errors (bool): Ignored. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yields: @@ -179,7 +184,7 @@ def read( # noqa: PLR0913 or if the `raw_output` argument is set to `False`. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_dicts( diff --git a/src/ralph/backends/data/mongo.py b/src/ralph/backends/data/mongo.py index aa7959c9d..c6d01073b 100644 --- a/src/ralph/backends/data/mongo.py +++ b/src/ralph/backends/data/mongo.py @@ -178,6 +178,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read documents matching the `query` from `target` collection and yield them. @@ -190,6 +191,10 @@ def read( # noqa: PLR0913 If `chunk_size` is `None` it defaults to `READ_CHUNK_SIZE`. raw_output (bool): Whether to yield dictionaries or bytes. ignore_errors (bool): Whether to ignore errors when reading documents. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yields: @@ -201,7 +206,7 @@ def read( # noqa: PLR0913 BackendParameterException: If the `target` is not a valid collection name. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_bytes( diff --git a/src/ralph/backends/data/s3.py b/src/ralph/backends/data/s3.py index 5137082ae..7af379632 100644 --- a/src/ralph/backends/data/s3.py +++ b/src/ralph/backends/data/s3.py @@ -169,6 +169,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read an object matching the `query` in the `target` bucket and yield it. @@ -183,6 +184,10 @@ def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the read operation will be ignored and logged. If `False` (default), a `BackendException` will be raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yields: @@ -196,7 +201,7 @@ def read( # noqa: PLR0913 `ignore_errors` is set to `False`. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_bytes( diff --git a/src/ralph/backends/data/swift.py b/src/ralph/backends/data/swift.py index 5835d72b9..bda561573 100644 --- a/src/ralph/backends/data/swift.py +++ b/src/ralph/backends/data/swift.py @@ -179,6 +179,7 @@ def read( # noqa: PLR0913 chunk_size: Optional[int] = None, raw_output: bool = False, ignore_errors: bool = False, + greedy: bool = False, max_statements: Optional[int] = None, ) -> Union[Iterator[bytes], Iterator[dict]]: """Read objects matching the `query` in the `target` container and yield them. @@ -198,6 +199,10 @@ def read( # noqa: PLR0913 ignore_errors (bool): If `True`, errors during the read operation are be ignored and logged. If `False` (default), a `BackendException` is raised if an error occurs. + greedy: If set to `True`, the client will fetch all available records + before they are yielded by the generator. Caution: + this might potentially lead to large amounts of API calls and to the + memory filling up. max_statements (int): The maximum number of statements to yield. Yields: @@ -210,7 +215,7 @@ def read( # noqa: PLR0913 BackendParameterException: If a backend argument value is not valid. """ yield from super().read( - query, target, chunk_size, raw_output, ignore_errors, max_statements + query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements ) def _read_bytes( diff --git a/tests/backends/data/test_async_es.py b/tests/backends/data/test_async_es.py index 5ae687c13..f5b8b8354 100644 --- a/tests/backends/data/test_async_es.py +++ b/tests/backends/data/test_async_es.py @@ -382,13 +382,16 @@ async def mock_async_es_search(**kwargs): @pytest.mark.anyio -async def test_backends_data_async_es_read_with_raw_ouput(es, async_es_backend): +@pytest.mark.parametrize("greedy", [False, True]) +async def test_backends_data_async_es_read_with_raw_ouput(greedy, es, async_es_backend): """Test the `AsyncESDataBackend.read` method with `raw_output` set to `True`.""" backend = async_es_backend() documents = [{"id": idx, "timestamp": now()} for idx in range(10)] assert await backend.write(documents) == 10 - hits = [statement async for statement in backend.read(raw_output=True)] + hits = [ + statement async for statement in backend.read(raw_output=True, greedy=greedy) + ] for i, hit in enumerate(hits): assert isinstance(hit, bytes) assert json.loads(hit).get("_source") == documents[i] @@ -397,13 +400,16 @@ async def test_backends_data_async_es_read_with_raw_ouput(es, async_es_backend): @pytest.mark.anyio -async def test_backends_data_async_es_read_without_raw_ouput(es, async_es_backend): +@pytest.mark.parametrize("greedy", [False, True]) +async def test_backends_data_async_es_read_without_raw_ouput( + greedy, es, async_es_backend +): """Test the `AsyncESDataBackend.read` method with `raw_output` set to `False`.""" backend = async_es_backend() documents = [{"id": idx, "timestamp": now()} for idx in range(10)] assert await backend.write(documents) == 10 - hits = [statement async for statement in backend.read()] + hits = [statement async for statement in backend.read(greedy=greedy)] for i, hit in enumerate(hits): assert isinstance(hit, dict) assert hit.get("_source") == documents[i] diff --git a/tests/backends/data/test_async_mongo.py b/tests/backends/data/test_async_mongo.py index 003fe6ec8..7ec8ef5d0 100644 --- a/tests/backends/data/test_async_mongo.py +++ b/tests/backends/data/test_async_mongo.py @@ -314,7 +314,9 @@ async def test_backends_data_async_mongo_list_with_history( @pytest.mark.anyio +@pytest.mark.parametrize("greedy", [False, True]) async def test_backends_data_async_mongo_read_with_raw_output( + greedy, mongo, async_mongo_backend, ): @@ -334,7 +336,9 @@ async def test_backends_data_async_mongo_read_with_raw_output( await backend.collection.insert_many(documents) await backend.database.foobar.insert_many(documents[:2]) - result = [statement async for statement in backend.read(raw_output=True)] + result = [ + statement async for statement in backend.read(raw_output=True, greedy=greedy) + ] assert result == expected result = [ statement async for statement in backend.read(raw_output=True, target="foobar") @@ -351,8 +355,9 @@ async def test_backends_data_async_mongo_read_with_raw_output( @pytest.mark.anyio +@pytest.mark.parametrize("greedy", [False, True]) async def test_backends_data_async_mongo_read_without_raw_output( - mongo, async_mongo_backend + greedy, mongo, async_mongo_backend ): """Test the `AsyncMongoDataBackend.read` method with `raw_output` set to `False`. @@ -372,7 +377,7 @@ async def test_backends_data_async_mongo_read_without_raw_output( await backend.collection.insert_many(documents) await backend.database.foobar.insert_many(documents[:2]) - assert [statement async for statement in backend.read()] == expected + assert [statement async for statement in backend.read(greedy=greedy)] == expected assert [statement async for statement in backend.read(target="foobar")] == expected[ :2 ] diff --git a/tests/backends/data/test_base.py b/tests/backends/data/test_base.py index 4530519cb..2fab66faf 100644 --- a/tests/backends/data/test_base.py +++ b/tests/backends/data/test_base.py @@ -157,6 +157,56 @@ def close(self): assert not list(backend.read(max_statements=0, raw_output=True)) +def test_backends_data_base_read_with_greedy(): + """Test the `BaseDataBackend.read` method with `greedy` argument.""" + + def get_mock_read_dicts(data): + """Return a `_read_dict` mock method yielding data.""" + + def mock_read_dicts(self, *args): + """Mock the `BaseDataBackend._read_dicts` method yielding data.""" + for item in data: + yield item + + return mock_read_dicts + + class MockBaseDataBackend(BaseDataBackend[BaseDataBackendSettings, BaseQuery]): + """A class mocking the base database class.""" + + def _read_dicts(self, *args): + pass + + def _read_bytes(self, *args): + pass + + def status(self): + pass + + def close(self): + pass + + backend = MockBaseDataBackend() + # Given `greedy` set to `False`, the `read` method should consume data on demand. + data = ({"foo": "bar"} for _ in range(4)) + backend._read_dicts = get_mock_read_dicts(data) + assert next(backend.read()) == {"foo": "bar"} + assert len(list(data)) == 3 # one item was requested - one item was consumed. + + # Given `greedy` set to `True`, the `read` method should consume all data. + data = ({"foo": "bar"} for _ in range(4)) + backend._read_dicts = get_mock_read_dicts(data) + assert next(backend.read(greedy=True)) == {"foo": "bar"} + assert not list(data) # all items are consumed. + + # Given `greedy` set to `True` and a `max_statements` limit, the `read` method + # should consume all data until reaching `max_statements`. + data = ({"foo": "bar"} for _ in range(4)) + backend._read_dicts = get_mock_read_dicts(data) + statements = backend.read(greedy=True, max_statements=3) + assert next(statements) == {"foo": "bar"} + assert list(data) == [{"foo": "bar"}] # 3 items are consumed. + + @pytest.mark.anyio async def test_backends_data_base_async_read_with_max_statements(): """Test the async `BaseDataBackend.read` method with `max_statements` argument.""" diff --git a/tests/backends/data/test_clickhouse.py b/tests/backends/data/test_clickhouse.py index 6227f8af2..e52e8a680 100644 --- a/tests/backends/data/test_clickhouse.py +++ b/tests/backends/data/test_clickhouse.py @@ -110,7 +110,10 @@ def mock_query(*_, **__): backend.close() -def test_backends_data_clickhouse_read_with_raw_output(clickhouse, clickhouse_backend): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_clickhouse_read_with_raw_output( + greedy, clickhouse, clickhouse_backend +): """Test the `ClickHouseDataBackend.read` method.""" # Create records @@ -127,19 +130,19 @@ def test_backends_data_clickhouse_read_with_raw_output(clickhouse, clickhouse_ba backend = clickhouse_backend() backend.write(statements) - results = list(backend.read()) + results = list(backend.read(greedy=greedy)) assert len(results) == 3 assert results[0]["event"] == statements[0] assert results[1]["event"] == statements[1] assert results[2]["event"] == statements[2] - results = list(backend.read(chunk_size=10)) + results = list(backend.read(chunk_size=10, greedy=greedy)) assert len(results) == 3 assert results[0]["event"] == statements[0] assert results[1]["event"] == statements[1] assert results[2]["event"] == statements[2] - results = list(backend.read(raw_output=True)) + results = list(backend.read(raw_output=True, greedy=greedy)) assert len(results) == 3 assert isinstance(results[0], bytes) assert json.loads(results[0])["event"] == statements[0] diff --git a/tests/backends/data/test_es.py b/tests/backends/data/test_es.py index 84f235114..b199f54a9 100644 --- a/tests/backends/data/test_es.py +++ b/tests/backends/data/test_es.py @@ -338,13 +338,14 @@ def test_backends_data_es_read_with_ignore_errors(es, es_backend, monkeypatch, c backend.close() -def test_backends_data_es_read_with_raw_ouput(es, es_backend): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_es_read_with_raw_ouput(greedy, es, es_backend): """Test the `ESDataBackend.read` method with `raw_output` set to `True`.""" backend = es_backend() documents = [{"id": idx, "timestamp": now()} for idx in range(10)] assert backend.write(documents) == 10 - hits = list(backend.read(raw_output=True)) + hits = list(backend.read(raw_output=True, greedy=greedy)) for i, hit in enumerate(hits): assert isinstance(hit, bytes) assert json.loads(hit).get("_source") == documents[i] @@ -352,13 +353,14 @@ def test_backends_data_es_read_with_raw_ouput(es, es_backend): backend.close() -def test_backends_data_es_read_without_raw_ouput(es, es_backend): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_es_read_without_raw_ouput(greedy, es, es_backend): """Test the `ESDataBackend.read` method with `raw_output` set to `False`.""" backend = es_backend() documents = [{"id": idx, "timestamp": now()} for idx in range(10)] assert backend.write(documents) == 10 - hits = backend.read() + hits = backend.read(greedy=greedy) for i, hit in enumerate(hits): assert isinstance(hit, dict) assert hit.get("_source") == documents[i] diff --git a/tests/backends/data/test_fs.py b/tests/backends/data/test_fs.py index f6cf53c29..159a90e89 100644 --- a/tests/backends/data/test_fs.py +++ b/tests/backends/data/test_fs.py @@ -357,7 +357,8 @@ def test_backends_data_fs_list_with_history_and_details(fs_backend, fs): assert sorted(result, key=itemgetter("path")) == expected -def test_backends_data_fs_read_with_raw_ouput(fs_backend, fs, monkeypatch): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_fs_read_with_raw_ouput(greedy, fs_backend, fs, monkeypatch): """Test the `FSDataBackend.read` method with `raw_output` set to `True`.""" # Create files in absolute path directory. @@ -377,7 +378,7 @@ def test_backends_data_fs_read_with_raw_ouput(fs_backend, fs, monkeypatch): # Given no `target`, the `read` method should read all files in the default # directory and yield bytes. - result = backend.read(raw_output=True) + result = backend.read(raw_output=True, greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [b"baz"] @@ -396,7 +397,7 @@ def test_backends_data_fs_read_with_raw_ouput(fs_backend, fs, monkeypatch): # Given an absolute `target` path, the `read` method should read all files in the # target directory and yield bytes. - result = backend.read(raw_output=True, target=absolute_path) + result = backend.read(raw_output=True, target=absolute_path, greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [b"foo", b"bar"] @@ -423,7 +424,7 @@ def test_backends_data_fs_read_with_raw_ouput(fs_backend, fs, monkeypatch): # Given a relative `target` path, the `read` method should read all files in the # target directory relative to the default directory and yield bytes. - result = backend.read(raw_output=True, target="./bar") + result = backend.read(raw_output=True, target="./bar", greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [b"qux"] @@ -443,7 +444,9 @@ def test_backends_data_fs_read_with_raw_ouput(fs_backend, fs, monkeypatch): # Given a `chunk_size` and an absolute `target` path, # the `read` method should write the output bytes in chunks of the specified # `chunk_size`. - result = backend.read(raw_output=True, target=absolute_path, chunk_size=2) + result = backend.read( + raw_output=True, target=absolute_path, chunk_size=2, greedy=greedy + ) assert isinstance(result, Iterable) assert list(result) == [b"fo", b"o", b"ba", b"r"] @@ -469,7 +472,8 @@ def test_backends_data_fs_read_with_raw_ouput(fs_backend, fs, monkeypatch): ] -def test_backends_data_fs_read_without_raw_output(fs_backend, fs, monkeypatch): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_fs_read_without_raw_output(greedy, fs_backend, fs, monkeypatch): """Test the `FSDataBackend.read` method with `raw_output` set to `False`.""" # File contents. @@ -494,7 +498,7 @@ def test_backends_data_fs_read_without_raw_output(fs_backend, fs, monkeypatch): # Given no `target`, the `read` method should read all files in the default # directory and yield dictionaries. - result = backend.read(raw_output=False) + result = backend.read(raw_output=False, greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [valid_dictionary, valid_dictionary] @@ -513,7 +517,7 @@ def test_backends_data_fs_read_without_raw_output(fs_backend, fs, monkeypatch): # Given an absolute `target` path, the `read` method should read all files in the # target directory and yield dictionaries. - result = backend.read(raw_output=False, target=absolute_path) + result = backend.read(raw_output=False, target=absolute_path, greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [valid_dictionary] @@ -532,7 +536,7 @@ def test_backends_data_fs_read_without_raw_output(fs_backend, fs, monkeypatch): # Given a relative `target` path, the `read` method should read all files in the # target directory relative to the default directory and yield dictionaries. - result = backend.read(raw_output=False, target="bar") + result = backend.read(raw_output=False, target="bar", greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [valid_dictionary, valid_dictionary, valid_dictionary] diff --git a/tests/backends/data/test_ldp.py b/tests/backends/data/test_ldp.py index 8418c57db..d2191fa1e 100644 --- a/tests/backends/data/test_ldp.py +++ b/tests/backends/data/test_ldp.py @@ -564,7 +564,8 @@ def mock_requests_get(url, stream=True, timeout=None): next(backend.read(query="foo")) -def test_backends_data_ldp_read_with_query(ldp_backend, monkeypatch, fs): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_ldp_read_with_query(greedy, ldp_backend, monkeypatch, fs): """Test the `LDPDataBackend.read` method, given a query argument.""" # Create fake archive to stream. @@ -614,7 +615,9 @@ def mock_ovh_get(url): with requests_mock.Mocker() as request_mocker: request_mocker.get("http://example.com", content=archive) - result = b"".join(backend.read(query="5d5c4c93-04a4-42c5-9860-f51fa4044aa1")) + result = b"".join( + backend.read(query="5d5c4c93-04a4-42c5-9860-f51fa4044aa1", greedy=greedy) + ) assert os.path.exists(settings.HISTORY_FILE) assert backend.history == [ diff --git a/tests/backends/data/test_mongo.py b/tests/backends/data/test_mongo.py index b601d88a5..f859c8058 100644 --- a/tests/backends/data/test_mongo.py +++ b/tests/backends/data/test_mongo.py @@ -250,7 +250,8 @@ def test_backends_data_mongo_list_with_history(mongo_backend, caplog): backend.close() -def test_backends_data_mongo_read_with_raw_output(mongo, mongo_backend): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_mongo_read_with_raw_output(greedy, mongo, mongo_backend): """Test the `MongoDataBackend.read` method with `raw_output` set to `True`.""" backend = mongo_backend() @@ -266,14 +267,15 @@ def test_backends_data_mongo_read_with_raw_output(mongo, mongo_backend): ] backend.collection.insert_many(documents) backend.database.foobar.insert_many(documents[:2]) - assert list(backend.read(raw_output=True)) == expected + assert list(backend.read(raw_output=True, greedy=greedy)) == expected assert list(backend.read(raw_output=True, target="foobar")) == expected[:2] assert list(backend.read(raw_output=True, chunk_size=2)) == expected assert list(backend.read(raw_output=True, chunk_size=1000)) == expected backend.close() -def test_backends_data_mongo_read_without_raw_output(mongo, mongo_backend): +@pytest.mark.parametrize("greedy", [False, True]) +def test_backends_data_mongo_read_without_raw_output(greedy, mongo, mongo_backend): """Test the `MongoDataBackend.read` method with `raw_output` set to `False`.""" backend = mongo_backend() @@ -289,7 +291,7 @@ def test_backends_data_mongo_read_without_raw_output(mongo, mongo_backend): ] backend.collection.insert_many(documents) backend.database.foobar.insert_many(documents[:2]) - assert list(backend.read()) == expected + assert list(backend.read(greedy=greedy)) == expected assert list(backend.read(target="foobar")) == expected[:2] assert list(backend.read(chunk_size=2)) == expected assert list(backend.read(chunk_size=1000)) == expected diff --git a/tests/backends/data/test_s3.py b/tests/backends/data/test_s3.py index 0ba014907..7458a6bf7 100644 --- a/tests/backends/data/test_s3.py +++ b/tests/backends/data/test_s3.py @@ -226,7 +226,9 @@ def test_backends_data_s3_list_with_failed_connection_should_log_the_error( @mock_s3 +@pytest.mark.parametrize("greedy", [False, True]) def test_backends_data_s3_read_with_valid_name_should_write_to_history( + greedy, s3_backend, monkeypatch, ): @@ -267,6 +269,7 @@ def test_backends_data_s3_read_with_valid_name_should_write_to_history( target=bucket_name, chunk_size=1000, raw_output=True, + greedy=greedy, ) ) @@ -278,12 +281,7 @@ def test_backends_data_s3_read_with_valid_name_should_write_to_history( "timestamp": freezed_now, } in backend.history - list( - backend.read( - query="2022-09-30.gz", - raw_output=False, - ) - ) + list(backend.read(query="2022-09-30.gz", raw_output=False, greedy=greedy)) assert { "backend": "s3", diff --git a/tests/backends/data/test_swift.py b/tests/backends/data/test_swift.py index 1f07d1724..ad637fde5 100644 --- a/tests/backends/data/test_swift.py +++ b/tests/backends/data/test_swift.py @@ -278,8 +278,9 @@ def mock_get_container(*args, **kwargs): backend.close() +@pytest.mark.parametrize("greedy", [False, True]) def test_backends_data_swift_read_with_raw_output( - swift_backend, monkeypatch, fs, settings_fs + greedy, swift_backend, monkeypatch, fs, settings_fs ): """Test the `SwiftDataBackend.read` method with `raw_output` set to `True`.""" @@ -300,7 +301,7 @@ def mock_get_object(*args, **kwargs): fs.create_file(settings.HISTORY_FILE, contents=json.dumps([])) # The `read` method should read the object and yield bytes. - result = backend.read(raw_output=True, query="2020-04-29.gz") + result = backend.read(raw_output=True, query="2020-04-29.gz", greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [content] @@ -339,8 +340,9 @@ def mock_get_object(*args, **kwargs): backend.close() +@pytest.mark.parametrize("greedy", [False, True]) def test_backends_data_swift_read_without_raw_output( - swift_backend, monkeypatch, fs, settings_fs + greedy, swift_backend, monkeypatch, fs, settings_fs ): """Test the `SwiftDataBackend.read` method with `raw_output` set to `False`.""" @@ -362,7 +364,7 @@ def mock_get_object(*args, **kwargs): fs.create_file(settings.HISTORY_FILE, contents=json.dumps([])) # The `read` method should read the object and yield bytes. - result = backend.read(raw_output=False, query="2020-04-29.gz") + result = backend.read(raw_output=False, query="2020-04-29.gz", greedy=greedy) assert isinstance(result, Iterable) assert list(result) == [content_dict]