✨(backends) add greedy option for data backends
We try to align the data backend interface with the HTTP backends.
The greedy option allows the caller to greedily read all records
before they are yielded by the generator.
SergioSim committed Nov 14, 2023
1 parent 27ef4ff commit a80bb8d
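As a rough usage sketch of the new option (the backend class, settings, and query value below are illustrative, not taken from this commit), the flag is passed directly to `read`:

```python
from ralph.backends.data.fs import FSDataBackend

backend = FSDataBackend()  # hypothetical: default settings

# Default, lazy behaviour: records are fetched as the caller iterates.
lazy_records = backend.read(query="statements.jsonl")

# Greedy behaviour: every record is fetched up front and replayed from
# memory, so iterating no longer touches the backend.
greedy_records = backend.read(query="statements.jsonl", greedy=True)
```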
Showing 20 changed files with 222 additions and 52 deletions.
7 changes: 6 additions & 1 deletion src/ralph/backends/data/async_es.py
@@ -114,6 +114,7 @@ async def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[AsyncIterator[bytes], AsyncIterator[dict]]:
         """Read documents matching the query in the target index and yield them.
@@ -130,6 +131,10 @@ async def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the encoding operation
                 will be ignored and logged. If `False` (default), a `BackendException`
                 will be raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -140,7 +145,7 @@ async def read( # noqa: PLR0913
             BackendException: If a failure occurs during Elasticsearch connection.
         """
         statements = super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )
         async for statement in statements:
             yield statement
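The async backends take the same flag. A minimal consumption sketch (assuming the `AsyncESDataBackend` class defined in this module, reachable with its default connection settings):

```python
import asyncio

from ralph.backends.data.async_es import AsyncESDataBackend


async def main() -> None:
    backend = AsyncESDataBackend()  # assumes default settings are valid here
    # With greedy=True, all matching documents are fetched from
    # Elasticsearch before the first one is yielded.
    async for statement in backend.read(greedy=True, max_statements=100):
        print(statement)


asyncio.run(main())
```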
7 changes: 6 additions & 1 deletion src/ralph/backends/data/async_mongo.py
@@ -122,6 +122,7 @@ async def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[AsyncIterator[bytes], AsyncIterator[dict]]:
         """Read documents matching the `query` from `target` collection and yield them.
@@ -134,6 +135,10 @@ async def read( # noqa: PLR0913
                 If `chunk_size` is `None` it defaults to `READ_CHUNK_SIZE`.
             raw_output (bool): Whether to yield dictionaries or bytes.
             ignore_errors (bool): Whether to ignore errors when reading documents.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -145,7 +150,7 @@ async def read( # noqa: PLR0913
             BackendParameterException: If a failure occurs with MongoDB collection.
         """
         statements = super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )
         async for statement in statements:
             yield statement
56 changes: 53 additions & 3 deletions src/ralph/backends/data/base.py
@@ -280,6 +280,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read records matching the `query` in the `target` container and yield them.
@@ -299,6 +300,10 @@ def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the read operation
                 are ignored and logged. If `False` (default), a `BackendException`
                 is raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -310,6 +315,19 @@ def read( # noqa: PLR0913
                 `ignore_errors` is set to `False`.
             BackendParameterException: If a backend argument value is not valid.
         """
+        if greedy:
+            yield from list(
+                self.read(
+                    query,
+                    target,
+                    chunk_size,
+                    raw_output,
+                    ignore_errors,
+                    False,
+                    max_statements,
+                )
+            )
+            return
         chunk_size = chunk_size if chunk_size else self.settings.READ_CHUNK_SIZE
         query = validate_backend_query(query, self.query_class, self.logger)
         reader = self._read_bytes if raw_output else self._read_dicts
@@ -318,10 +336,14 @@ def read( # noqa: PLR0913
             yield from statements
             return

+        if not max_statements:
+            return
+
+        max_statements -= 1
         for i, statement in enumerate(statements):
+            yield statement
             if i >= max_statements:
                 return
-            yield statement

     @abstractmethod
     def _read_bytes(
@@ -515,6 +537,7 @@ async def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[AsyncIterator[bytes], AsyncIterator[dict]]:
         """Read records matching the `query` in the `target` container and yield them.
@@ -534,6 +557,10 @@ async def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the read operation
                 are ignored and logged. If `False` (default), a `BackendException`
                 is raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -545,6 +572,25 @@ async def read( # noqa: PLR0913
                 `ignore_errors` is set to `False`.
             BackendParameterException: If a backend argument value is not valid.
         """
+        if greedy:
+            greedy_statements = [
+                statement
+                async for statement in self.read(
+                    query,
+                    target,
+                    chunk_size,
+                    raw_output,
+                    ignore_errors,
+                    False,
+                    max_statements,
+                )
+            ]
+
+            for greedy_statement in greedy_statements:
+                yield greedy_statement
+
+            return
+
         chunk_size = chunk_size if chunk_size else self.settings.READ_CHUNK_SIZE
         query = validate_backend_query(query, self.query_class, self.logger)
         reader = self._read_bytes if raw_output else self._read_dicts
@@ -553,12 +599,16 @@ async def read( # noqa: PLR0913
             async for statement in statements:
                 yield statement
             return

+        if not max_statements:
+            return
+
         i = 0
         async for statement in statements:
-            if i >= max_statements:
-                return
             yield statement
             i += 1
+            if i >= max_statements:
+                return

     @abstractmethod
     async def _read_bytes(
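The substance of the change is in the `base.py` hunks above: when `greedy` is set, `read` calls itself once with `greedy=False`, drains that generator into a list, and only then starts yielding, so query validation, `max_statements`, and error handling behave identically in both modes. Distilled to its core, the pattern looks like this (a simplified sketch, not the exact Ralph code):

```python
from typing import Iterator


def read(records: Iterator[dict], greedy: bool = False) -> Iterator[dict]:
    """Yield records, optionally materializing them all up front."""
    if greedy:
        # Drain the lazy path first: all fetching (and any fetch errors)
        # happens here, before the first record is yielded.
        yield from list(read(records, greedy=False))
        return
    # Lazy path: records are pulled one at a time as the caller iterates.
    yield from records
```

The reworked `max_statements` loop is a related refinement: it now yields each statement before checking the (decremented) counter, so the generator stops immediately after the last requested statement instead of pulling one more record from the backend first.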
7 changes: 6 additions & 1 deletion src/ralph/backends/data/clickhouse.py
@@ -211,6 +211,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read documents matching the query in the target table and yield them.
@@ -225,6 +226,10 @@ def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the encoding operation
                 will be ignored and logged. If `False` (default), a `BackendException`
                 will be raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -235,7 +240,7 @@ def read( # noqa: PLR0913
             BackendException: If a failure occurs during ClickHouse connection.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_bytes(
7 changes: 6 additions & 1 deletion src/ralph/backends/data/es.py
@@ -199,6 +199,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read documents matching the query in the target index and yield them.
@@ -215,6 +216,10 @@ def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the encoding operation
                 will be ignored and logged. If `False` (default), a `BackendException`
                 will be raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -225,7 +230,7 @@ def read( # noqa: PLR0913
             BackendException: If a failure occurs during Elasticsearch connection.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_bytes(
7 changes: 6 additions & 1 deletion src/ralph/backends/data/fs.py
@@ -152,6 +152,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read files matching the query in the target folder and yield them.
@@ -169,6 +170,10 @@ def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the read operation
                 will be ignored and logged. If `False` (default), a `BackendException`
                 will be raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -180,7 +185,7 @@ def read( # noqa: PLR0913
                 `ignore_errors` is set to `False`.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_bytes(
7 changes: 6 additions & 1 deletion src/ralph/backends/data/ldp.py
@@ -156,6 +156,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = True,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read an archive matching the query in the target stream_id and yield it.
@@ -168,6 +169,10 @@ def read( # noqa: PLR0913
                 If `chunk_size` is `None` it defaults to `READ_CHUNK_SIZE`.
             raw_output (bool): Should always be set to `True`.
             ignore_errors (bool): Ignored.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -179,7 +184,7 @@ def read( # noqa: PLR0913
                 or if the `raw_output` argument is set to `False`.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_dicts(
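Note the LDP quirk preserved here: `read` only supports `raw_output=True`, so a greedy read buffers a whole raw archive in memory. A sketch (class name per this module; the archive id and default settings are illustrative):

```python
from ralph.backends.data.ldp import LDPDataBackend

backend = LDPDataBackend()  # hypothetical: default OVH settings
# greedy=True fetches the entire archive before yielding any chunk.
archive_bytes = b"".join(backend.read(query="<archive_id>", greedy=True))
```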
7 changes: 6 additions & 1 deletion src/ralph/backends/data/mongo.py
@@ -178,6 +178,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read documents matching the `query` from `target` collection and yield them.
@@ -190,6 +191,10 @@ def read( # noqa: PLR0913
                 If `chunk_size` is `None` it defaults to `READ_CHUNK_SIZE`.
             raw_output (bool): Whether to yield dictionaries or bytes.
             ignore_errors (bool): Whether to ignore errors when reading documents.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -201,7 +206,7 @@ def read( # noqa: PLR0913
             BackendParameterException: If the `target` is not a valid collection name.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_bytes(
7 changes: 6 additions & 1 deletion src/ralph/backends/data/s3.py
@@ -169,6 +169,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read an object matching the `query` in the `target` bucket and yield it.
@@ -183,6 +184,10 @@ def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the read operation
                 will be ignored and logged. If `False` (default), a `BackendException`
                 will be raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -196,7 +201,7 @@ def read( # noqa: PLR0913
                 `ignore_errors` is set to `False`.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_bytes(
7 changes: 6 additions & 1 deletion src/ralph/backends/data/swift.py
@@ -179,6 +179,7 @@ def read( # noqa: PLR0913
         chunk_size: Optional[int] = None,
         raw_output: bool = False,
         ignore_errors: bool = False,
+        greedy: bool = False,
         max_statements: Optional[int] = None,
     ) -> Union[Iterator[bytes], Iterator[dict]]:
         """Read objects matching the `query` in the `target` container and yield them.
@@ -198,6 +199,10 @@ def read( # noqa: PLR0913
             ignore_errors (bool): If `True`, errors during the read operation
                 are ignored and logged. If `False` (default), a `BackendException`
                 is raised if an error occurs.
+            greedy (bool): If `True`, the client fetches all available records
+                before they are yielded by the generator. Caution:
+                this may lead to a large number of API calls and to
+                high memory usage.
             max_statements (int): The maximum number of statements to yield.

         Yields:
@@ -210,7 +215,7 @@ def read( # noqa: PLR0913
             BackendParameterException: If a backend argument value is not valid.
         """
         yield from super().read(
-            query, target, chunk_size, raw_output, ignore_errors, max_statements
+            query, target, chunk_size, raw_output, ignore_errors, greedy, max_statements
         )

     def _read_bytes(