diff --git a/threads.ipynb b/threads.ipynb
index 47c723b..247fe9c 100644
--- a/threads.ipynb
+++ b/threads.ipynb
@@ -15,7 +15,7 @@
" - [Parallelism and concurrency in Python](#Parallelism-and-concurrency-in-Python)\n",
" - [References](#References)\n",
" - [Introduction](#Introduction)\n",
- " - [Parallelism Vs. concurrency](#Parallelism-Vs.-concurrency)\n",
+ " - [Parallelism vs. concurrency](#Parallelism-vs.-concurrency)\n",
" - [Parallelism](#Parallelism)\n",
" - [Concurrency](#Concurrency)\n",
" - [Quiz: parallel or not](#Quiz:-parallel-or-not)\n",
@@ -28,7 +28,7 @@
" - [Threads, GIL and the illusion of concurrency](#Threads,-GIL-and-the-illusion-of-concurrency)\n",
" - [Threads vs processes](#Threads-vs-processes)\n",
" - [When to use threads](#When-to-use-threads)\n",
- " - [Asynchronous programming and couroutines: cooperative multitasking](#Asynchronous-programming-and-couroutines:-cooperative-multitasking)\n",
+ " - [Asynchronous programming and coroutines: cooperative multitasking](#Asynchronous-programming-and-coroutines:-cooperative-multitasking)\n",
" - [Exercises](#Exercises)\n",
" - [Exercise 1: Counting words in a file🌶️🌶️](#Exercise-1:-Counting-words-in-a-file🌶️🌶️)\n",
" - [Exercise 2: Find super secret server key🌶️🌶️🌶️](#Exercise-2:-Find-super-secret-server-key🌶️🌶️🌶️)"
@@ -958,15 +958,6 @@
"## Exercises"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%reload_ext tutorial.tests.testsuite"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -976,7 +967,7 @@
"### Exercise 1: Counting words in a file🌶️🌶️\n",
"\n",
"Write a **parallel** function `letter_statistics` that returns the statistics of letter counts in the large file `input_file`.\n",
- "This means that the function should return a **sorted** `Dict[str, int]` containing the counts for each letter in sorted order.\n",
+ "This means that the function should return a `dict[str, int]` containing the counts for each letter in sorted order.\n",
"\n",
"
\n",
"
Hints
\n",
@@ -988,13 +979,19 @@
" To facilitate your work, we pass the size of the file (in number of characters) using the
size
argument.\n",
" \n",
"
\n",
+ " The input n_processes
determines how many processes your solution should use.\n",
+ " \n",
+ "
\n",
" Using seek
you can specify a line offset from the start of the file. Using read(size)
you can read size
characters only. \n",
" \n",
"
\n",
" Write your function in the cell below inside of the solution_exercise1
function. The function receives a Path
object input_file
as an input and should return a single dict[str, int]
dictionary.\n",
" \n",
"
\n",
- " Consider using the collections.Counter
class to count the number of letters in a string.\n",
+ " Consider using the collections.Counter
class to count the number of letters in a string.\n",
+ " \n",
+ "
\n",
+ " In case the test fails with a BrokenProcessPool
error, consider moving the definition of your solution into a separate file, importing it in the notebook and calling it from solution_exercise1
. \n",
" \n",
" \n",
"
\n"
@@ -1008,7 +1005,7 @@
},
"outputs": [],
"source": [
- "%reload_ext tutorial.tests.testsuite"
+ "%reload_ext tutorial.tests.testsuite\n"
]
},
{
@@ -1019,15 +1016,13 @@
},
"outputs": [],
"source": [
- "%%ipytest\n",
- "from pathlib import Path\n",
- "from collections import Counter\n",
- "from concurrent.futures import ProcessPoolExecutor\n",
- "from multiprocess import Process\n",
+ "%%ipytest \n",
+ "import pathlib\n",
"\n",
- "def solution_exercise1(input_file: Path, size: int) -> dict[str, int]:\n",
- " \"\"\"Write your solution here\"\"\"\n",
- " return {\"a\": 1}"
+ "def solution_exercise1(input_file: pathlib.Path, size: int, n_processes: int) -> dict[str, int]:\n",
+ " \"\"\"Write your solution here\"\"\"\n",
+ " return dict()\n",
+ "\n"
]
},
{
@@ -1115,7 +1110,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.13"
+ "version": "3.11.1"
}
},
"nbformat": 4,
diff --git a/tutorial/tests/test_threads.py b/tutorial/tests/test_threads.py
index 6898d55..a9f9c9b 100644
--- a/tutorial/tests/test_threads.py
+++ b/tutorial/tests/test_threads.py
@@ -5,9 +5,8 @@
import string
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
-from typing import Awaitable, Callable, Dict
+from typing import Awaitable, Callable
-import multiprocess
import pytest
@@ -56,67 +55,79 @@ def inner_file(size: int = 1000):
return inner_file
-def read_segment(file: pathlib.Path, start: int, end: int) -> str:
- with open(file) as f:
- f.seek(start)
- return f.read(end - start)
+def reference_exercise1(
+ input_file: pathlib.Path, size: int, n_processes: int
+) -> dict[str, int]:
+ def read_segment(file: pathlib.Path, start: int, end: int) -> str:
+ with open(file) as f:
+ f.seek(start)
+ return f.read(end - start)
+ def segment_stat(segment: str) -> dict[str, int]:
+ return Counter(segment.strip())
-def segment_stat(segment: str) -> Dict[str, int]:
- return Counter(segment.strip())
+ def count_words(
+ file: pathlib.Path, size: int, n_processes: int, segment_index: int
+ ) -> dict[str, int]:
+ segment_size = size // n_processes
+ remainder = size % n_processes
+ start = segment_index * segment_size + min(segment_index, remainder)
+ end = start + segment_size + (1 if segment_index < remainder else 0)
+ return segment_stat(read_segment(file, start, end))
-
-def count_words(
- file: pathlib.Path, size: int, n_processes: int, index: int
-) -> Dict[str, int]:
- segment_size = size // n_processes
- start = index * segment_size
- end = start + segment_size
- return segment_stat(read_segment(file, start, end))
-
-
-def reference_exercise1(input_path: pathlib.Path, size: int) -> Dict[str, int]:
- workers = multiprocess.cpu_count()
- with ProcessPoolExecutor(workers) as executor:
+ with ProcessPoolExecutor(n_processes) as executor:
result = executor.map(
- functools.partial(count_words, input_path, size, workers), range(workers)
+ functools.partial(count_words, input_file, size, n_processes),
+ range(n_processes),
)
return dict(functools.reduce(lambda x, y: x + y, result, Counter()))
-@pytest.mark.parametrize("size", [1000, 10000, 100000])
+random_file_sizes = [53, 123, 517, 1000, 10000]
+
+
+@pytest.mark.parametrize(
+ "size, n_processes", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]]
+)
def test_exercise1_total_counts(
function_to_test: Callable,
make_random_file: Callable[[None], pathlib.Path],
size: int,
+ n_processes: int,
):
rf = make_random_file(size)
- reference_res = reference_exercise1(rf, size)
- total_letters = sum(reference_res.values())
- user_res = function_to_test(rf, size)
+ user_res = function_to_test(rf, size, n_processes)
total_letters_user = sum(user_res.values())
- assert total_letters == total_letters_user
+ assert total_letters_user == size
-@pytest.mark.parametrize("size", [1000, 10000, 100000])
+@pytest.mark.parametrize(
+ "size, workers", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]]
+)
def test_exercise1_counts(
function_to_test: Callable,
make_random_file: Callable[[None], pathlib.Path],
size: int,
+ workers: int,
):
rf = make_random_file(size)
- reference_res = reference_exercise1(rf, size)
- user_res = function_to_test(rf, size)
- assert user_res == reference_res
+    # We read the file and use a counter as a trick. It is not parallel, but we are
+    # sure it is correct.
+ with open(rf) as f:
+ file_content = f.read()
+ # reference_res = count_words_parallel(rf, size, workers)
+ user_res = function_to_test(rf, size, workers)
+ assert user_res == Counter(file_content)
-# #TODO: find a way to test that the user is using multiprocessing (directly or indirectly)
+# TODO: find a way to test that the user is using multiprocessing (directly or indirectly)
# def test_exercise1_processes(function_to_test: Callable, make_random_file: Callable[[None], pathlib.Path], monkeypatch: pytest.MonkeyPatch):
-# with patch.object(multiprocessing.Process, "start") as process_mock:
-# size = 1000
-# rf = make_random_file(size)
-# user_res = function_to_test(rf, size)
-# assert process_mock.mock_calls or
+# n_process_mock = MagicMock()
+# n_process_mock.return_value = 2
+# size = 1000
+# rf = make_random_file(size)
+# user_res = function_to_test(rf, size, n_process_mock)
+# assert n_process_mock.called
def find_word(letters: list[str], separator: str) -> bool: