diff --git a/threads.ipynb b/threads.ipynb index 47c723b..247fe9c 100644 --- a/threads.ipynb +++ b/threads.ipynb @@ -15,7 +15,7 @@ " - [Parallelism and concurrency in Python](#Parallelism-and-concurrency-in-Python)\n", " - [References](#References)\n", " - [Introduction](#Introduction)\n", - " - [Parallelism Vs. concurrency](#Parallelism-Vs.-concurrency)\n", + " - [Parallelism vs. concurrency](#Parallelism-vs.-concurrency)\n", " - [Parallelism](#Parallelism)\n", " - [Concurrency](#Concurrency)\n", " - [Quiz: parallel or not](#Quiz:-parallel-or-not)\n", @@ -28,7 +28,7 @@ " - [Threads, GIL and the illusion of concurrency](#Threads,-GIL-and-the-illusion-of-concurrency)\n", " - [Threads vs processes](#Threads-vs-processes)\n", " - [When to use threads](#When-to-use-threads)\n", - " - [Asynchronous programming and couroutines: cooperative multitasking](#Asynchronous-programming-and-couroutines:-cooperative-multitasking)\n", + " - [Asynchronous programming and coroutines: cooperative multitasking](#Asynchronous-programming-and-coroutines:-cooperative-multitasking)\n", " - [Exercises](#Exercises)\n", " - [Exercise 1: Counting words in a file🌶️🌶️](#Exercise-1:-Counting-words-in-a-file🌶️🌶️)\n", " - [Exercise 2: Find super secret server key🌶️🌶️🌶️](#Exercise-2:-Find-super-secret-server-key🌶️🌶️🌶️)" @@ -958,15 +958,6 @@ "## Exercises" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext tutorial.tests.testsuite" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -976,7 +967,7 @@ "### Exercise 1: Counting words in a file🌶️🌶️\n", "\n", "Write a **parallel** function `letter_statistics` that returns the statistics of letter counts in the large file `input_file`.\n", - "This means that the function should return a **sorted** `Dict[str, int]` containing the counts for each letter in sorted order.\n", + "This means that the function should return a `dict[str, int]` containing the counts for each letter in 
sorted order.\n", "\n", "
\n", "

Hints

\n", @@ -988,13 +979,19 @@ " To facilitate your work, we pass the size of the file (in number of characters) using the size argument.\n", " \n", "
  • \n", + " The input n_processes determines how many processes your solution should use.\n", + "
  • \n", + "
  • \n", " Using seek you can specify a line offset from the start of the file. Using read(size) you can read size characters only. \n", "
  • \n", "
  • \n", " Write your function in the cell below inside of the solution_exercise1 function. The function receives a Path object input_file as an input and should return a single dict[str, int] dictionary.\n", "
  • \n", "
  • \n", - " Consider using the collections.Counter class to count the number of letters in a string.\n", + " Consider using the collections.Counter class to count the number of letters in a string.\n", + "
  • \n", + "
  • \n", + " In case the test fails with a BrokenProcessPool error, consider moving the definition of your solution into a separate file, importing it in the notebook and calling it from solution_exercise1. \n", "
  • \n", " \n", "
    \n" @@ -1008,7 +1005,7 @@ }, "outputs": [], "source": [ - "%reload_ext tutorial.tests.testsuite" + "%reload_ext tutorial.tests.testsuite\n" ] }, { @@ -1019,15 +1016,13 @@ }, "outputs": [], "source": [ - "%%ipytest\n", - "from pathlib import Path\n", - "from collections import Counter\n", - "from concurrent.futures import ProcessPoolExecutor\n", - "from multiprocess import Process\n", + "%%ipytest \n", + "import pathlib\n", "\n", - "def solution_exercise1(input_file: Path, size: int) -> dict[str, int]:\n", - " \"\"\"Write your solution here\"\"\"\n", - " return {\"a\": 1}" + "def solution_exercise1(input_file: pathlib.Path, size: int, n_processes: int) -> dict[str, int]:\n", + " \"\"\"Write your solution here\"\"\"\n", + " return dict()\n", + "\n" ] }, { @@ -1115,7 +1110,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.1" } }, "nbformat": 4, diff --git a/tutorial/tests/test_threads.py b/tutorial/tests/test_threads.py index 6898d55..a9f9c9b 100644 --- a/tutorial/tests/test_threads.py +++ b/tutorial/tests/test_threads.py @@ -5,9 +5,8 @@ import string from collections import Counter from concurrent.futures import ProcessPoolExecutor -from typing import Awaitable, Callable, Dict +from typing import Awaitable, Callable -import multiprocess import pytest @@ -56,67 +55,79 @@ def inner_file(size: int = 1000): return inner_file -def read_segment(file: pathlib.Path, start: int, end: int) -> str: - with open(file) as f: - f.seek(start) - return f.read(end - start) +def reference_exercise1( + input_file: pathlib.Path, size: int, n_processes: int +) -> dict[str, int]: + def read_segment(file: pathlib.Path, start: int, end: int) -> str: + with open(file) as f: + f.seek(start) + return f.read(end - start) + def segment_stat(segment: str) -> dict[str, int]: + return Counter(segment.strip()) -def segment_stat(segment: str) -> Dict[str, int]: - return Counter(segment.strip()) + def count_words( + file: 
pathlib.Path, size: int, n_processes: int, segment_index: int + ) -> dict[str, int]: + segment_size = size // n_processes + remainder = size % n_processes + start = segment_index * segment_size + min(segment_index, remainder) + end = start + segment_size + (1 if segment_index < remainder else 0) + return segment_stat(read_segment(file, start, end)) - -def count_words( - file: pathlib.Path, size: int, n_processes: int, index: int -) -> Dict[str, int]: - segment_size = size // n_processes - start = index * segment_size - end = start + segment_size - return segment_stat(read_segment(file, start, end)) - - -def reference_exercise1(input_path: pathlib.Path, size: int) -> Dict[str, int]: - workers = multiprocess.cpu_count() - with ProcessPoolExecutor(workers) as executor: + with ProcessPoolExecutor(n_processes) as executor: result = executor.map( - functools.partial(count_words, input_path, size, workers), range(workers) + functools.partial(count_words, input_file, size, n_processes), + range(n_processes), ) return dict(functools.reduce(lambda x, y: x + y, result, Counter())) -@pytest.mark.parametrize("size", [1000, 10000, 100000]) +random_file_sizes = [53, 123, 517, 1000, 10000] + + +@pytest.mark.parametrize( + "size, n_processes", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]] +) def test_exercise1_total_counts( function_to_test: Callable, make_random_file: Callable[[None], pathlib.Path], size: int, + n_processes: int, ): rf = make_random_file(size) - reference_res = reference_exercise1(rf, size) - total_letters = sum(reference_res.values()) - user_res = function_to_test(rf, size) + user_res = function_to_test(rf, size, n_processes) total_letters_user = sum(user_res.values()) - assert total_letters == total_letters_user + assert total_letters_user == size -@pytest.mark.parametrize("size", [1000, 10000, 100000]) +@pytest.mark.parametrize( + "size, workers", [(s, w) for s in random_file_sizes for w in [2, 4, 5, 7]] +) def test_exercise1_counts( 
function_to_test: Callable, make_random_file: Callable[[None], pathlib.Path], size: int, + workers: int, ): rf = make_random_file(size) - reference_res = reference_exercise1(rf, size) - user_res = function_to_test(rf, size) - assert user_res == reference_res + # We read the file and use a counter as a trick. It is not parallel but we are + # sure it is correct + with open(rf) as f: + file_content = f.read() + # reference_res = count_words_parallel(rf, size, workers) + user_res = function_to_test(rf, size, workers) + assert user_res == Counter(file_content) -# #TODO: find a way to test that the user is using multiprocessing (directly or indirectly) +# TODO: find a way to test that the user is using multiprocessing (directly or indirectly) # def test_exercise1_processes(function_to_test: Callable, make_random_file: Callable[[None], pathlib.Path], monkeypatch: pytest.MonkeyPatch): -# with patch.object(multiprocessing.Process, "start") as process_mock: -# size = 1000 -# rf = make_random_file(size) -# user_res = function_to_test(rf, size) -# assert process_mock.mock_calls or +# n_process_mock = MagicMock() +# n_process_mock.return_value = 2 +# size = 1000 +# rf = make_random_file(size) +# user_res = function_to_test(rf, size, n_process_mock) +# assert n_process_mock.called def find_word(letters: list[str], separator: str) -> bool: