docs: move code samples to separate files (#457)
### Description

- Move code samples to separate files (so that they can be refactored, linted, type-checked, etc.)
- Until now this was done only for the guides section; in this PR I extended it to all doc files.

### Issues

- N/A

### Testing

- Docs were rendered locally and work as expected.

### Checklist

- [x] CI passed
vdusek authored Aug 23, 2024
1 parent a7d3d72 commit 1c1ec64
Showing 78 changed files with 1,440 additions and 1,356 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -1,6 +1,6 @@
.PHONY: clean install-dev build publish-to-pypi lint type-check unit-tests unit-tests-cov integration-tests check-code format check-version-conflict check-changelog-entry check-code build-api-reference run-doc

DIRS_WITH_CODE = src tests scripts
DIRS_WITH_CODE = src tests scripts docs

# This is default for local testing, but GitHub workflows override it to a higher value in CI
INTEGRATION_TESTS_CONCURRENCY = 1
118 changes: 0 additions & 118 deletions docs/examples/add-data-to-dataset.mdx

This file was deleted.

40 changes: 40 additions & 0 deletions docs/examples/add_data_to_dataset.mdx
@@ -0,0 +1,40 @@
---
id: add-data-to-dataset
title: Add data to dataset
---

import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import CodeBlock from '@theme/CodeBlock';

import BeautifulSoupExample from '!!raw-loader!./code/add_data_to_dataset_bs.py';
import PlaywrightExample from '!!raw-loader!./code/add_data_to_dataset_pw.py';
import DatasetExample from '!!raw-loader!./code/add_data_to_dataset_dataset.py';

This example demonstrates how to store extracted data into datasets using the <ApiLink to="class/PushDataFunction#open">`context.push_data`</ApiLink> helper function. If the specified dataset does not already exist, it will be created automatically. Additionally, you can save data to custom datasets by providing `dataset_id` or `dataset_name` parameters to the <ApiLink to="class/PushDataFunction#open">`push_data`</ApiLink> function.

<Tabs groupId="main">
<TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
<CodeBlock className="language-python">
{BeautifulSoupExample}
</CodeBlock>
</TabItem>
<TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
<CodeBlock className="language-python">
{PlaywrightExample}
</CodeBlock>
</TabItem>
</Tabs>

Each item in the dataset will be stored in its own file within the following directory:

```text
{PROJECT_FOLDER}/storage/datasets/default/
```

For more control, you can also open a dataset manually using the asynchronous constructor <ApiLink to="class/Dataset#open">`Dataset.open`</ApiLink>:

<CodeBlock className="language-python">
{DatasetExample}
</CodeBlock>
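
For illustration only (this snippet is not part of the diff), a minimal sketch of pushing the same record to both the default dataset and a custom one, relying on the `dataset_name` parameter of `push_data` described in the text above; the name `'my-named-dataset'` is just a placeholder:

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Push to the default dataset (created automatically if it does not exist).
        await context.push_data(data)

        # Push the same record to a custom dataset identified by name (placeholder name).
        await context.push_data(data, dataset_name='my-named-dataset')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```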
15 changes: 15 additions & 0 deletions docs/examples/beautifulsoup_crawler.mdx
@@ -0,0 +1,15 @@
---
id: beautifulsoup-crawler
title: BeautifulSoup crawler
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import BeautifulSoupExample from '!!raw-loader!./code/beautifulsoup_crawler.py';

This example demonstrates how to use <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it: the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup library, you can easily navigate the HTML structure and retrieve the data you need with minimal code.

<CodeBlock className="language-python">
{BeautifulSoupExample}
</CodeBlock>
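
The referenced `code/beautifulsoup_crawler.py` sample is only partially visible later in this diff. As a rough sketch of the approach described above (not the exact contents of the moved file), extracting the title and heading tags might look like this:

```python
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Collect the page title and the text of all <h1>-<h3> tags from the parsed HTML.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'h1s': [h1.text for h1 in context.soup.find_all('h1')],
            'h2s': [h2.text for h2 in context.soup.find_all('h2')],
            'h3s': [h3.text for h3 in context.soup.find_all('h3')],
        }
        await context.push_data(data)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```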
19 changes: 19 additions & 0 deletions docs/examples/capture_screenshot_using_playwright.mdx
@@ -0,0 +1,19 @@
---
id: capture-screenshots-using-playwright
title: Capture screenshots using Playwright
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import CaptureScreenshotExample from '!!raw-loader!./code/capture_screenshot_using_playwright.py';

This example demonstrates how to capture screenshots of web pages using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and store them in the key-value store.

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method.

The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page.

<CodeBlock className="language-python">
{CaptureScreenshotExample}
</CodeBlock>
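
The moved `code/capture_screenshot_using_playwright.py` sample is not shown in full here. Purely as a sketch of the flow described above, taking a screenshot and saving it to the key-value store could look roughly like this; the `headless`/`browser_type` arguments, the `KeyValueStore.set_value` call, and the URL-derived key are illustrative assumptions rather than the file's actual contents:

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import KeyValueStore


async def main() -> None:
    # Headless Chromium is used to visit the pages (illustrative arguments).
    crawler = PlaywrightCrawler(headless=True, browser_type='chromium')

    # Open the default key-value store for storing the screenshots.
    kvs = await KeyValueStore.open()

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # Capture the page as a PNG and derive a store key from the URL.
        screenshot = await context.page.screenshot()
        key = context.request.url.replace('https://', '').replace('/', '-')
        await kvs.set_value(key=key, value=screenshot, content_type='image/png')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```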
35 changes: 35 additions & 0 deletions docs/examples/code/add_data_to_dataset_bs.py
@@ -0,0 +1,35 @@
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'html': str(context.soup)[:1000],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of requests.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
9 changes: 9 additions & 0 deletions docs/examples/code/add_data_to_dataset_dataset.py
@@ -0,0 +1,9 @@
from crawlee.storages import Dataset


async def main() -> None:
    # Open dataset manually using asynchronous constructor open().
    dataset = await Dataset.open()

    # Interact with dataset directly.
    await dataset.push_data({'key': 'value'})
35 changes: 35 additions & 0 deletions docs/examples/code/add_data_to_dataset_pw.py
@@ -0,0 +1,35 @@
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler()

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page.
        data = {
            'url': context.request.url,
            'title': await context.page.title(),
            'html': str(await context.page.content())[:1000],
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of requests.
    await crawler.run(
        [
            'https://crawlee.dev',
            'https://apify.com',
            'https://example.com',
        ]
    )


if __name__ == '__main__':
    asyncio.run(main())
@@ -1,13 +1,3 @@
---
id: beautifulsoup-crawler
title: BeautifulSoup crawler
---

import ApiLink from '@site/src/components/ApiLink';

This example demonstrates how to use <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink> to crawl a list of URLs, load each URL using a plain HTTP request, parse the HTML using the [BeautifulSoup](https://pypi.org/project/beautifulsoup4/) library and extract some data from it - the page title and all `<h1>`, `<h2>` and `<h3>` tags. This setup is perfect for scraping specific elements from web pages. Thanks to the well-known BeautifulSoup, you can easily navigate the HTML structure and retrieve the data you need with minimal code.

```python
import asyncio
from datetime import timedelta

@@ -52,6 +42,6 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
@@ -1,17 +1,3 @@
---
id: capture-screenshots-using-playwright
title: Capture screenshots using Playwright
---

import ApiLink from '@site/src/components/ApiLink';

This example demonstrates how to capture screenshots of web pages using <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> and store them in the key-value store.

The <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is configured to automate the browsing and interaction with web pages. It uses headless Chromium as the browser type to perform these tasks. Each web page specified in the initial list of URLs is visited sequentially, and a screenshot of the page is captured using Playwright's `page.screenshot()` method.

The captured screenshots are stored in the key-value store, which is suitable for managing and storing files in various formats. In this case, screenshots are stored as PNG images with a unique key generated from the URL of the page.

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
@@ -59,4 +45,3 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

if __name__ == '__main__':
    asyncio.run(main())
```
25 changes: 25 additions & 0 deletions docs/examples/code/crawl_all_links_on_website_bs.py
@@ -0,0 +1,25 @@
import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
25 changes: 25 additions & 0 deletions docs/examples/code/crawl_all_links_on_website_pw.py
@@ -0,0 +1,25 @@
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())