From 66f525309becc995658bb57cff94a4d56ca9ab44 Mon Sep 17 00:00:00 2001
From: Jan Buchar
Date: Wed, 11 Sep 2024 12:30:03 +0200
Subject: [PATCH] docs: Deployment guide (#482)

Added a doc page about running crawlers on Apify using the SDK. I tested it;
fixes will be required before publishing this.
---
 docs/deployment/apify_platform.mdx           | 252 ++++++++++++++++++
 .../code/apify_platform_get_public_url.py    |   8 +
 docs/deployment/code/apify_platform_main.py  |  32 +++
 docs/introduction/09_running_in_cloud.mdx    | 107 +++++++-
 docs/introduction/code/09_apify_sdk.py       |  25 ++
 docs/introduction/code/__init__.py           |   0
 docs/introduction/code/routes.py             |   4 +
 pyproject.toml                               |   5 +-
 templates/beautifulsoup/.dockerignore        | 155 +++++++++++
 templates/beautifulsoup/.gitignore           | 155 +++++++++++
 templates/beautifulsoup/Dockerfile           |  38 +++
 templates/playwright/.dockerignore           | 155 +++++++++++
 templates/playwright/.gitignore              | 155 +++++++++++
 templates/playwright/Dockerfile              |  40 +++
 website/sidebars.js                          |  37 ++-
 15 files changed, 1143 insertions(+), 25 deletions(-)
 create mode 100644 docs/deployment/apify_platform.mdx
 create mode 100644 docs/deployment/code/apify_platform_get_public_url.py
 create mode 100644 docs/deployment/code/apify_platform_main.py
 create mode 100644 docs/introduction/code/09_apify_sdk.py
 create mode 100644 docs/introduction/code/__init__.py
 create mode 100644 docs/introduction/code/routes.py
 create mode 100644 templates/beautifulsoup/.dockerignore
 create mode 100644 templates/beautifulsoup/.gitignore
 create mode 100644 templates/beautifulsoup/Dockerfile
 create mode 100644 templates/playwright/.dockerignore
 create mode 100644 templates/playwright/.gitignore
 create mode 100644 templates/playwright/Dockerfile

diff --git a/docs/deployment/apify_platform.mdx b/docs/deployment/apify_platform.mdx
new file mode 100644
index 000000000..ad76df79d
--- /dev/null
+++ b/docs/deployment/apify_platform.mdx
@@ -0,0 +1,252 @@
+---
+id: apify-platform
+title: Apify platform
+description: Apify platform - large-scale and high-performance web scraping
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+import CodeBlock from '@theme/CodeBlock';
+
+import MainSource from '!!raw-loader!./code/apify_platform_main.py';
+import GetPublicUrlSource from '!!raw-loader!./code/apify_platform_get_public_url.py';
+
+Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), convenient request and result storages, [proxies](../guides/proxy-management), scheduling, webhooks and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) or an [API](https://docs.apify.com/api).
+
+While we think that the Apify platform is super cool, and it's definitely worth signing up for a [free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, runnable locally or on any cloud infrastructure.
+
+:::note
+
+We do not test Crawlee in other cloud environments such as Lambda or on specific architectures such as Raspberry Pi. We strive to make it work, but there are no guarantees.
+
+:::
+
+## Logging into Apify platform from Crawlee
+
+To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide credentials - your [API token](https://console.apify.com/account?tab=integrations).
+You can do that either by utilizing the [Apify CLI](https://github.com/apify/apify-cli) or with environment variables.
+
+Once you provide credentials to your Apify CLI installation, you will be able to use all the Apify platform features, such as calling Actors, saving to cloud storages, using Apify proxies, setting up webhooks and so on.
+
+### Log in with CLI
+
+The Apify CLI allows you to log in to your Apify account on your computer. If you then run your crawler using the CLI, your credentials will be added automatically.
+
+```bash
+npm install -g apify-cli
+apify login -t YOUR_API_TOKEN
+```
+
+### Log in with environment variables
+
+Alternatively, you can always provide credentials to your Actor by setting the [`APIFY_TOKEN`](#apify_token) environment variable to your API token.
+
+> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password)
+> environment variable. The Actor automatically infers it from your token, but it can be useful
+> when you need to access proxies from a different account than the one your token represents.
+
+### Log in with Configuration
+
+Another option is to use the [`Configuration`](https://docs.apify.com/sdk/python/reference/class/Configuration) instance and set your API token there.
+
+```python
+from apify import Actor, Configuration
+
+actor = Actor(Configuration(token='your_apify_token'))
+```
+
+## What is an Actor
+
+When you deploy your script to the Apify platform, it becomes an [Actor](https://apify.com/actors). An Actor is a serverless microservice that accepts an input and produces an output. It can run for a few seconds, hours or even infinitely. An Actor can perform anything from a simple action such as filling out a web form or sending an email, to complex operations such as crawling an entire website and removing duplicates from a large dataset.
+
+Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. But don't worry, if you share your Actor in the store and somebody uses it, it runs under their account, not yours.
+
+**Related links**
+
+- [Store of existing Actors](https://apify.com/store)
+- [Documentation](https://docs.apify.com/actors)
+- [View Actors in Apify Console](https://console.apify.com/actors)
+- [API reference](https://apify.com/docs/api/v2#/reference/actors)
+
+## Running an Actor locally
+
+First, let's create a boilerplate for the new Actor. You can use the Apify CLI and just run:
+
+```bash
+apify create my-hello-world
+```
+
+The CLI will prompt you to select a project boilerplate template - let's pick "Crawlee + BeautifulSoup". The tool will create a directory called `my-hello-world` with Python project files. You can run the Actor as follows:
+
+```bash
+cd my-hello-world
+apify run
+```
+
+## Running Crawlee code as an Actor
+
+To run Crawlee code as an Actor on the [Apify platform](https://apify.com/actors), you need to wrap the body of your crawler's main function with `async with Actor`.
+
+:::info NOTE
+Adding `async with Actor` is all that is needed to run your code on the Apify platform as an Actor. The context manager initializes your Actor (e.g. sets the correct storage implementation) and correctly handles exiting the process.
+:::
+
+Let's look at the `BeautifulSoupCrawler` example from the [Quick Start](../quick-start) guide:
+
+<CodeBlock className="language-python">
+    {MainSource}
+</CodeBlock>
+
+Note that you can also run your Actor (that uses Crawlee) locally with the Apify CLI. You can start it with the following command in your project folder:
+
+```bash
+apify run
+```
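+
+The wrapper also gives you access to the Actor input. As a purely illustrative sketch (not part of the original example), you could read a hypothetical `start_urls` input field inside the same `async with Actor` block and use it to seed the crawl:
+
+```python
+import asyncio
+
+from apify import Actor
+
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
+async def main() -> None:
+    async with Actor:
+        # Read the Actor input - on the platform it comes from the run configuration,
+        # locally it is read from the default key-value store. The 'start_urls' field
+        # and the fallback URL are assumptions made only for this sketch.
+        actor_input = await Actor.get_input() or {}
+        start_urls = actor_input.get('start_urls', ['https://www.iana.org/'])
+
+        crawler = BeautifulSoupCrawler()
+
+        @crawler.router.default_handler
+        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+            # Save the visited URL to the default dataset.
+            await context.push_data({'url': context.request.url})
+
+        await crawler.run(start_urls)
+
+
+asyncio.run(main())
+```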
+
+## Deploying an Actor to Apify platform
+
+Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running:
+
+```bash
+apify push
+```
+
+Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, see the
+[Apify CLI](https://docs.apify.com/cli) documentation.
+
+## Usage on Apify platform
+
+You can also develop your Actor in an online code editor directly on the platform (you'll need an Apify account). Let's go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new* and then go to the *Source* tab and start writing the code or paste one of the examples from the [Examples](../examples) section.
+
+## Storages
+
+There are several storage-related details worth mentioning here.
+
+### Helper functions for default Key-Value Store and Dataset
+
+To simplify access to the _default_ storages, you can use the following methods instead of the helper functions of the respective storage classes:
+
+- [`Actor.set_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#set_value), [`Actor.get_value()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_value), [`Actor.get_input()`](https://docs.apify.com/sdk/python/reference/class/Actor#get_input) for the [`Key-Value Store`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore)
+- [`Actor.push_data()`](https://docs.apify.com/sdk/python/reference/class/Actor#push_data) for the [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)
+
+### Using platform storage in a local Actor
+
+When you plan to use the platform storage while developing and running your Actor locally, you should use [`Actor.open_key_value_store()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_key_value_store), [`Actor.open_dataset()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_dataset) and [`Actor.open_request_queue()`](https://docs.apify.com/sdk/python/reference/class/Actor#open_request_queue) to open the respective storage.
+
+Each of these methods allows you to pass the `force_cloud` keyword argument. If it is set to `True`, cloud storage will be used instead of the folder on the local disk.
+
+:::note
+If you don't plan to force usage of the platform storages when running the Actor locally, there is no need to use the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) class for it. The Crawlee variants `KeyValueStore.open()`, `Dataset.open()` and `RequestQueue.open()` will work the same.
+:::
+
+{/*
+### Getting the public URL of an item in the platform storage
+
+If you need to share a link to a file stored in a [Key-Value Store](https://docs.apify.com/sdk/python/reference/class/KeyValueStore) on the Apify Platform, you can use the [`get_public_url()`](https://docs.apify.com/sdk/python/reference/class/KeyValueStore#get_public_url) method. It accepts only one parameter: `key` - the key of the item you want to share.
+
+<CodeBlock className="language-python">
+    {GetPublicUrlSource}
+</CodeBlock>
+
+*/}
+
+### Exporting dataset data
+
+When the `Dataset` is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the Actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results.
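+
+For example, a minimal sketch of downloading the items over that API endpoint (assuming the `httpx` HTTP client and placeholder values for the dataset ID and token) could look like this:
+
+```python
+import httpx
+
+# Placeholder values - use the dataset ID from your Actor run and your own API token.
+DATASET_ID = 'YOUR_DATASET_ID'
+APIFY_TOKEN = 'YOUR_APIFY_TOKEN'
+
+# Fetch the dataset items in CSV format from the Apify API.
+response = httpx.get(
+    f'https://api.apify.com/v2/datasets/{DATASET_ID}/items',
+    params={'format': 'csv'},
+    headers={'Authorization': f'Bearer {APIFY_TOKEN}'},
+)
+response.raise_for_status()
+print(response.text)
+```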
+
+**Related links**
+
+- [Apify platform storage documentation](https://docs.apify.com/storage)
+- [View storage in Apify Console](https://console.apify.com/storage)
+- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores)
+- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets)
+- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues)
+
+## Environment variables
+
+The following describes select environment variables set by the Apify platform. For a complete list, see the [Environment variables](https://docs.apify.com/platform/actors/development/programming-interface/environment-variables) section in the Apify platform documentation.
+
+:::note
+
+You don't need to replace `CRAWLEE_` environment variables with their `APIFY_` equivalents - Crawlee understands the `APIFY_` environment variables as well.
+
+:::
+
+### `APIFY_TOKEN`
+
+The API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage
+or to run an Actor on the Apify platform. You can find your API token on the
+[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page.
+
+### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`
+
+By combining the env vars in various ways, you can greatly influence the Actor's behavior.
+
+| Env Vars                                 | API | Storages         |
+| ---------------------------------------- | --- | ---------------- |
+| none OR `CRAWLEE_STORAGE_DIR`            | no  | local            |
+| `APIFY_TOKEN`                            | yes | Apify platform   |
+| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR`  | yes | local + platform |
+
+When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform
+features and your data will be stored locally by default. If you want to access platform storages,
+you can use the `force_cloud=True` option in their respective functions.
+
+### `APIFY_PROXY_PASSWORD`
+
+Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation.
+Assuming you have already created an Apify account, you can find the password on the [Proxy page](https://console.apify.com/proxy)
+in the Apify Console. The password is automatically inferred using the `APIFY_TOKEN` env var,
+so in most cases, you don't need to touch it. You should use it when, for some reason,
+you need access to Apify Proxy but not to the Apify API, or when you need to access the
+proxy from a different account than the one your token represents.
+
+## Proxy management
+
+In addition to your own proxy servers and proxy servers acquired from
+third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy)
+for your scraping needs.
+
+### Apify Proxy
+
+If you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage, you first need to be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account).
+
+```python
+from apify import Actor
+
+proxy_configuration = await Actor.create_proxy_configuration()
+proxy_url = await proxy_configuration.new_url()
+```
+
+Note that unlike when using your own proxies in Crawlee, you shouldn't use the constructor to create `ProxyConfiguration` instances. To use Apify Proxy, create an instance with the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function instead.
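+
+As an illustration of how this plugs into a crawler (this sketch is not part of the original guide - the crawler class and target URL are only examples), you can pass the resulting configuration to a Crawlee crawler directly:
+
+```python
+import asyncio
+
+from apify import Actor
+
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
+
+
+async def main() -> None:
+    async with Actor:
+        # Create a proxy configuration backed by Apify Proxy and hand it to the crawler,
+        # so that every request is routed through the proxy.
+        proxy_configuration = await Actor.create_proxy_configuration()
+        crawler = BeautifulSoupCrawler(proxy_configuration=proxy_configuration)
+
+        await crawler.run(['https://www.iana.org/'])
+
+
+asyncio.run(main())
+```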
+
+### Apify Proxy Configuration
+
+With Apify Proxy, you can select specific proxy groups to use, or countries to connect from.
+This allows you to get better proxy performance after some initial research.
+
+```python
+from apify import Actor
+
+proxy_configuration = await Actor.create_proxy_configuration(
+    groups=['RESIDENTIAL'],
+    country_code='US',
+)
+proxy_url = await proxy_configuration.new_url()
+```
+
+Now your crawlers will use only residential proxies from the US. Note that you must first get access
+to a proxy group before you are able to use it. You can check the proxy groups available to you
+in the [proxy dashboard](https://console.apify.com/proxy).
+
+### Apify Proxy vs. Own proxies
+
+The [`ProxyConfiguration`](https://docs.apify.com/sdk/python/reference/class/ProxyConfiguration) class covers both Apify Proxy and custom proxy URLs so that you can easily switch between proxy providers. However, some features of the class are available only to Apify Proxy users, mainly because Apify Proxy is what one would call a super-proxy. It's not a single proxy server, but an API endpoint that allows connection through millions of different IP addresses. So the class essentially has two modes: Apify Proxy or Own (third party) proxy.
+
+The difference is easy to remember.
+
+- If you're using your own proxies, create a `ProxyConfiguration` instance directly.
+- If you are planning to use Apify Proxy, create an instance using the [`Actor.create_proxy_configuration()`](https://docs.apify.com/sdk/python/reference/class/Actor#create_proxy_configuration) function. The `new_url_function` parameter enables the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy.
+
+**Related links**
+
+- [Apify Proxy docs](https://docs.apify.com/proxy)
diff --git a/docs/deployment/code/apify_platform_get_public_url.py b/docs/deployment/code/apify_platform_get_public_url.py
new file mode 100644
index 000000000..8a4b79f9b
--- /dev/null
+++ b/docs/deployment/code/apify_platform_get_public_url.py
@@ -0,0 +1,8 @@
+from apify import Actor
+
+
+async def main() -> None:
+    store = await Actor.open_key_value_store()
+    await store.set_value('your-file', {'foo': 'bar'})
+    # url = store.get_public_url('your-file')  # noqa: ERA001
+    # https://api.apify.com/v2/key-value-stores//records/your-file
diff --git a/docs/deployment/code/apify_platform_main.py b/docs/deployment/code/apify_platform_main.py
new file mode 100644
index 000000000..911bee8c3
--- /dev/null
+++ b/docs/deployment/code/apify_platform_main.py
@@ -0,0 +1,32 @@
+import asyncio
+
+from apify import Actor
+
+from crawlee import Glob
+from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
+
+
+async def main() -> None:
+    async with Actor:
+        crawler = BeautifulSoupCrawler()
+
+        @crawler.router.default_handler
+        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+            url = context.request.url
+
+            # Extract the HTML title of the page.
+            title_element = context.soup.find('title')
+            title = title_element.text if title_element else ''
+            context.log.info(f'Title of {url}: {title}')
+
+            # Add URLs that match the provided pattern.
+            await context.enqueue_links(include=[Glob('https://www.iana.org/*')])
+
+            # Save the extracted data to the dataset.
+            await context.push_data({'url': url, 'title': title})
+
+        # Enqueue the initial request and run the crawler.
+        await crawler.run(['https://www.iana.org/'])
+
+
+asyncio.run(main())
diff --git a/docs/introduction/09_running_in_cloud.mdx b/docs/introduction/09_running_in_cloud.mdx
index e4a35593a..75996058e 100644
--- a/docs/introduction/09_running_in_cloud.mdx
+++ b/docs/introduction/09_running_in_cloud.mdx
@@ -1,9 +1,106 @@
 ---
-id: running-in-cloud
-title: Running in cloud
+id: deployment
+title: Running your crawler in the Cloud
+sidebar_label: Running in the Cloud
+description: Deploying Crawlee-python projects to the Apify Platform
 ---
 
-{/* TODO:
+import CodeBlock from '@theme/CodeBlock';
+import MainExample from '!!raw-loader!./code/09_apify_sdk.py';
 
-write this page once sdk v2 is ready
-*/}
+## Apify Platform
+
+Crawlee is developed by [**Apify**](https://apify.com), the web scraping and automation platform. You could say it is the **home of Crawlee projects**. In this section you'll see how to deploy the crawler there with just a few simple steps. You can deploy a **Crawlee** project wherever you want, but using the [**Apify Platform**](https://console.apify.com) will give you the best experience.
+
+{/*In case you want to deploy your Crawlee project to other platforms, check out the [**Deployment**](../deployment) section.*/}
+
+With a few simple steps, you can convert your Crawlee project into a so-called **Actor**. Actors are serverless micro-apps that are easy to develop, run, share, and integrate. The infrastructure, proxies, and storages are ready to go. [Learn more about Actors](https://apify.com/actors).
+
+{/*:::info Choosing between Crawlee CLI and Apify CLI for project setup
+
+We started this guide by using the Crawlee CLI to bootstrap the project - it offers the basic Crawlee templates, including a ready-made `Dockerfile`. If you know you will be deploying your project to the Apify Platform, you might want to start with the Apify CLI instead. It also offers several project templates, and those are all set up to be used on the Apify Platform right away.
+
+:::*/}
+
+## Dependencies
+
+The first step will be installing two new dependencies:
+
+- Apify SDK, a toolkit for working with the Apify Platform. This will allow us to wire the storages (e.g. the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue) and [`Dataset`](https://docs.apify.com/sdk/python/reference/class/Dataset)) to the Apify cloud products. This will be a dependency of our project.
+
+  ```bash
+  poetry add apify
+  ```
+
+  Alternatively, if you don't use `poetry` to manage your project, you may just install the SDK with `pip`:
+
+  ```bash
+  pip install apify
+  ```
+
+- Apify CLI, a command-line tool that will help us with authentication and deployment. This will be a globally installed tool; you will install it only once and use it in all your Crawlee/Apify projects.
+
+  ```bash
+  npm install -g apify-cli
+  ```
+
+## Logging in to the Apify Platform
+
+The next step will be [creating your Apify account](https://console.apify.com/sign-up). Don't worry, we have a **free tier**, so you can try things out before you buy in! Once you have that, it's time to log in with the just-installed [Apify CLI](https://docs.apify.com/cli/). You will need your personal access token, which you can find at https://console.apify.com/account#/integrations.
+
+```bash
+apify login
+```
+
+## Adjusting the code
+
+Now that you have your account set up, you will need to adjust the code a tiny bit.
+We will use the [Apify SDK](https://docs.apify.com/sdk/python/), which will help us wire the Crawlee storages (like the [`RequestQueue`](https://docs.apify.com/sdk/python/reference/class/RequestQueue)) to their Apify Platform counterparts - otherwise Crawlee would keep things only in memory.
+
+Open your `src/main.py` file, and wrap everything in your `main` function with the [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager. Your code should look like this:
+
+<CodeBlock className="language-python">
+    {MainExample}
+</CodeBlock>
+
+The context manager will configure Crawlee to use the Apify API instead of its default memory storage interface. It also sets up a few other things, like listening to the platform events via websockets. After the body is finished, it handles graceful shutdown.
+
+:::info Understanding `async with Actor` behavior with environment variables
+
+The [`Actor`](https://docs.apify.com/sdk/python/reference/class/Actor) context manager works conditionally based on environment variables, namely the `APIFY_IS_AT_HOME` env var, which is set to `true` on the Apify Platform. This means that your project will keep working the same way locally, but will use the Apify API when deployed to the Apify Platform.
+
+:::
+
+## Initializing the project
+
+You will also need to initialize the project for Apify. To do that, use the Apify CLI again:
+
+```bash
+apify init
+```
+
+This will create a folder called `.actor`, and an `actor.json` file inside it - this file contains the configuration relevant to the Apify Platform, namely the Actor name, version, build tag, and a few other things. Check out the [relevant documentation](https://docs.apify.com/platform/actors/development/actor-definition/actor-json) to see everything you can set up there.
+
+## Ship it!
+
+And that's all! Your project is now ready to be published on the Apify Platform. You can use the Apify CLI once more to do that:
+
+```bash
+apify push
+```
+
+This command will create an archive from your project, upload it to the Apify Platform and initiate a Docker build. Once finished, you will get a link to your new Actor on the platform.
+
+## Learning more about web scraping
+
+:::tip Explore Apify Academy Resources
+
+If you want to learn more about web scraping and browser automation, check out the [Apify Academy](https://developers.apify.com/academy). It's full of courses and tutorials on the topic, from beginner to advanced. And the best thing: **It's free and open source** ❤️
+
+{/*If you want to do one more project, check out our tutorial on building a [HackerNews scraper using Crawlee](https://blog.apify.com/crawlee-web-scraping-tutorial/).*/}
+
+:::
+
+## Thank you! 🎉
+
+That's it! Thanks for reading the whole introduction, and if there's anything wrong, please 🙏 let us know on [GitHub](https://github.com/apify/crawlee-python) or in our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 👋
diff --git a/docs/introduction/code/09_apify_sdk.py b/docs/introduction/code/09_apify_sdk.py
new file mode 100644
index 000000000..266af3989
--- /dev/null
+++ b/docs/introduction/code/09_apify_sdk.py
@@ -0,0 +1,25 @@
+import asyncio
+
+# highlight-next-line
+from apify import Actor
+
+from crawlee.playwright_crawler import PlaywrightCrawler
+
+from .routes import router
+
+
+async def main() -> None:
+    # highlight-next-line
+    async with Actor:
+        crawler = PlaywrightCrawler(
+            # Let's limit our crawls to make our tests shorter and safer.
+ max_requests_per_crawl=50, + # Provide our router instance to the crawler. + request_handler=router, + ) + + await crawler.run(['https://warehouse-theme-metal.myshopify.com/collections']) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/introduction/code/__init__.py b/docs/introduction/code/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/docs/introduction/code/routes.py b/docs/introduction/code/routes.py new file mode 100644 index 000000000..cec3d3547 --- /dev/null +++ b/docs/introduction/code/routes.py @@ -0,0 +1,4 @@ +from crawlee.playwright_crawler import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() diff --git a/pyproject.toml b/pyproject.toml index 8453383c7..05e83dfdf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ keywords = [ # https://github.com/apify/apify-sdk-python/pull/154. [tool.poetry.dependencies] python = "^3.9" +apify = { version = ">=2.0.0", optional = true } beautifulsoup4 = { version = ">=4.12.0", optional = true } colorama = ">=0.4.0" cookiecutter = ">=2.6.0" @@ -91,7 +92,8 @@ types-psutil = "~5.9.5.20240205" types-python-dateutil = "~2.9.0.20240316" [tool.poetry.extras] -all = ["beautifulsoup4", "lxml", "html5lib", "curl-cffi", "playwright"] +all = ["apify", "beautifulsoup4", "lxml", "html5lib", "curl-cffi", "playwright"] +apify = ["apify"] beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"] curl-impersonate = ["curl-cffi"] playwright = ["playwright"] @@ -164,6 +166,7 @@ indent-style = "space" "D", # Everything from the pydocstyle "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py "F841", # Local variable {variable} is assigned to but never used + "N999", # Invalid module name ] [tool.ruff.lint.flake8-quotes] diff --git a/templates/beautifulsoup/.dockerignore b/templates/beautifulsoup/.dockerignore new file mode 100644 index 000000000..6eb49d35e --- /dev/null +++ b/templates/beautifulsoup/.dockerignore @@ -0,0 +1,155 @@ +.git +.mise.toml +.nvim.lua +storage + +# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/templates/beautifulsoup/.gitignore b/templates/beautifulsoup/.gitignore new file mode 100644 index 000000000..6eb49d35e --- /dev/null +++ b/templates/beautifulsoup/.gitignore @@ -0,0 +1,155 @@ +.git +.mise.toml +.nvim.lua +storage + +# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/templates/beautifulsoup/Dockerfile b/templates/beautifulsoup/Dockerfile new file mode 100644 index 000000000..7443c1bdd --- /dev/null +++ b/templates/beautifulsoup/Dockerfile @@ -0,0 +1,38 @@ +# First, specify the base Docker image. +# You can see the Docker images from Apify at https://hub.docker.com/r/apify/. +# You can also use any other image from Docker Hub. +FROM apify/actor-python:3.12 + +RUN apt install -yq git && rm -rf /var/lib/apt/lists/* + +RUN pip install -U pip setuptools \ + && pip install poetry \ + && poetry self add poetry-plugin-export + +# Second, copy just poetry.lock and pyproject.toml into the Actor image, +# since those should be the only files that affects the dependency install in the next step, +# in order to speed up the build +COPY pyproject.toml ./ +COPY poetry.lock ./ + +# Install the dependencies +RUN echo "Python version:" \ + && python --version \ + && echo "Installing dependencies:" \ + # Export packages from poetry.lock + && poetry export -f requirements.txt --without-hashes | \ + # Install everything using pip (ignore dependency checks - the lockfile is correct, period) + pip install -r /dev/stdin --no-dependencies \ + && echo "All installed Python packages:" \ + && pip freeze + +# Next, copy the remaining files and directories with the source code. +# Since we do this after installing the dependencies, quick build will be really fast +# for most source file changes. +COPY . 
./ + +# Use compileall to ensure the runnability of the Actor Python code. +RUN python -m compileall -q . + +# Specify how to launch the source code of your Actor. +CMD ["python", "-m", "{{ cookiecutter.project_name }}"] diff --git a/templates/playwright/.dockerignore b/templates/playwright/.dockerignore new file mode 100644 index 000000000..6eb49d35e --- /dev/null +++ b/templates/playwright/.dockerignore @@ -0,0 +1,155 @@ +.git +.mise.toml +.nvim.lua +storage + +# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+.idea/ diff --git a/templates/playwright/.gitignore b/templates/playwright/.gitignore new file mode 100644 index 000000000..6eb49d35e --- /dev/null +++ b/templates/playwright/.gitignore @@ -0,0 +1,155 @@ +.git +.mise.toml +.nvim.lua +storage + +# The rest is copied from https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/templates/playwright/Dockerfile b/templates/playwright/Dockerfile new file mode 100644 index 000000000..3c8eb1272 --- /dev/null +++ b/templates/playwright/Dockerfile @@ -0,0 +1,40 @@ +# First, specify the base Docker image. +# You can see the Docker images from Apify at https://hub.docker.com/r/apify/. +# You can also use any other image from Docker Hub. 
+FROM apify/actor-python-playwright:3.12 + +RUN apt install -yq git && rm -rf /var/lib/apt/lists/* + +RUN pip install -U pip setuptools \ + && pip install poetry \ + && poetry self add poetry-plugin-export + +# Second, copy just poetry.lock and pyproject.toml into the Actor image, +# since those should be the only files that affects the dependency install in the next step, +# in order to speed up the build +COPY pyproject.toml ./ +COPY poetry.lock ./ + +# Install the dependencies +RUN echo "Python version:" \ + && python --version \ + && echo "Installing dependencies:" \ + # Export packages from poetry.lock + && poetry export -f requirements.txt --without-hashes | \ + # Replace playwright version so that it matches whatever is pre-installed in the image + sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \ + # Install everything using pip (ignore dependency checks - the lockfile is correct, period) + pip install -r /dev/stdin --no-dependencies \ + && echo "All installed Python packages:" \ + && pip freeze + +# Next, copy the remaining files and directories with the source code. +# Since we do this after installing the dependencies, quick build will be really fast +# for most source file changes. +COPY . ./ + +# Use compileall to ensure the runnability of the Actor Python code. +RUN python -m compileall -q . + +# Specify how to launch the source code of your Actor. +CMD ["python", "-m", "{{ cookiecutter.project_name }}"] diff --git a/website/sidebars.js b/website/sidebars.js index 438e6a65a..deae2d492 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -18,8 +18,7 @@ module.exports = { 'introduction/scraping', 'introduction/saving-data', 'introduction/refactoring', - // TODO: add once SDK v2 is released - // 'introduction/running-in-cloud', + 'introduction/deployment', ], }, { @@ -38,21 +37,21 @@ module.exports = { }, ], }, - // { - // type: 'category', - // label: 'Deployment', - // link: { - // type: 'generated-index', - // title: 'Deployment guides', - // description: 'Here you can find guides on how to deploy your crawlers to various cloud providers.', - // slug: '/deployment', - // }, - // items: [ - // { - // type: 'doc', - // id: 'deployment/apify-platform', - // label: 'Deploy on Apify', - // }, + { + type: 'category', + label: 'Deployment', + link: { + type: 'generated-index', + title: 'Deployment guides', + description: 'Here you can find guides on how to deploy your crawlers to various cloud providers.', + slug: '/deployment', + }, + items: [ + { + type: 'doc', + id: 'deployment/apify-platform', + label: 'Deploy on Apify', + }, // { // type: 'category', // label: 'Deploy on AWS', @@ -69,8 +68,8 @@ module.exports = { // 'deployment/gcp-browsers', // ], // }, - // ], - // }, + ], + }, { type: 'category', label: 'Examples',