From a37986ccf5c1b1c4f835ab81a5354bb6df77d45b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Tue, 19 Sep 2023 14:27:54 +0200 Subject: [PATCH] docs: fix the deployment guides paths --- docs/deployment/apify_platform.mdx | 305 ++++++++++++++++++++ docs/deployment/apify_platform_init_exit.ts | 27 ++ docs/deployment/apify_platform_main.ts | 25 ++ docs/deployment/aws-browsers.md | 123 ++++++++ docs/deployment/aws-cheerio.md | 123 ++++++++ docs/deployment/gcp-browsers.md | 85 ++++++ docs/deployment/gcp-cheerio.md | 81 ++++++ 7 files changed, 769 insertions(+) create mode 100644 docs/deployment/apify_platform.mdx create mode 100644 docs/deployment/apify_platform_init_exit.ts create mode 100644 docs/deployment/apify_platform_main.ts create mode 100644 docs/deployment/aws-browsers.md create mode 100644 docs/deployment/aws-cheerio.md create mode 100644 docs/deployment/gcp-browsers.md create mode 100644 docs/deployment/gcp-cheerio.md diff --git a/docs/deployment/apify_platform.mdx b/docs/deployment/apify_platform.mdx new file mode 100644 index 000000000000..39904d54130d --- /dev/null +++ b/docs/deployment/apify_platform.mdx @@ -0,0 +1,305 @@ +--- +id: apify-platform +title: Apify Platform +description: Apify platform - large-scale and high-performance web scraping +--- + +import ApiLink from '@site/src/components/ApiLink'; + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import CodeBlock from '@theme/CodeBlock'; + +import MainSource from '!!raw-loader!./apify_platform_main.ts'; +import InitExitSource from '!!raw-loader!./apify_platform_init_exit.ts'; + +Apify is a [platform](https://apify.com) built to serve large-scale and high-performance web scraping +and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), +convenient [request](../guides/request-storage) and [result](../guides/result-storage) storages, [proxies](../guides/proxy-management), +[scheduling](https://docs.apify.com/scheduler), [webhooks](https://docs.apify.com/webhooks) +and [more](https://docs.apify.com/), accessible through a [web interface](https://console.apify.com) +or an [API](https://docs.apify.com/api). + +While we think that the Apify platform is super cool, and it's definitely worth signing up for a +[free account](https://console.apify.com/sign-up), **Crawlee is and will always be open source**, +runnable locally or on any cloud infrastructure. + +:::note + +We do not test Crawlee in other cloud environments such as Lambda or on specific +architectures such as Raspberry PI. We strive to make it work, but there are no guarantees. + +::: + +## Logging into Apify platform from Crawlee + +To access your [Apify account](https://console.apify.com/sign-up) from Crawlee, you must provide +credentials - your [API token](https://console.apify.com/account?tab=integrations). You can do that +either by utilizing [Apify CLI](https://github.com/apify/apify-cli) or with environment +variables. + +Once you provide credentials to your scraper, you will be able to use all the Apify platform +features, such as calling actors, saving to cloud storages, using Apify proxies, +setting up webhooks and so on. + +### Log in with CLI + +Apify CLI allows you to log in to your Apify account on your computer. If you then run your +scraper using the CLI, your credentials will automatically be added. 
+ +```bash +npm install -g apify-cli +apify login -t YOUR_API_TOKEN +``` + +### Log in with environment variables + +Alternatively, you can always provide credentials to your scraper +by setting the [`APIFY_TOKEN`](#apify_token) environment +variable to your API token. + +> There's also the [`APIFY_PROXY_PASSWORD`](#apify_proxy_password) +> environment variable. Actor automatically infers that from your token, but it can be useful +> when you need to access proxies from a different account than your token represents. + +### Log in with Configuration + +Another option is to use the [`Configuration`](https://apify.github.io/apify-sdk-js/api/apify/class/Configuration) instance and set your api token there. + +```javascript +import { Actor } from 'apify'; + +const sdk = new Actor({ token: 'your_api_token' }); +``` + +## What is an actor + +When you deploy your script to the Apify platform, it becomes an [actor](https://apify.com/actors). +An actor is a serverless microservice that accepts an input and produces an output. It can run for +a few seconds, hours or even infinitely. An actor can perform anything from a simple action such +as filling out a web form or sending an email, to complex operations such as crawling an entire website +and removing duplicates from a large dataset. + +Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. +But don't worry, if you share your actor in the store and somebody uses it, it runs under their account, +not yours. + +**Related links** + +- [Store of existing actors](https://apify.com/store) +- [Documentation](https://docs.apify.com/actors) +- [View actors in Apify Console](https://console.apify.com/actors) +- [API reference](https://apify.com/docs/api/v2#/reference/actors) + +## Running an actor locally + +First let's create a boilerplate of the new actor. You could use Apify CLI and just run: + +```bash +apify create my-hello-world +``` + +The CLI will prompt you to select a project boilerplate template - let's pick "Hello world". The tool will create a directory called `my-hello-world` with a Node.js project files. You can run the actor as follows: + +```bash +cd my-hello-world +apify run +``` + +## Running Crawlee code as an actor + +For running Crawlee code as an actor on [Apify platform](https://apify.com/actors) you should either: +- use a combination of [`Actor.init()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#init) and [`Actor.exit()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#exit) functions; +- or wrap it into [`Actor.main()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#main) function. + +:::info NOTE +- Adding [`Actor.init()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#init) and [`Actor.exit()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#exit) to your code are the only two important things needed to run it on Apify platform as an actor. `Actor.init()` is needed to initialize your actor (e.g. to set the correct storage implementation), while without `Actor.exit()` the process will simply never stop. +- [`Actor.main()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#main) is an alternative to `Actor.init()` and `Actor.exit()` as it calls both behind the scenes. +::: + +Let's look at the `CheerioCrawler` example from the [Quick Start](../quick-start) guide: + + + + + {MainSource} + + + + + {InitExitSource} + + + + +Note that you could also run your actor (that is using Crawlee) locally with Apify CLI. 
You can start it with the following command in your project folder:

```bash
apify run
```

## Deploying an actor to Apify platform

Now (assuming you are already logged in to your Apify account) you can easily deploy your code to the Apify platform by running:

```bash
apify push
```

Your script will be uploaded to and built on the Apify platform so that it can be run there. For more information, see the
[Apify CLI](https://docs.apify.com/cli) documentation.

## Usage on Apify platform

You can also develop your actor in an online code editor directly on the platform (you'll need an Apify account). Go to the [Actors](https://console.apify.com/actors) page in the app, click *Create new*, then open the *Source* tab and start writing the code, or paste one of the examples from the [Examples](../examples) section.

## Storages

There are several things worth mentioning here.

### Helper functions for default Key-Value Store and Dataset

To simplify access to the _default_ storages, you can use the following methods instead of the helper functions of the respective storage classes:
- [`Actor.setValue()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#setValue), [`Actor.getValue()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#getValue), [`Actor.getInput()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#getInput) for the `Key-Value Store`
- [`Actor.pushData()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#pushData) for the `Dataset`

### Using platform storage in a local actor

When you plan to use the platform storage while developing and running your actor locally, use [`Actor.openKeyValueStore()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#openKeyValueStore), [`Actor.openDataset()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#openDataset) and [`Actor.openRequestQueue()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#openRequestQueue) to open the respective storage.

Each of these methods accepts [`OpenStorageOptions`](https://apify.github.io/apify-sdk-js/api/apify/interface/OpenStorageOptions) as a second argument, which has a single optional property: [`forceCloud`](https://apify.github.io/apify-sdk-js/api/apify/interface/OpenStorageOptions#forceCloud). If set to `true`, cloud storage will be used instead of the folder on the local disk.

:::note
If you don't plan to force usage of the platform storages when running the actor locally, there is no need to use the [`Actor`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor) class for it. The Crawlee variants `KeyValueStore.open()`, `Dataset.open()` and `RequestQueue.open()` will work the same.
:::

### Getting a public URL of an item in the platform storage

If you need to share a link to a file stored in a Key-Value Store on the Apify platform, you can use the [`getPublicUrl()`](https://apify.github.io/apify-sdk-js/api/apify/class/KeyValueStore#getPublicUrl) method. It accepts a single parameter, `key` - the key of the item you want to share.
+ +```js +import { KeyValueStore } from 'apify'; + +const store = await KeyValueStore.open(); +await store.setValue('your-file', { foo: 'bar' }); +const url = store.getPublicUrl('your-file'); +// https://api.apify.com/v2/key-value-stores//records/your-file +``` + +### Exporting dataset data + +When the `Dataset` is stored on the [Apify platform](https://apify.com/actors), you can export its data to the following formats: HTML, JSON, CSV, Excel, XML and RSS. The datasets are displayed on the actor run details page and in the [Storage](https://console.apify.com/storage) section in the Apify Console. The actual data is exported using the [Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This way you can easily share the crawling results. + +**Related links** + +- [Apify platform storage documentation](https://docs.apify.com/storage) +- [View storage in Apify Console](https://console.apify.com/storage) +- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores) +- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets) +- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues) + +## Environment variables + +The following are some additional environment variables specific to Apify platform. More Crawlee specific environment variables could be found in the [Environment Variables](./configuration#environment-variables) guide. + +:::note + +It's important to notice that `CRAWLEE_` environment variables don't need to be replaced with equivalent `APIFY_` ones. Likewise, Crawlee understands `APIFY_` environment variables after calling `Actor.init()` or when using `Actor.main()`. + +::: + +### `APIFY_TOKEN` + +The API token for your Apify account. It is used to access the Apify API, e.g. to access cloud storage +or to run an actor on the Apify platform. You can find your API token on the +[Account Settings / Integrations](https://console.apify.com/account?tab=integrations) page. + +### Combinations of `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR` + +> `CRAWLEE_STORAGE_DIR` env variable description could be found in [Environment Variables](../guides/configuration#crawlee_storage_dir) guide. + +By combining the env vars in various ways, you can greatly influence the actor's behavior. + +| Env Vars | API | Storages | +| --------------------------------------- | --- | ---------------- | +| none OR `CRAWLEE_STORAGE_DIR` | no | local | +| `APIFY_TOKEN` | yes | Apify platform | +| `APIFY_TOKEN` AND `CRAWLEE_STORAGE_DIR` | yes | local + platform | + +When using both `APIFY_TOKEN` and `CRAWLEE_STORAGE_DIR`, you can use all the Apify platform +features and your data will be stored locally by default. If you want to access platform storages, +you can use the `{ forceCloud: true }` option in their respective functions. + +```js +import { Actor } from 'apify'; +import { Dataset } from 'crawlee'; + +// or Dataset.open('my-local-data') +const localDataset = await Actor.openDataset('my-local-data'); +// but here we need the `Actor` class +const remoteDataset = await Actor.openDataset('my-dataset', { forceCloud: true }); +``` + +### `APIFY_PROXY_PASSWORD` + +Optional password to [Apify Proxy](https://docs.apify.com/proxy) for IP address rotation. +Assuming Apify Account was already created, you can find the password on the [Proxy page](https://console.apify.com/proxy) +in the Apify Console. 
The password is automatically inferred using the `APIFY_TOKEN` env var,
so in most cases, you don't need to touch it. You should use it when, for some reason,
you need access to Apify Proxy, but not to the Apify API, or when you need access to
proxies from a different account than the one your token represents.

## Proxy management

In addition to your own proxy servers and proxy servers acquired from
third-party providers used together with Crawlee, you can also rely on [Apify Proxy](https://apify.com/proxy)
for your scraping needs.

### Apify Proxy

If you are already subscribed to Apify Proxy, you can start using it immediately in only a few lines of code (for local usage, you should first be [logged in](#logging-into-apify-platform-from-crawlee) to your Apify account).

```javascript
import { Actor } from 'apify';

const proxyConfiguration = await Actor.createProxyConfiguration();
const proxyUrl = await proxyConfiguration.newUrl();
```

Note that unlike when using your own proxies in Crawlee, you shouldn't use the constructor to create a `ProxyConfiguration` instance. To use Apify Proxy, create an instance with the [`Actor.createProxyConfiguration()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#createProxyConfiguration) function instead.

### Apify Proxy Configuration

With Apify Proxy, you can select specific proxy groups to use, or countries to connect from.
This allows you to get better proxy performance after some initial research.

```javascript
import { Actor } from 'apify';

const proxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
    countryCode: 'US',
});
const proxyUrl = await proxyConfiguration.newUrl();
```

Now your crawlers will use only residential proxies from the US. Note that you must first get access
to a proxy group before you are able to use it. You can check the proxy groups available to you
in the [proxy dashboard](https://console.apify.com/proxy).

### Apify Proxy vs. Own proxies

The `ProxyConfiguration` class covers both Apify Proxy and custom proxy URLs, so that
you can easily switch between proxy providers. However, some features of the class
are available only to Apify Proxy users, mainly because Apify Proxy is what
one would call a super-proxy. It's not a single proxy server, but an API endpoint
that allows connections through millions of different IP addresses. So the class
essentially has two modes: Apify Proxy or Own (third-party) proxy.

The difference is easy to remember.
- If you're using your own proxies, create an instance with the `ProxyConfiguration` constructor, based on the provided `ProxyConfigurationOptions`.
- If you are planning to use Apify Proxy, create an instance with the [`Actor.createProxyConfiguration()`](https://apify.github.io/apify-sdk-js/api/apify/class/Actor#createProxyConfiguration) function. `ProxyConfigurationOptions.proxyUrls` and `ProxyConfigurationOptions.newUrlFunction` enable the use of your custom proxy URLs, whereas all the other options are there to configure Apify Proxy (see the sketch below).
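To make the distinction concrete, here is a minimal sketch of the two modes side by side. The proxy URLs are placeholders, and the Apify Proxy part assumes you are logged in (or have `APIFY_TOKEN` / `APIFY_PROXY_PASSWORD` set); adapt it to your own setup.

```javascript
import { Actor } from 'apify';
import { ProxyConfiguration } from 'crawlee';

// Own proxies: the plain Crawlee constructor with your proxy URLs (placeholders here).
const ownProxyConfiguration = new ProxyConfiguration({
    proxyUrls: [
        'http://proxy-1.example.com:8000',
        'http://proxy-2.example.com:8000',
    ],
});

// Apify Proxy: created via the Actor helper instead of the constructor.
const apifyProxyConfiguration = await Actor.createProxyConfiguration({
    groups: ['RESIDENTIAL'],
});

// Either instance can then be passed to a crawler via its `proxyConfiguration` option.
```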
+ +**Related links** + +- [Apify Proxy docs](https://docs.apify.com/proxy) diff --git a/docs/deployment/apify_platform_init_exit.ts b/docs/deployment/apify_platform_init_exit.ts new file mode 100644 index 000000000000..49a10f100f23 --- /dev/null +++ b/docs/deployment/apify_platform_init_exit.ts @@ -0,0 +1,27 @@ +import { Actor } from 'apify'; +import { CheerioCrawler } from 'crawlee'; + +await Actor.init(); + +const crawler = new CheerioCrawler({ + async requestHandler({ request, $, enqueueLinks }) { + const { url } = request; + + // Extract HTML title of the page. + const title = $('title').text(); + console.log(`Title of ${url}: ${title}`); + + // Add URLs that match the provided pattern. + await enqueueLinks({ + globs: ['https://www.iana.org/*'], + }); + + // Save extracted data to dataset. + await Actor.pushData({ url, title }); + }, +}); + +// Enqueue the initial request and run the crawler +await crawler.run(['https://www.iana.org/']); + +await Actor.exit(); diff --git a/docs/deployment/apify_platform_main.ts b/docs/deployment/apify_platform_main.ts new file mode 100644 index 000000000000..a338047e86ea --- /dev/null +++ b/docs/deployment/apify_platform_main.ts @@ -0,0 +1,25 @@ +import { Actor } from 'apify'; +import { CheerioCrawler } from 'crawlee'; + +await Actor.main(async () => { + const crawler = new CheerioCrawler({ + async requestHandler({ request, $, enqueueLinks }) { + const { url } = request; + + // Extract HTML title of the page. + const title = $('title').text(); + console.log(`Title of ${url}: ${title}`); + + // Add URLs that match the provided pattern. + await enqueueLinks({ + globs: ['https://www.iana.org/*'], + }); + + // Save extracted data to dataset. + await Actor.pushData({ url, title }); + }, + }); + + // Enqueue the initial request and run the crawler + await crawler.run(['https://www.iana.org/']); +}); diff --git a/docs/deployment/aws-browsers.md b/docs/deployment/aws-browsers.md new file mode 100644 index 000000000000..8de69ee55ccb --- /dev/null +++ b/docs/deployment/aws-browsers.md @@ -0,0 +1,123 @@ +--- +id: aws-browsers +title: Browsers on AWS Lambda +--- + +Running browser-enabled Crawlee crawlers in AWS Lambda is a bit complicated - but not too much. The main problem is that we have to upload not only our code and the dependencies, but also the **browser binaries**. + +## Managing browser binaries + +Fortunately, there are already some NPM packages that can help us with managing the browser binaries installation: + +- [@sparticuz/chromium](https://www.npmjs.com/package/@sparticuz/chromium) is an NPM package containing brotli-compressed chromium binaries. When run in the Lambda environment, the package unzips the binaries under the `/tmp/` path and returns the path to the executable. + +We just add this package to the project dependencies and zip the `node_modules` folder. + +```bash +# Install the package +npm i -S @sparticuz/chromium + +# Zip the dependencies +zip -r dependencies.zip ./node_modules +``` + +We will now upload the `dependencies.zip` as a Lambda Layer to AWS. Unfortunately, we cannot do this directly - there is a 50MB limit on direct uploads (and the compressed Chromium build is around that size itself). Instead, we'll upload it as an object into an S3 storage and provide the link to that object during the layer creation. + +## Updating the code + +We also have to slightly update the Crawlee code: + +- First, we pass a new `Configuration` instance to the Crawler. 
This way, every crawler instance we create will have its own storage and won’t interfere with other crawler instances running in your Lambda environment. + +```javascript title="src/main.js" +// For more information, see https://crawlee.dev/ +import { Configuration, PlaywrightCrawler } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +const crawler = new PlaywrightCrawler({ + requestHandler: router, +// highlight-start +}, new Configuration({ + persistStorage: false, +})); +// highlight-end + +await crawler.run(startUrls); +``` + +- Now, we actually have to supply the code with the Chromium path from the `@sparticuz/chromium` package. AWS Lambda execution also lacks some hardware support for GPU acceleration etc. - you can tell Chrome about this by passing the `aws_chromium.args` to the `args` parameter. + +```javascript title="src/main.js" +// For more information, see https://crawlee.dev/ +import { Configuration, PlaywrightCrawler } from 'crawlee'; +import { router } from './routes.js'; +// highlight-next-line +import aws_chromium from '@sparticuz/chromium'; + +const startUrls = ['https://crawlee.dev']; + +const crawler = new PlaywrightCrawler({ + requestHandler: router, + // highlight-start + launchContext: { + launchOptions: { + executablePath: await aws_chromium.executablePath(), + args: aws_chromium.args, + headless: true + } + } + // highlight-end +}, new Configuration({ + persistStorage: false, +})); + +``` + +- Last but not least, we have to wrap the code in the exported `handler` function - this will become the Lambda AWS will be executing. + +```javascript title="src/main.js" +import { Configuration, PlaywrightCrawler } from 'crawlee'; +import { router } from './routes.js'; +import aws_chromium from '@sparticuz/chromium'; + +const startUrls = ['https://crawlee.dev']; + +// highlight-next-line +export const handler = async (event, context) => { + const crawler = new PlaywrightCrawler({ + requestHandler: router, + launchContext: { + launchOptions: { + executablePath: await aws_chromium.executablePath(), + args: aws_chromium.args, + headless: true + } + } + }, new Configuration({ + persistStorage: false, + })); + + await crawler.run(startUrls); + + // highlight-start + return { + statusCode: 200, + body: await crawler.getData(), + }; +} +// highlight-end + +``` + +## Deploying the code + +Now we can simply pack the code into a zip archive (minus the `node_modules` folder, we have put that in the Lambda Layer, remember?). We upload the code archive to AWS as the Lambda body, set up the Lambda so it uses the dependencies Layer, and test our newly created Lambda. + +:::tip Memory settings + +Since we’re using full-size browsers here, we have to update the Lambda configurations a bit. Most importantly, make sure to set the memory setting to **1024 MB or more** and update the **Lambda timeout**. + +The target timeout value depends on how long your crawler will be running. Try measuring the execution time when running your crawler locally and set the timeout accordingly. +::: \ No newline at end of file diff --git a/docs/deployment/aws-cheerio.md b/docs/deployment/aws-cheerio.md new file mode 100644 index 000000000000..371ba15c3a5a --- /dev/null +++ b/docs/deployment/aws-cheerio.md @@ -0,0 +1,123 @@ +--- +id: aws-cheerio +title: Cheerio on AWS Lambda +--- + +Locally, we can conveniently create a Crawlee project with `npx crawlee create`. In order to run this project on AWS Lambda, however, we need to do a few tweaks. 
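For example, scaffolding the project might look like the commands below; the project name is just an illustration, and any template the CLI offers will work.

```bash
# Create a new Crawlee project and enter its folder; pick a template when prompted.
npx crawlee create my-lambda-crawler
cd my-lambda-crawler
```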
+ +## Updating the code + +Whenever we instantiate a new crawler, we have to pass a unique `Configuration` instance to it. By default, all the Crawlee crawler instances share the same storage - this can be convenient, but would also cause “statefulness” of our Lambda, which would lead to hard-to-debug problems. + +Also, when creating this Configuration instance, make sure to pass the `persistStorage: false` option. This tells Crawlee to use in-memory storage, as the Lambda filesystem is read-only. + +```javascript title="src/main.js" +// For more information, see https://crawlee.dev/ +import { CheerioCrawler, Configuration, ProxyConfiguration } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +const crawler = new CheerioCrawler({ + requestHandler: router, +// highlight-start +}, new Configuration({ + persistStorage: false, +})); +// highlight-end + +await crawler.run(startUrls); +``` + +Now, we wrap all the logic in a `handler` function. This is the actual “Lambda” that AWS will be executing later on. + +```javascript title="src/main.js" +// For more information, see https://crawlee.dev/ +import { CheerioCrawler, Configuration } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +// highlight-next-line +export const handler = async (event, context) => { + const crawler = new CheerioCrawler({ + requestHandler: router, + }, new Configuration({ + persistStorage: false, + })); + + await crawler.run(startUrls); +// highlight-next-line +}; +``` + +:::tip **Important** + +Make sure to always instantiate a **new crawler instance for every Lambda**. AWS always keeps the environment running for some time after the first Lambda execution (in order to reduce cold-start times) - so any subsequent Lambda calls will access the already-used crawler instance. + +**TLDR: Keep your Lambda stateless.** + +::: + + +Last things last, we also want to return the scraped data from the Lambda when the crawler run ends. + +In the end, your `main.js` script should look something like this: + +```javascript title="src/main.js" +// For more information, see https://crawlee.dev/ +import { CheerioCrawler, Configuration } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +export const handler = async (event, context) => { + const crawler = new CheerioCrawler({ + requestHandler: router, + }, new Configuration({ + persistStorage: false, + })); + + await crawler.run(startUrls); + + // highlight-start + return { + statusCode: 200, + body: await crawler.getData(), + } + // highlight-end +}; +``` + +## Deploying the project + +Now it’s time to deploy our script on AWS! + +Let’s create a zip archive from our project (including the `node_modules` folder) by running `zip -r package.zip .` in the project folder. + +:::note Large `node_modules` folder? + + AWS has a limit of 50MB for direct file upload. Usually, our Crawlee projects won’t be anywhere near this limit, but we can easily exceed this with large dependency trees. + + A better way to install your project dependencies is by using Lambda Layers. With Layers, we can also share files between multiple Lambdas - and keep the actual “code” part of the Lambdas as slim as possible. + + **To create a Lambda Layer, we need to:** + + - Pack the `node_modules` folder into a separate zip file (the archive should contain one folder named `node_modules`). + - Create a new Lambda layer from this archive. 
We’ll probably need to upload this file to AWS S3 storage and create the Lambda Layer like this. + - After creating it, we simply tell our new Lambda function to use this layer. +::: + +To deploy our actual code, we upload the `package.zip` archive as our code source. + +In Lambda Runtime Settings, we point the `handler` to the main function that runs the crawler. You can use slashes to describe directory structure and `.` to denote a named export. Our handler function is called `handler` and is exported from the `src/main.js` file, so we’ll use `src/main.handler` as the handler name. + +Now we’re all set! By clicking the **Test** button, we can send an example testing event to our new Lambda. The actual contents of the event don’t really matter for now - if you want, further parameterize your crawler run by analyzing the `event` object AWS passes as the first argument to the handler. + +:::tip +In the Configuration tab in the AWS Lambda dashboard, you can configure the amount of memory the Lambda is running with or the size of the ephemeral storage. + +The memory size can greatly affect the execution speed of your Lambda. + +See the [official documentation](https://docs.aws.amazon.com/lambda/latest/operatorguide/computing-power.html) to see how the performance and cost scale with more memory. +::: \ No newline at end of file diff --git a/docs/deployment/gcp-browsers.md b/docs/deployment/gcp-browsers.md new file mode 100644 index 000000000000..7583c628deff --- /dev/null +++ b/docs/deployment/gcp-browsers.md @@ -0,0 +1,85 @@ +--- +id: gcp-browsers +title: Browsers in GCP Cloud Run +--- + +Running full-size browsers on GCP Cloud Functions is actually a bit different from doing so on AWS Lambda - [apparently](https://pptr.dev/troubleshooting#running-puppeteer-on-google-cloud-functions), the latest runtime versions miss dependencies required to run Chromium. + +If we want to run browser-enabled Crawlee crawlers on GCP, we’ll need to turn towards **Cloud Run.** Cloud Run is GCP’s platform for running Docker containers - other than that, (almost) everything is the same as with Cloud Functions / AWS Lambdas. + +GCP can spin up your containers on demand, so you’re only billed for the time it takes your container to return an HTTP response to the requesting client. In a way, it also provides a slightly better developer experience (than regular FaaS), as you can debug your Docker containers locally and be sure you’re getting the same setup in the cloud. + +## Preparing the project + +As always, we first pass a new `Configuration` instance to the crawler constructor: + +```javascript title="src/main.js" +import { Configuration, PlaywrightCrawler } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +const crawler = new PlaywrightCrawler({ + requestHandler: router, +// highlight-start +}, new Configuration({ + persistStorage: false, +})); +// highlight-end + +await crawler.run(startUrls); +``` + +All we now need to do is wrap our crawler with an Express HTTP server handler, so it can communicate with the client via HTTP. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves. + +:::info +GCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (GCP exposes this one to the outer world). 
+::: + +The `main.js` script should be looking like this in the end: + +```javascript title="src/main.js" +import { Configuration, PlaywrightCrawler } from 'crawlee'; +import { router } from './routes.js'; +// highlight-start +import express from 'express'; +const app = express(); +// highlight-end + +const startUrls = ['https://crawlee.dev']; + + +// highlight-next-line +app.get('/', async (req, res) => { + const crawler = new PlaywrightCrawler({ + requestHandler: router, + }, new Configuration({ + persistStorage: false, + })); + + await crawler.run(startUrls); + + // highlight-next-line + return res.send(await crawler.getData()); +// highlight-next-line +}); + +// highlight-next-line +app.listen(parseInt(process.env.PORT) || 3000); +``` + +:::tip +Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless.** +::: + +## Deploying to GCP + +Now, we’re ready to deploy! If you have initialized your project using `npx crawlee create`, the initialization script has prepared a Dockerfile for you. + +All you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it). The gcloud CLI application will ask you a few questions, such as what region you want to deploy your application in, or whether you want to make your application public or private. + +After answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there. + +:::tip +In case your first execution of your newly created Cloud Run fails, try editing the Run configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping. +::: \ No newline at end of file diff --git a/docs/deployment/gcp-cheerio.md b/docs/deployment/gcp-cheerio.md new file mode 100644 index 000000000000..918ff29de93f --- /dev/null +++ b/docs/deployment/gcp-cheerio.md @@ -0,0 +1,81 @@ +--- +id: gcp-cheerio +title: Cheerio on GCP Cloud Functions +--- + +Running CheerioCrawler-based project in GCP functions is actually quite easy - you just have to make a few changes to the project code. + +## Updating the project + +Let’s first create the Crawlee project locally with `npx crawlee create`. Set the `"main"` field in the `package.json` file to `"src/main.js"`. + +```json title="package.json" +{ + "name": "my-crawlee-project", + "version": "1.0.0", + // highlight-next-line + "main": "src/main.js", + ... +} +``` + +Now, let’s update the `main.js` file, namely: + +- Pass a separate `Configuration` instance (with the `persistStorage` option set to `false`) to the crawler constructor. + +```javascript title="src/main.js" +import { CheerioCrawler, Configuration } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +const crawler = new CheerioCrawler({ + requestHandler: router, +// highlight-start +}, new Configuration({ + persistStorage: false, +})); +// highlight-end + +await crawler.run(startUrls); +``` + +- Wrap the crawler call in a separate handler function. This function: + - Can be asynchronous + - Takes two positional arguments - `req` (containing details about the user-made request to your cloud function) and `res` (response object you can modify). + - Call `res.send(data)` to return any data from the cloud function. +- Export this function from the `src/main.js` module as a named export. 
+ +```javascript title="src/main.js" +import { CheerioCrawler, Configuration } from 'crawlee'; +import { router } from './routes.js'; + +const startUrls = ['https://crawlee.dev']; + +// highlight-next-line +export const handler = async (req, res) => { + const crawler = new CheerioCrawler({ + requestHandler: router, + }, new Configuration({ + persistStorage: false, + })); + + await crawler.run(startUrls); + + // highlight-next-line + return res.send(await crawler.getData()) +// highlight-next-line +} +``` + +## Deploying to Google Cloud Platform + +In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, set region and function timeout. + +When deploying, pick **ZIP Upload**. You have to create a new GCP storage bucket to store the zip packages in. + +Now, for the package - you should zip all the contents of your project folder **excluding the `node_modules` folder** - GCP doesn’t have Layers like AWS Lambda does, but takes care of the project setup for us based on the `package.json` file). + +Also, make sure to set the **Entry point** to the name of the function you’ve exported from the `src/main.js` file. GCP takes the file from the `package.json`'s `main` field. + +After the Function deploys, you can test it by clicking the “Testing” tab. This tab contains a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.
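For reference, the command in the Testing tab usually looks something like the sketch below. The URL is a placeholder - copy the exact command (or your function's URL) from the Testing tab, since authenticated functions also require an identity token.

```bash
# Placeholder URL - use the one shown for your function in the GCP console.
curl -m 70 -X POST "https://<region>-<project-id>.cloudfunctions.net/<function-name>" \
  -H "Authorization: bearer $(gcloud auth print-identity-token)" \
  -H "Content-Type: application/json" \
  -d '{}'
```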