From e4004e36d8f8008a6582be3271955a06eefa57cb Mon Sep 17 00:00:00 2001
From: Jindřich Bär
Date: Mon, 18 Sep 2023 13:03:50 +0200
Subject: [PATCH] feat: add deployment guides

---
 docs/deployment/deploy-on-aws-browsers.md | 123 ++++++++++++++++++++++
 docs/deployment/deploy-on-aws-cheerio.md  | 123 ++++++++++++++++++++++
 docs/deployment/deploy-on-gcp-browsers.md |  58 ++++++++++
 docs/deployment/deploy-on-gcp-cheerio.md  |  81 ++++++++++++++
 website/sidebars.js                       |  26 +++++
 5 files changed, 411 insertions(+)
 create mode 100644 docs/deployment/deploy-on-aws-browsers.md
 create mode 100644 docs/deployment/deploy-on-aws-cheerio.md
 create mode 100644 docs/deployment/deploy-on-gcp-browsers.md
 create mode 100644 docs/deployment/deploy-on-gcp-cheerio.md

diff --git a/docs/deployment/deploy-on-aws-browsers.md b/docs/deployment/deploy-on-aws-browsers.md
new file mode 100644
index 000000000000..ed42cedbf295
--- /dev/null
+++ b/docs/deployment/deploy-on-aws-browsers.md
@@ -0,0 +1,123 @@
---
id: deploy-on-aws-browsers
title: Browsers on AWS Lambda
---

Running browser-enabled Crawlee crawlers in AWS Lambda is a bit complicated - but not overly so. The main problem is that we have to upload not only our code and its dependencies, but also the **browser binaries**.

## Managing browser binaries

Fortunately, there are already NPM packages that can help us manage the browser binaries:

- [@sparticuz/chromium](https://www.npmjs.com/package/@sparticuz/chromium) is an NPM package containing Brotli-compressed Chromium binaries. When run in the Lambda environment, the package decompresses the binaries under the `/tmp/` path and returns the path to the executable.

We just add this package to the project dependencies and zip the `node_modules` folder:

```bash
# Install the package
npm i -S @sparticuz/chromium

# Zip the dependencies
zip -r dependencies.zip ./node_modules
```

We will now upload `dependencies.zip` to AWS as a Lambda Layer. Unfortunately, we cannot do this directly - there is a 50MB limit on direct uploads (and the compressed Chromium build alone is around that size). Instead, we upload the archive as an object to an S3 bucket and provide a link to that object when creating the Layer.

## Updating the code

We also have to slightly update the Crawlee code:

- First, we pass a new `Configuration` instance to the crawler constructor. This way, every crawler instance we create gets its own storage and won't interfere with other crawler instances running in your Lambda environment.

```javascript title="src/main.js"
// For more information, see https://crawlee.dev/
import { Configuration, PlaywrightCrawler } from 'crawlee';
import { router } from './routes.js';

const startUrls = ['https://crawlee.dev'];

const crawler = new PlaywrightCrawler({
    requestHandler: router,
// highlight-start
}, new Configuration({
    persistStorage: false,
}));
// highlight-end

await crawler.run(startUrls);
```

- Now, we have to supply the code with the Chromium executable path from the `@sparticuz/chromium` package. The AWS Lambda environment also lacks hardware support for GPU acceleration and similar features - we can tell Chromium about this by passing `aws_chromium.args` to the `args` launch option.
```javascript title="src/main.js"
// For more information, see https://crawlee.dev/
import { Configuration, PlaywrightCrawler } from 'crawlee';
import { router } from './routes.js';
// highlight-next-line
import aws_chromium from '@sparticuz/chromium';

const startUrls = ['https://crawlee.dev'];

const crawler = new PlaywrightCrawler({
    requestHandler: router,
    // highlight-start
    launchContext: {
        launchOptions: {
            executablePath: await aws_chromium.executablePath(),
            args: aws_chromium.args,
            headless: true
        }
    }
    // highlight-end
}, new Configuration({
    persistStorage: false,
}));

await crawler.run(startUrls);
```

- Last but not least, we wrap the code in an exported `handler` function - this is the function AWS Lambda will be executing.

```javascript title="src/main.js"
import { Configuration, PlaywrightCrawler } from 'crawlee';
import { router } from './routes.js';
import aws_chromium from '@sparticuz/chromium';

const startUrls = ['https://crawlee.dev'];

// highlight-next-line
export const handler = async (event, context) => {
    const crawler = new PlaywrightCrawler({
        requestHandler: router,
        launchContext: {
            launchOptions: {
                executablePath: await aws_chromium.executablePath(),
                args: aws_chromium.args,
                headless: true
            }
        }
    }, new Configuration({
        persistStorage: false,
    }));

    await crawler.run(startUrls);

    // highlight-start
    return {
        statusCode: 200,
        body: await crawler.getData(),
    };
};
// highlight-end
```

## Deploying the code

Now we can simply pack the code into a zip archive (minus the `node_modules` folder - we have put that in the Lambda Layer, remember?). We upload the code archive to AWS as the Lambda code source, configure the Lambda to use the dependencies Layer, and test our newly created Lambda.

:::tip Memory settings

Since we're running full-size browsers here, we have to update the Lambda configuration a bit. Most importantly, make sure to set the memory to **1024 MB or more** and increase the **Lambda timeout**.

The right timeout value depends on how long your crawler runs. Try measuring the execution time when running your crawler locally and set the timeout accordingly.
:::

diff --git a/docs/deployment/deploy-on-aws-cheerio.md b/docs/deployment/deploy-on-aws-cheerio.md
new file mode 100644
index 000000000000..4fb4eb4c3864
--- /dev/null
+++ b/docs/deployment/deploy-on-aws-cheerio.md
@@ -0,0 +1,123 @@
---
id: deploy-on-aws-cheerio
title: Cheerio on AWS Lambda
---

Locally, we can conveniently create a Crawlee project with `npx crawlee create`. To run this project on AWS Lambda, however, we need to make a few tweaks.

## Updating the code

Whenever we instantiate a new crawler, we have to pass a unique `Configuration` instance to it. By default, all Crawlee crawler instances share the same storage - this can be convenient, but it would also make our Lambda stateful, leading to hard-to-debug problems.

Also, when creating this `Configuration` instance, make sure to pass the `persistStorage: false` option. This tells Crawlee to use in-memory storage, as the Lambda filesystem is read-only.
```javascript title="src/main.js"
// For more information, see https://crawlee.dev/
import { CheerioCrawler, Configuration } from 'crawlee';
import { router } from './routes.js';

const startUrls = ['https://crawlee.dev'];

const crawler = new CheerioCrawler({
    requestHandler: router,
// highlight-start
}, new Configuration({
    persistStorage: false,
}));
// highlight-end

await crawler.run(startUrls);
```

Now, we wrap all the logic in a `handler` function. This is the actual "Lambda" that AWS will be executing later on.

```javascript title="src/main.js"
// For more information, see https://crawlee.dev/
import { CheerioCrawler, Configuration } from 'crawlee';
import { router } from './routes.js';

const startUrls = ['https://crawlee.dev'];

// highlight-next-line
export const handler = async (event, context) => {
    const crawler = new CheerioCrawler({
        requestHandler: router,
    }, new Configuration({
        persistStorage: false,
    }));

    await crawler.run(startUrls);
// highlight-next-line
};
```

:::tip Important

Make sure to always instantiate a **new crawler instance for every Lambda**. AWS keeps the environment running for some time after the first Lambda execution (in order to reduce cold-start times) - so any subsequent Lambda invocations would otherwise reuse the already-used crawler instance.

**TLDR: Keep your Lambda stateless.**

:::

Last but not least, we also want to return the scraped data from the Lambda when the crawler run ends.

In the end, your `main.js` script should look something like this:

```javascript title="src/main.js"
// For more information, see https://crawlee.dev/
import { CheerioCrawler, Configuration } from 'crawlee';
import { router } from './routes.js';

const startUrls = ['https://crawlee.dev'];

export const handler = async (event, context) => {
    const crawler = new CheerioCrawler({
        requestHandler: router,
    }, new Configuration({
        persistStorage: false,
    }));

    await crawler.run(startUrls);

    // highlight-start
    return {
        statusCode: 200,
        body: await crawler.getData(),
    };
    // highlight-end
};
```

## Deploying the project

Now it's time to deploy our script on AWS!

Let's create a zip archive from our project (including the `node_modules` folder) by running `zip -r package.zip .` in the project folder.

:::note Large `node_modules` folder?

AWS has a 50MB limit on direct file uploads. Usually, our Crawlee projects won't be anywhere near this limit, but we can easily exceed it with a large dependency tree.

A better way to manage your project dependencies is to use Lambda Layers. With Layers, we can also share files between multiple Lambdas - and keep the actual "code" part of each Lambda as slim as possible.

**To create a Lambda Layer, we need to:**

- Pack the `node_modules` folder into a separate zip file (the archive should contain one folder named `node_modules`).
- Create a new Lambda Layer from this archive. We'll probably need to upload the file to an S3 bucket first and reference it when creating the Layer.
- After creating it, we simply tell our new Lambda function to use this Layer.
:::

To deploy our actual code, we upload the `package.zip` archive as our code source.

In the Lambda Runtime Settings, we point the **handler** to the function that runs the crawler. You can use slashes to describe the directory structure and `.` to denote a named export.
Our handler function is called `handler` and is exported from the `src/main.js` file, so we'll use `src/main.handler` as the handler name.

Now we're all set! By clicking the **Test** button, we can send an example testing event to our new Lambda. The actual contents of the event don't really matter for now - if you want, you can further parameterize your crawler run by analyzing the `event` object AWS passes as the first argument to the handler.

:::tip
In the Configuration tab of the AWS Lambda dashboard, you can configure the amount of memory the Lambda runs with, or the size of the ephemeral storage.

The memory size can greatly affect the execution speed of your Lambda.

See the [official documentation](https://docs.aws.amazon.com/lambda/latest/operatorguide/computing-power.html) to see how performance and cost scale with more memory.
:::

diff --git a/docs/deployment/deploy-on-gcp-browsers.md b/docs/deployment/deploy-on-gcp-browsers.md
new file mode 100644
index 000000000000..0ef62235cda9
--- /dev/null
+++ b/docs/deployment/deploy-on-gcp-browsers.md
@@ -0,0 +1,58 @@
---
id: deploy-on-gcp-browsers
title: Browsers in GCP Cloud Run
---

Running full-size browsers on GCP Cloud Functions is a bit different from doing so on AWS Lambda - [apparently](https://pptr.dev/troubleshooting#running-puppeteer-on-google-cloud-functions), the latest runtime versions are missing dependencies required to run Chromium.

If we want to run browser-enabled Crawlee crawlers on GCP, we'll need to turn to **Cloud Run**. Cloud Run is GCP's platform for running Docker containers - other than that, (almost) everything is the same as with Cloud Functions / AWS Lambda.

GCP spins up your containers on demand, so you're only billed for the time it takes your container to return an HTTP response to the requesting client. In a way, it also provides a slightly better developer experience than regular FaaS, as you can debug your Docker containers locally and be sure you're getting the same setup in the cloud.

As always, we first pass a new `Configuration` instance (with `persistStorage: false`) to the crawler constructor - you can see this in the final script below.

All we now need to do is wrap our crawler with an Express HTTP server handler, so it can communicate with the client via HTTP. Because the Cloud Run platform sees only an opaque Docker container, we have to take care of this bit ourselves.

:::info
GCP passes you an environment variable called `PORT` - your HTTP server is expected to be listening on this port (it's the one GCP exposes to the outside world).
:::

In the end, the `main.js` script should look like this:

```javascript title="src/main.js"
import { Configuration, PlaywrightCrawler } from 'crawlee';
import { router } from './routes.js';
import express from 'express';

const startUrls = ['https://crawlee.dev'];

const app = express();

app.get('/', async (req, res) => {
    const crawler = new PlaywrightCrawler({
        requestHandler: router,
    }, new Configuration({
        persistStorage: false,
    }));

    await crawler.run(startUrls);

    return res.send(await crawler.getData());
});

app.listen(parseInt(process.env.PORT) || 3000);
```

:::tip
Always make sure to keep all the logic in the request handler - as with other FaaS services, your request handlers have to be **stateless**.
:::

Now, we're ready to deploy! If you have initialized your project using `npx crawlee create`, the initialization script has prepared a Dockerfile for you.
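Before running the deployment itself, it can be useful to sanity-check the container locally. The commands below are only a sketch of one way to do that - they assume Docker is installed, that the generated Dockerfile starts the `src/main.js` script, and they use `my-gcp-crawler` as an example image name (the Express server above listens on port 3000 when `PORT` is not set).

```bash
# Build the image from the Dockerfile in the project folder
docker build -t my-gcp-crawler .

# Run the container and expose the HTTP server on port 3000
docker run --rm -p 3000:3000 my-gcp-crawler

# In another terminal, trigger a crawl and print the scraped data
curl http://localhost:3000/
```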
All you have to do now is run `gcloud run deploy` in your project folder (the one with your Dockerfile in it). The gcloud CLI will ask you a few questions, such as which region you want to deploy your application in, or whether you want to make your application public or private.

After answering those questions, you should be able to see your application in the GCP dashboard and run it using the link you find there.

:::tip
In case the first execution of your newly created Cloud Run service fails, try editing the service configuration - mainly setting the available memory to 1GiB or more and updating the request timeout according to the size of the website you are scraping.
:::

diff --git a/docs/deployment/deploy-on-gcp-cheerio.md b/docs/deployment/deploy-on-gcp-cheerio.md
new file mode 100644
index 000000000000..65d123864fe5
--- /dev/null
+++ b/docs/deployment/deploy-on-gcp-cheerio.md
@@ -0,0 +1,81 @@
---
id: deploy-on-gcp-cheerio
title: Cheerio on GCP Cloud Functions
---

Running a CheerioCrawler-based project on GCP Cloud Functions is actually quite easy - you just have to make a few changes to the project code.

## Updating the project

Let's first create the Crawlee project locally with `npx crawlee create`. Set the `"main"` field in the `package.json` file to `"src/main.js"`.

```json title="package.json"
{
    "name": "my-crawlee-project",
    "version": "1.0.0",
    // highlight-next-line
    "main": "src/main.js",
    ...
}
```

Now, let's update the `main.js` file, namely:

- Pass a separate `Configuration` instance (with the `persistStorage` option set to `false`) to the crawler constructor.

```javascript title="src/main.js"
import { CheerioCrawler, Configuration } from 'crawlee';
import { router } from './routes.js';

const startUrls = ['https://crawlee.dev'];

const crawler = new CheerioCrawler({
    requestHandler: router,
// highlight-start
}, new Configuration({
    persistStorage: false,
}));
// highlight-end

await crawler.run(startUrls);
```

- Wrap the crawler call in a separate handler function. This function:
    - Can be asynchronous.
    - Takes two positional arguments - `req` (containing details about the request made to your Cloud Function) and `res` (a response object you can modify).
    - Should call `res.send(data)` to return data from the Cloud Function.
- Export this function from the `src/main.js` module as a named export.

```javascript title="src/main.js"
import { CheerioCrawler, Configuration } from 'crawlee';
import { router } from './routes.js';

const startUrls = ['https://crawlee.dev'];

// highlight-next-line
export const handler = async (req, res) => {
    const crawler = new CheerioCrawler({
        requestHandler: router,
    }, new Configuration({
        persistStorage: false,
    }));

    await crawler.run(startUrls);

    // highlight-next-line
    return res.send(await crawler.getData());
// highlight-next-line
};
```

## Deploying to Google Cloud Platform

In the Google Cloud dashboard, create a new function, allocate memory and CPUs to it, and set the region and function timeout.

When deploying, pick **ZIP Upload**. You have to create a new GCP storage bucket to store the zip packages in.

Now, for the package - zip all the contents of your project folder **excluding the `node_modules` folder**. GCP doesn't have Layers like AWS Lambda does - instead, it installs the project dependencies for us based on the `package.json` file.
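A minimal sketch of how such an archive might be created with the `zip` CLI - the exclusion pattern is just an example, adjust it to your project layout:

```bash
# Zip the project without the node_modules folder -
# GCP installs the dependencies from package.json during deployment
zip -r package.zip . -x "node_modules/*"
```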
Also, make sure to set the **Entry point** to the name of the function you've exported from the `src/main.js` file. GCP resolves the file from the `main` field in `package.json`.

After the Function deploys, you can test it in the **Testing** tab. This tab contains a `curl` script that calls your new Cloud Function. To avoid having to install the `gcloud` CLI application locally, you can also run this script in the Cloud Shell by clicking the link above the code block.

diff --git a/website/sidebars.js b/website/sidebars.js
index 477bfeb2cf50..aa8041b810ed 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -21,6 +21,32 @@ module.exports = {
            'introduction/deployment',
        ],
    },
    {
        type: 'category',
        label: 'Deployment',
        // link: {
        //     type: 'doc',
        //     id: 'introduction/introduction',
        // },
        items: [
            {
                type: 'category',
                label: 'Deploy on AWS',
                items: [
                    'deployment/deploy-on-aws-cheerio',
                    'deployment/deploy-on-aws-browsers',
                ],
            },
            {
                type: 'category',
                label: 'Deploy on GCP',
                items: [
                    'deployment/deploy-on-gcp-cheerio',
                    'deployment/deploy-on-gcp-browsers',
                ],
            },
        ],
    },
    {
        type: 'category',
        label: 'Guides',