From f97e125e249285dd93771f5948d9407124cd7578 Mon Sep 17 00:00:00 2001 From: Saurav Jain Date: Fri, 17 May 2024 19:54:39 +0530 Subject: [PATCH] docs: improve crawlee seo ranking (#2472) --- docs/examples/crawl_sitemap.mdx | 2 +- docs/examples/crawler-plugins/index.mdx | 6 +++--- docs/examples/http_crawler.mdx | 2 +- docs/examples/http_crawler.ts | 5 +++-- docs/guides/cheerio_crawler.mdx | 6 +++--- website/src/components/Highlights.jsx | 2 +- 6 files changed, 12 insertions(+), 11 deletions(-) diff --git a/docs/examples/crawl_sitemap.mdx b/docs/examples/crawl_sitemap.mdx index a136f00eedfa..711b3c337600 100644 --- a/docs/examples/crawl_sitemap.mdx +++ b/docs/examples/crawl_sitemap.mdx @@ -12,7 +12,7 @@ import CheerioSource from '!!raw-loader!roa-loader!./crawl_sitemap_cheerio.ts'; import PuppeteerSource from '!!raw-loader!roa-loader!./crawl_sitemap_puppeteer.ts'; import PlaywrightSource from '!!raw-loader!roa-loader!./crawl_sitemap_playwright.ts'; -This example downloads and crawls the URLs from a sitemap, by using the `Sitemap` utility class provided by the `@crawlee/utils` module. +This example builds a sitemap crawler which downloads and crawls the URLs from a sitemap, by using the `Sitemap` utility class provided by the `@crawlee/utils` module. 
diff --git a/docs/examples/crawler-plugins/index.mdx b/docs/examples/crawler-plugins/index.mdx index 08bd4d9430ee..fc38fa195a1a 100644 --- a/docs/examples/crawler-plugins/index.mdx +++ b/docs/examples/crawler-plugins/index.mdx @@ -13,7 +13,7 @@ import PlaywrightExtraSource from '!!raw-loader!roa-loader!./playwright-extra.ts [`puppeteer-extra`](https://www.npmjs.com/package/puppeteer-extra) and [`playwright-extra`](https://www.npmjs.com/package/playwright-extra) are community-built libraries that bring in a plugin system to enhance the usage of [`puppeteer`](https://www.npmjs.com/package/puppeteer) and [`playwright`](https://www.npmjs.com/package/playwright) respectively (bringing in extra functionality, like improving stealth for -example by using the [`puppeteer-extra-plugin-stealth`](https://www.npmjs.com/package/puppeteer-extra-plugin-stealth) plugin). +example by using the Puppeteer Stealth plugin [(`puppeteer-extra-plugin-stealth`)](https://www.npmjs.com/package/puppeteer-extra-plugin-stealth)). :::tip Available plugins @@ -23,7 +23,7 @@ For [`playwright`](https://www.npmjs.com/package/playwright), please see [`playw ::: -In this example, we'll show you how to use the [`puppeteer-extra-plugin-stealth`](https://www.npmjs.com/package/puppeteer-extra-plugin-stealth) plugin +In this example, we'll show you how to use the Puppeteer Stealth [(`puppeteer-extra-plugin-stealth`)](https://www.npmjs.com/package/puppeteer-extra-plugin-stealth) plugin to help you avoid bot detections when crawling your target website. @@ -31,7 +31,7 @@ to help you avoid bot detections when crawling your target website. 
:::info Before you begin -Make sure you've installed the `puppeteer-extra` and `puppeteer-extra-plugin-stealth` packages via your preferred package manager +Make sure you've installed the Puppeteer Extra (`puppeteer-extra`) and Puppeteer Stealth plugin (`puppeteer-extra-plugin-stealth`) packages via your preferred package manager ```bash npm install puppeteer-extra puppeteer-extra-plugin-stealth diff --git a/docs/examples/http_crawler.mdx b/docs/examples/http_crawler.mdx index e62793ae554a..41f5d25394f3 100644 --- a/docs/examples/http_crawler.mdx +++ b/docs/examples/http_crawler.mdx @@ -7,7 +7,7 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import ApiLink from '@site/src/components/ApiLink'; import HttpCrawlerSource from '!!raw-loader!roa-loader!./http_crawler.ts'; -This example demonstrates how to use `HttpCrawler` to crawl a list of URLs from an external file, load each URL using a plain HTTP request, and save HTML. +This example demonstrates how to use `HttpCrawler` to build a crawler that crawls a list of URLs from an external file, loads each URL using a plain HTTP request, and saves the HTML. {HttpCrawlerSource} diff --git a/docs/examples/http_crawler.ts b/docs/examples/http_crawler.ts index 4052df13136b..3940fc5de650 100644 --- a/docs/examples/http_crawler.ts +++ b/docs/examples/http_crawler.ts @@ -35,8 +35,8 @@ const crawler = new HttpCrawler({ // Store the results to the dataset. In local configuration, // the data will be stored as JSON files in ./storage/datasets/default await Dataset.pushData({ - url: request.url, - body, + url: request.url, // URL of the page + body, // HTML code of the page }); }, @@ -47,6 +47,7 @@ const crawler = new HttpCrawler({ }); // Run the crawler and wait for it to finish. 
+// It will crawl a list of URLs from an external file, load each URL using a plain HTTP request, and save HTML await crawler.run([ 'https://crawlee.dev', ]); diff --git a/docs/guides/cheerio_crawler.mdx b/docs/guides/cheerio_crawler.mdx index 6097f77b47ed..d6926e9906de 100644 --- a/docs/guides/cheerio_crawler.mdx +++ b/docs/guides/cheerio_crawler.mdx @@ -11,7 +11,7 @@ import ApiLink from '@site/src/components/ApiLink'; ## What is Cheerio -[Cheerio](https://www.npmjs.com/package/cheerio) is essentially [jQuery](https://jquery.com/) for Node.js. It offers the same API, including the familiar `$` object. You can use it, as you would use jQuery for manipulating the DOM of an HTML page. In crawling, you'll mostly use it to select the needed elements and extract their values - the data you're interested in. But jQuery runs in a browser and attaches directly to the browser's DOM. Where does `cheerio` get its HTML? This is where the `Crawler` part of `CheerioCrawler` comes in. +[Cheerio](https://cheerio.js.org/) is essentially [jQuery](https://jquery.com/) for Node.js. It offers the same API, including the familiar `$` object. You can use it, as you would use jQuery for manipulating the DOM of an HTML page. In crawling, you'll mostly use it to select the needed elements and extract their values - the data you're interested in. But jQuery runs in a browser and attaches directly to the browser's DOM. Where does `cheerio` get its HTML? This is where the `Crawler` part of `CheerioCrawler` comes in. ## How the crawler works @@ -23,7 +23,7 @@ Modern web pages often do not serve all of their content in the first HTML respo ::: -Once the page's HTML is retrieved, the crawler will pass it to [Cheerio](https://www.npmjs.com/package/cheerio) for parsing. The result is the typical `$` function, which should be familiar to jQuery users. 
You can use the `$` function to do all sorts of lookups and manipulation of the page's HTML, but in scraping, you will mostly use it to find specific HTML elements and extract their data. +Once the page's HTML is retrieved, the crawler will pass it to [Cheerio](https://github.com/cheeriojs/cheerio) for parsing. The result is the typical `$` function, which should be familiar to jQuery users. You can use the `$` function to do all sorts of lookups and manipulation of the page's HTML, but in scraping, you will mostly use it to find specific HTML elements and extract their data. Example use of Cheerio and its `$` function in comparison to browser JavaScript: @@ -41,7 +41,7 @@ $('[href]') :::note -This is not to show that Cheerio is better than plain browser JavaScript. Some might actually prefer the more expressive way plain JS provides. Unfortunately, the browser JavaScript methods are not available in Node.js, so Cheerio is your best bet to do the parsing in Node. +This is not to show that Cheerio is better than plain browser JavaScript. Some might actually prefer the more expressive way plain JS provides. Unfortunately, the browser JavaScript methods are not available in Node.js, so Cheerio is your best bet to do the parsing in Node.js. ::: diff --git a/website/src/components/Highlights.jsx b/website/src/components/Highlights.jsx index 00177fdf5c65..e96c2121c751 100644 --- a/website/src/components/Highlights.jsx +++ b/website/src/components/Highlights.jsx @@ -10,7 +10,7 @@ const FeatureList = [ <> We believe websites are best scraped in the language they're written in. Crawlee runs on Node.js and it's built in TypeScript to improve code completion in your IDE, - even if you don't use TypeScript yourself. + even if you don't use TypeScript yourself. Crawlee supports both TypeScript and JavaScript crawling. ), },