Add a unified 'crawler' template
janbuchar committed Sep 20, 2024
1 parent 3c3dfe8 commit 1c54b81
Showing 16 changed files with 366 additions and 23 deletions.
110 changes: 87 additions & 23 deletions src/crawlee/_cli.py
@@ -1,7 +1,7 @@
 # ruff: noqa: TRY301, FBT002, UP007
 from __future__ import annotations
 
-import os
+import json
 from pathlib import Path
 from typing import Annotated, Optional, cast
 
@@ -16,6 +16,11 @@
 
 cli = typer.Typer(no_args_is_help=True)
 
+cookiecutter_json = json.load((Path(__file__).parent.parent.parent / 'templates' / 'crawler' / 'cookiecutter.json').open())
+crawler_choices = cookiecutter_json['crawler_type']
+package_manager_choices = cookiecutter_json['package_manager']
+default_start_url = cookiecutter_json['start_url']
+
 
 @cli.callback(invoke_without_command=True)
 def callback(
@@ -64,25 +69,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
     return project_name
 
 
-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
+def _prompt_text(message: str, default: str) -> str:
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
     )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
 
-    # Prompt for template choice
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
     return cast(
         str,
         ConsoleRender().render(
             inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
             ),
         ),
     )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
+            ),
+        ),
+    )
@@ -92,26 +114,63 @@ def _prompt_for_template() -> str:
 def create(
     project_name: Optional[str] = typer.Argument(
         default=None,
+        show_default=False,
         help='The name of the project and the directory that will be created to contain it. '
         'If none is given, you will be prompted.',
     ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
+        show_default=False,
+        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
+    ),
+    package_manager: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='Package manager to be used in the new project. If none is given, you will be prompted.',
+    ),
+    start_url: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
+    ),
-    template: Optional[str] = typer.Option(
+    enable_apify_integration: Optional[bool] = typer.Option(
         default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
     ),
 ) -> None:
     """Bootstrap a new Crawlee project."""
     try:
         # Prompt for project name if not provided.
         project_name = _prompt_for_project_name(project_name)
 
-        # Prompt for template choice if not provided.
-        if template is None:
-            template = _prompt_for_template()
 
-        if project_name and template:
+        # Prompt for crawler_type if not provided.
+        if crawler_type is None:
+            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+        # Prompt for package manager if not provided.
+        if package_manager is None:
+            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+        # Prompt for start URL
+        if start_url is None:
+            start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+        # Ask about Apify integration if not explicitly configured
+        if enable_apify_integration is None:
+            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+        if all(
+            [
+                project_name,
+                crawler_type,
+                package_manager,
+                start_url,
+                enable_apify_integration is not None,
+            ]
+        ):
             # Start the bootstrap process.
             with Progress(
                 SpinnerColumn(),
@@ -121,9 +180,14 @@ def create(
             progress.add_task(description='Bootstrapping...', total=None)
             cookiecutter(
                 template='gh:apify/crawlee-python',
-                directory=f'templates/{template}',
+                directory='templates/crawler',
                 no_input=True,
-                extra_context={'project_name': project_name},
+                extra_context={
+                    'project_name': project_name,
+                    'package_manager': package_manager,
+                    'crawler_type': crawler_type,
+                    'enable_apify_integration': enable_apify_integration,
+                },
             )
 
     typer.echo(f'Your project "{project_name}" was created.')
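With every option exposed as a flag, project creation becomes fully scriptable. Assuming the package installs its console script as `crawlee` (the entry-point name is not shown in this diff), a non-interactive run would look something like `crawlee create my-crawler --crawler-type playwright --package-manager poetry --start-url https://crawlee.dev`, with Typer falling back to the interactive prompts only for whichever options are omitted.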
12 changes: 12 additions & 0 deletions templates/crawler/cookiecutter.json
@@ -0,0 +1,12 @@
{
"project_name": "crawlee-python-beautifulsoup-project",
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright"],
"package_manager": ["poetry", "pip"],
"enable_apify_integration": false,
"start_url": "https://crawlee.dev",
"_jinja2_env_vars": {
"line_statement_prefix": "# %"
},
"_extensions": ["jinja2.ext.do"]
}
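The `_jinja2_env_vars` entry configures the Jinja environment that cookiecutter builds for rendering, so that lines beginning with `# %` are parsed as template statements rather than literal text; this is what keeps the templates below syntactically valid-looking Python. A standalone sketch of the mechanism (illustrative only, not part of the commit):

from jinja2 import Environment

# Recreate the engine settings from cookiecutter.json: a line starting
# with '# %' is treated as a Jinja statement instead of template output.
env = Environment(line_statement_prefix='# %', extensions=['jinja2.ext.do'])

template = env.from_string(
    '# % if enable_apify_integration\n'
    'from apify import Actor\n'
    '# % endif\n'
)

print(template.render(enable_apify_integration=True))   # prints the import line
print(template.render(enable_apify_integration=False))  # prints an empty string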
32 changes: 32 additions & 0 deletions templates/crawler/templates/main.py
@@ -0,0 +1,32 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock

from .routes import router


async def main() -> None:
"""The crawler entry point."""
# % filter truncate(0, end='')
# % block instantiation required
# % endblock
# % endfilter

# % if cookiecutter.enable_apify_integration
async with Actor:
# % filter indent(width=8, first=False)
{{ self.instantiation() }}
# % endfilter
# % else
# % filter indent(width=4, first=False)
{{ self.instantiation() }}
# % endfilter
# % endif

await crawler.run(
[
'{{ cookiecutter.start_url }}',
]
)
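For illustration, rendering this base template with `crawler_type` set to `beautifulsoup` and Apify integration disabled (i.e. through the `main_beautifulsoup.py` child template below) should produce roughly the following module:

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = BeautifulSoupCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
    )

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )

The `truncate(0)` filter swallows the output of the `instantiation` block where it is defined, and `{{ self.instantiation() }}` re-emits it at the right indentation for the Apify and non-Apify branches.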
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_beautifulsoup.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_parsel.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
13 changes: 13 additions & 0 deletions templates/crawler/templates/main_playwright.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
request_handler=router,
headless=True,
max_requests_per_crawl=50,
)
# % endblock
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_beautifulsoup.py
@@ -0,0 +1,19 @@
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.router import Router

router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.soup.find('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': title.text if title else None,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_parsel.py
@@ -0,0 +1,19 @@
from crawlee.parsel_crawler import ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.selector.xpath('//title/text()').get()
await context.push_data(
{
'url': context.request.loaded_url,
'title': title,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_playwright.py
@@ -0,0 +1,19 @@
from crawlee.playwright_crawler import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = await context.page.query_selector('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': await title.inner_text() if title else None,
}
)

await context.enqueue_links()
62 changes: 62 additions & 0 deletions templates/crawler/{{cookiecutter.project_name}}/Dockerfile
@@ -0,0 +1,62 @@
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.12
# % else
FROM apify/actor-python:3.12
# % endif

RUN apt install -yq git && rm -rf /var/lib/apt/lists/*

# % if cookiecutter.package_manager == 'poetry'
RUN pip install -U pip setuptools \
&& pip install poetry \
&& poetry self add poetry-plugin-export

# Second, copy just poetry.lock and pyproject.toml into the Actor image,
# since those should be the only files that affect the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./
COPY poetry.lock ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Export packages from poetry.lock
&& poetry export -f requirements.txt --without-hashes | \
# Replace playwright version so that it matches whatever is pre-installed in the image
sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
# Install everything using pip (ignore dependency checks - the lockfile is correct, period)
pip install -r /dev/stdin --no-dependencies \
&& echo "All installed Python packages:" \
&& pip freeze
# % elif cookiecutter.package_manager == 'pip'
RUN pip install -U pip setuptools

# Second, copy just pyproject.toml into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image
&& pip install . playwright==$(playwright --version | cut -d ' ' -f 2) \
&& echo "All installed Python packages:" \
&& pip freeze
# % endif

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python -m compileall -q .

# Specify how to launch the source code of your Actor.
CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]
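The CMD runs the generated package as a module, which implies a `__main__.py` alongside `main.py` and `routes.py`. That file is not shown in this diff; a minimal version would presumably look like:

import asyncio

from .main import main

# Hand the async entry point to asyncio when the package is executed
# via `python -m <package>`. (Hypothetical sketch; not part of the commit.)
if __name__ == '__main__':
    asyncio.run(main())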