Add a unified 'crawler' template
janbuchar committed Sep 20, 2024
1 parent 3c3dfe8 commit 1c54b81
Showing 16 changed files with 366 additions and 23 deletions.
110 changes: 87 additions & 23 deletions src/crawlee/_cli.py
@@ -1,7 +1,7 @@
 # ruff: noqa: TRY301, FBT002, UP007
 from __future__ import annotations
 
-import os
+import json
 from pathlib import Path
 from typing import Annotated, Optional, cast
 
@@ -16,6 +16,11 @@
 
 cli = typer.Typer(no_args_is_help=True)
 
+cookiecutter_json = json.load((Path(__file__).parent.parent.parent / 'templates' / 'crawler' / 'cookiecutter.json').open())
+crawler_choices = cookiecutter_json['crawler_type']
+package_manager_choices = cookiecutter_json['package_manager']
+default_start_url = cookiecutter_json['start_url']
+
 
 @cli.callback(invoke_without_command=True)
 def callback(
@@ -64,25 +69,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
     return project_name
 
 
-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
+def _prompt_text(message: str, default: str) -> str:
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
     )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
 
-    # Prompt for template choice
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
     return cast(
         str,
         ConsoleRender().render(
             inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
             ),
         ),
     )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
+            ),
+        ),
+    )
@@ -92,26 +114,63 @@ def _prompt_for_template() -> str:
 def create(
     project_name: Optional[str] = typer.Argument(
         default=None,
+        show_default=False,
         help='The name of the project and the directory that will be created to contain it. '
         'If none is given, you will be prompted.',
     ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
+        show_default=False,
+        help='The library that will be used for crawling in your crawler. If none is given, you will be prompted.',
+    ),
+    package_manager: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='Package manager to be used in the new project. If none is given, you will be prompted.',
+    ),
+    start_url: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
+    ),
-    template: Optional[str] = typer.Option(
+    enable_apify_integration: Optional[bool] = typer.Option(
         default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
     ),
 ) -> None:
     """Bootstrap a new Crawlee project."""
     try:
         # Prompt for project name if not provided.
         project_name = _prompt_for_project_name(project_name)
 
-        # Prompt for template choice if not provided.
-        if template is None:
-            template = _prompt_for_template()
 
-        if project_name and template:
+        # Prompt for crawler_type if not provided.
+        if crawler_type is None:
+            crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+        # Prompt for package manager if not provided.
+        if package_manager is None:
+            package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+        # Prompt for start URL
+        if start_url is None:
+            start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+        # Ask about Apify integration if not explicitly configured
+        if enable_apify_integration is None:
+            enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+        if all(
+            [
+                project_name,
+                crawler_type,
+                package_manager,
+                start_url,
+                enable_apify_integration is not None,
+            ]
+        ):
             # Start the bootstrap process.
             with Progress(
                 SpinnerColumn(),
@@ -121,9 +180,14 @@ def create(
             progress.add_task(description='Bootstrapping...', total=None)
             cookiecutter(
                 template='gh:apify/crawlee-python',
-                directory=f'templates/{template}',
+                directory='templates/crawler',
                 no_input=True,
-                extra_context={'project_name': project_name},
+                extra_context={
+                    'project_name': project_name,
+                    'package_manager': package_manager,
+                    'crawler_type': crawler_type,
+                    'enable_apify_integration': enable_apify_integration,
+                },
             )
 
     typer.echo(f'Your project "{project_name}" was created.')
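With every option exposed as a flag, project creation becomes fully scriptable. Assuming the package installs its console script as `crawlee` (the entry-point name is not shown in this diff), a non-interactive run would look something like `crawlee create my-crawler --crawler-type playwright --package-manager poetry --start-url https://crawlee.dev`, with Typer falling back to the interactive prompts only for whichever options are omitted.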
12 changes: 12 additions & 0 deletions templates/crawler/cookiecutter.json
@@ -0,0 +1,12 @@
{
"project_name": "crawlee-python-beautifulsoup-project",
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright"],
"package_manager": ["poetry", "pip"],
"enable_apify_integration": false,
"start_url": "https://crawlee.dev",
"_jinja2_env_vars": {
"line_statement_prefix": "# %"
},
"_extensions": ["jinja2.ext.do"]
}
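The `_jinja2_env_vars` entry configures the Jinja environment that cookiecutter builds for rendering, so that lines beginning with `# %` are parsed as template statements rather than literal text; this is what keeps the templates below syntactically valid-looking Python. A standalone sketch of the mechanism (illustrative only, not part of the commit):

from jinja2 import Environment

# Recreate the engine settings from cookiecutter.json: a line starting
# with '# %' is treated as a Jinja statement instead of template output.
env = Environment(line_statement_prefix='# %', extensions=['jinja2.ext.do'])

template = env.from_string(
    '# % if enable_apify_integration\n'
    'from apify import Actor\n'
    '# % endif\n'
)

print(template.render(enable_apify_integration=True))   # prints the import line
print(template.render(enable_apify_integration=False))  # prints an empty string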
32 changes: 32 additions & 0 deletions templates/crawler/templates/main.py
@@ -0,0 +1,32 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock

from .routes import router


async def main() -> None:
"""The crawler entry point."""
# % filter truncate(0, end='')
# % block instantiation required
# % endblock
# % endfilter

# % if cookiecutter.enable_apify_integration
async with Actor:
# % filter indent(width=8, first=False)
{{ self.instantiation() }}
# % endfilter
# % else
# % filter indent(width=4, first=False)
{{ self.instantiation() }}
# % endfilter
# % endif

await crawler.run(
[
'{{ cookiecutter.start_url }}',
]
)
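For illustration, rendering this base template with `crawler_type` set to `beautifulsoup` and Apify integration disabled (i.e. through the `main_beautifulsoup.py` child template below) should produce roughly the following module:

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from .routes import router


async def main() -> None:
    """The crawler entry point."""
    crawler = BeautifulSoupCrawler(
        request_handler=router,
        max_requests_per_crawl=50,
    )

    await crawler.run(
        [
            'https://crawlee.dev',
        ]
    )

The `truncate(0)` filter swallows the output of the `instantiation` block where it is defined, and `{{ self.instantiation() }}` re-emits it at the right indentation for the Apify and non-Apify branches.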
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_beautifulsoup.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_parsel.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
13 changes: 13 additions & 0 deletions templates/crawler/templates/main_playwright.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
request_handler=router,
headless=True,
max_requests_per_crawl=50,
)
# % endblock
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_beautifulsoup.py
@@ -0,0 +1,19 @@
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.router import Router

router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.soup.find('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': title.text if title else None,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_parsel.py
@@ -0,0 +1,19 @@
from crawlee.parsel_crawler import ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.selector.xpath('//title/text()').get()
await context.push_data(
{
'url': context.request.loaded_url,
'title': title,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_playwright.py
@@ -0,0 +1,19 @@
from crawlee.playwright_crawler import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = await context.page.query_selector('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': await title.inner_text() if title else None,
}
)

await context.enqueue_links()
62 changes: 62 additions & 0 deletions templates/crawler/{{cookiecutter.project_name}}/Dockerfile
@@ -0,0 +1,62 @@
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.12
# % else
FROM apify/actor-python:3.12
# % endif

RUN apt install -yq git && rm -rf /var/lib/apt/lists/*

# % if cookiecutter.package_manager == 'poetry'
RUN pip install -U pip setuptools \
&& pip install poetry \
&& poetry self add poetry-plugin-export

# Second, copy just poetry.lock and pyproject.toml into the Actor image,
# since those should be the only files that affect the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./
COPY poetry.lock ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Export packages from poetry.lock
&& poetry export -f requirements.txt --without-hashes | \
# Replace playwright version so that it matches whatever is pre-installed in the image
sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
# Install everything using pip (ignore dependency checks - the lockfile is correct, period)
pip install -r /dev/stdin --no-dependencies \
&& echo "All installed Python packages:" \
&& pip freeze
# % elif cookiecutter.package_manager == 'pip'
RUN pip install -U pip setuptools

# Second, copy just pyproject.toml into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image
&& pip install . playwright==$(playwright --version | cut -d ' ' -f 2) \
&& echo "All installed Python packages:" \
&& pip freeze
# % endif

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick builds will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python -m compileall -q .

# Specify how to launch the source code of your Actor.
CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]
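The CMD runs the generated package as a module, which implies a `__main__.py` alongside `main.py` and `routes.py`. That file is not shown in this diff; a minimal version would presumably look like:

import asyncio

from .main import main

# Hand the async entry point to asyncio when the package is executed
# via `python -m <package>`. (Hypothetical sketch; not part of the commit.)
if __name__ == '__main__':
    asyncio.run(main())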