-
Notifications
You must be signed in to change notification settings - Fork 264
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
366 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"project_name": "crawlee-python-beautifulsoup-project", | ||
"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", | ||
"crawler_type": ["beautifulsoup", "parsel", "playwright"], | ||
"package_manager": ["poetry", "pip"], | ||
"enable_apify_integration": false, | ||
"start_url": "https://crawlee.dev", | ||
"_jinja2_env_vars": { | ||
"line_statement_prefix": "# %" | ||
}, | ||
"_extensions": ["jinja2.ext.do"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# % if cookiecutter.enable_apify_integration | ||
from apify import Actor | ||
# % endif | ||
# % block import required | ||
# % endblock | ||
|
||
from .routes import router | ||
|
||
|
||
async def main() -> None: | ||
"""The crawler entry point.""" | ||
# % filter truncate(0, end='') | ||
# Template note: the `instantiation` block is only declared at this point so that
# child templates can fill it. It sits inside the `truncate(0, end='')` filter and is
# rendered again below through self.instantiation(), wrapped in an `indent` filter of
# width 8 after `async with Actor:` or of width 4 otherwise.
# % block instantiation required | ||
# % endblock | ||
# % endfilter | ||
|
||
# % if cookiecutter.enable_apify_integration | ||
async with Actor: | ||
# % filter indent(width=8, first=False) | ||
{{ self.instantiation() }} | ||
# % endfilter | ||
# % else | ||
# % filter indent(width=4, first=False) | ||
{{ self.instantiation() }} | ||
# % endfilter | ||
# % endif | ||
|
||
await crawler.run( | ||
[ | ||
'{{ cookiecutter.start_url }}', | ||
] | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# % extends 'main.py' | ||
|
||
# Supplies the crawler class import for the `import` block that main.py declares as required.
# % block import | ||
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler | ||
# % endblock | ||
|
||
# Supplies the crawler construction for the required `instantiation` block; main.py renders it
# through self.instantiation() inside an `indent` filter.
# % block instantiation | ||
crawler = BeautifulSoupCrawler( | ||
request_handler=router, | ||
max_requests_per_crawl=50, | ||
) | ||
# % endblock |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# % extends 'main.py' | ||
|
||
# Supplies the crawler class import for the `import` block that main.py declares as required.
# % block import | ||
from crawlee.parsel_crawler import ParselCrawler | ||
# % endblock | ||
|
||
# Supplies the crawler construction for the required `instantiation` block; main.py renders it
# through self.instantiation() inside an `indent` filter.
# % block instantiation | ||
crawler = ParselCrawler( | ||
request_handler=router, | ||
max_requests_per_crawl=50, | ||
) | ||
# % endblock |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# % extends 'main.py' | ||
|
||
# Supplies the crawler class import for the `import` block that main.py declares as required.
# % block import | ||
from crawlee.playwright_crawler import PlaywrightCrawler | ||
# % endblock | ||
|
||
# Supplies the crawler construction for the required `instantiation` block; main.py renders it
# through self.instantiation() inside an `indent` filter.
# % block instantiation | ||
crawler = PlaywrightCrawler( | ||
request_handler=router, | ||
headless=True, | ||
max_requests_per_crawl=50, | ||
) | ||
# % endblock |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext | ||
from crawlee.router import Router | ||
|
||
router = Router[BeautifulSoupCrawlingContext]() | ||
|
||
|
||
@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
    """Default request handler registered on ``router``.

    Logs the request URL, pushes a record with the loaded URL and the text of
    the first ``<title>`` element (``None`` when ``soup.find`` returns nothing
    truthy), then calls ``context.enqueue_links()``.
    """
    context.log.info(f'Processing {context.request.url} ...')

    title_tag = context.soup.find('title')
    page_title = title_tag.text if title_tag else None

    record = {
        'url': context.request.loaded_url,
        'title': page_title,
    }
    await context.push_data(record)

    await context.enqueue_links()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from crawlee.parsel_crawler import ParselCrawlingContext | ||
from crawlee.router import Router | ||
|
||
router = Router[ParselCrawlingContext]() | ||
|
||
|
||
@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
    """Default request handler registered on ``router``.

    Logs the request URL, pushes a record with the loaded URL and the first
    match of the ``//title/text()`` XPath query (whatever ``.get()`` returns),
    then calls ``context.enqueue_links()``.
    """
    context.log.info(f'Processing {context.request.url} ...')

    page_title = context.selector.xpath('//title/text()').get()

    record = {
        'url': context.request.loaded_url,
        'title': page_title,
    }
    await context.push_data(record)

    await context.enqueue_links()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from crawlee.playwright_crawler import PlaywrightCrawlingContext | ||
from crawlee.router import Router | ||
|
||
router = Router[PlaywrightCrawlingContext]() | ||
|
||
|
||
@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    """Default request handler registered on ``router``.

    Logs the request URL, pushes a record with the loaded URL and the inner
    text of the element matched by the ``title`` selector (``None`` when the
    query returns nothing truthy), then calls ``context.enqueue_links()``.
    """
    context.log.info(f'Processing {context.request.url} ...')

    title_handle = await context.page.query_selector('title')
    page_title = await title_handle.inner_text() if title_handle else None

    record = {
        'url': context.request.loaded_url,
        'title': page_title,
    }
    await context.push_data(record)

    await context.enqueue_links()
62 changes: 62 additions & 0 deletions
62
templates/crawler/{{cookiecutter.project_name}}/Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# First, specify the base Docker image. | ||
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/. | ||
# You can also use any other image from Docker Hub. | ||
# % if cookiecutter.crawler_type == 'playwright' | ||
FROM apify/actor-python-playwright:3.12 | ||
# % else | ||
FROM apify/actor-python:3.12 | ||
# % endif | ||
|
||
# Install git. The package index is refreshed first: without it, `install` fails with
# "Unable to locate package" on an image that carries no cached package lists (this very
# step deletes them again at the end to keep the layer small).
# `apt-get` is used instead of `apt` because its command-line interface is the one meant for scripts.
RUN apt-get update \
    && apt-get install -yq git \
    && rm -rf /var/lib/apt/lists/*
|
||
# % if cookiecutter.package_manager == 'poetry' | ||
RUN pip install -U pip setuptools \ | ||
&& pip install poetry \ | ||
&& poetry self add poetry-plugin-export | ||
|
||
# Second, copy just poetry.lock and pyproject.toml into the Actor image, | ||
# since those should be the only files that affect the dependency install in the next step, | ||
# in order to speed up the build | ||
COPY pyproject.toml ./ | ||
COPY poetry.lock ./ | ||
|
||
# Install the dependencies | ||
# The step prints the Python version, converts poetry.lock to requirements format, rewrites a
# pinned `playwright==` line to the version reported by the `playwright` CLI, installs the
# result with pip and finally lists the installed packages.
# NOTE(review): the export | sed | pip pipeline runs without `pipefail`, so presumably only the
# exit status of the final `pip install` decides whether this step fails - confirm.
RUN echo "Python version:" \ | ||
&& python --version \ | ||
&& echo "Installing dependencies:" \ | ||
# Export packages from poetry.lock | ||
&& poetry export -f requirements.txt --without-hashes | \ | ||
# Replace playwright version so that it matches whatever is pre-installed in the image | ||
sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \ | ||
# Install everything using pip (ignore dependency checks - the lockfile is correct, period) | ||
pip install -r /dev/stdin --no-dependencies \ | ||
&& echo "All installed Python packages:" \ | ||
&& pip freeze | ||
# % elif cookiecutter.package_manager == 'pip' | ||
RUN pip install -U pip setuptools | ||
|
||
# Second, copy just pyproject.toml into the Actor image, | ||
# since it should be the only file that affects the dependency install in the next step, | ||
# in order to speed up the build | ||
COPY pyproject.toml ./ | ||
|
||
# Install the dependencies | ||
# Print the Python version, install the project with pip and list the installed packages.
# Each command is chained with `&&`: without it, `pip install ...` would be passed to the
# preceding `echo` as arguments and nothing would be installed.
RUN echo "Python version:" \
    && python --version \
    && echo "Installing dependencies:" \
    # % if cookiecutter.crawler_type == 'playwright'
    # Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image
    && pip install . playwright==$(playwright --version | cut -d ' ' -f 2) \
    # % else
    # Install everything using pip (no playwright pin: the `playwright` CLI is only queried on the playwright image)
    && pip install . \
    # % endif
    && echo "All installed Python packages:" \
    && pip freeze
# % endif | ||
|
||
# Next, copy the remaining files and directories with the source code. | ||
# Since we do this after installing the dependencies, quick build will be really fast | ||
# for most source file changes. | ||
COPY . ./ | ||
|
||
# Use compileall to ensure the runnability of the Actor Python code. | ||
RUN python -m compileall -q . | ||
|
||
# Specify how to launch the source code of your Actor. | ||
CMD ["python", "-m", "{{ cookiecutter.__package_name }}"] |
Oops, something went wrong.