Skip to content

Commit

Permalink
feat: add ignoreIframes opt-out from the Cheerio iframe expansion
Browse files Browse the repository at this point in the history
  • Loading branch information
barjin committed Jun 27, 2024
1 parent f0daf44 commit 5567411
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 6 deletions.
8 changes: 8 additions & 0 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@ export interface BrowserCrawlerOptions<
* By default, they are expanded automatically. Use this option to disable this behavior.
*/
ignoreShadowRoots?: boolean;

/**
* Whether to ignore `iframe` elements when processing the page content via the `parseWithCheerio` helper.
* By default, `iframe` contents are expanded automatically. Set this option to `true` to disable this behavior.
*/
ignoreIframes?: boolean;
}

/**
Expand Down Expand Up @@ -343,6 +349,7 @@ export abstract class BrowserCrawler<
useSessionPool: ow.optional.boolean,
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
ignoreShadowRoots: ow.optional.boolean,
ignoreIframes: ow.optional.boolean,
};

/**
Expand Down Expand Up @@ -372,6 +379,7 @@ export abstract class BrowserCrawler<
handleFailedRequestFunction,
headless,
ignoreShadowRoots,
ignoreIframes,
...basicCrawlerOptions
} = options;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -597,10 +597,14 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}
* @param page Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
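* @param ignoreShadowRoots Set to `true` to skip the automatic expansion of shadow roots (defaults to `false`).
* @param ignoreIframes Set to `true` to skip the automatic expansion of `iframe` elements (defaults to `false`).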
*/
export async function parseWithCheerio(page: Page, ignoreShadowRoots = false): Promise<CheerioRoot> {
export async function parseWithCheerio(
page: Page,
ignoreShadowRoots = false,
ignoreIframes = false,
): Promise<CheerioRoot> {
ow(page, ow.object.validate(validators.browserPage));

if (page.frames().length > 1) {
if (page.frames().length > 1 && !ignoreIframes) {
const frames = await page.$$('iframe');

await Promise.all(
Expand Down Expand Up @@ -858,7 +862,7 @@ export function registerUtilsToContext(
await context.waitForSelector(selector, timeoutMs);
}

return parseWithCheerio(context.page, crawlerOptions.ignoreShadowRoots);
return parseWithCheerio(context.page, crawlerOptions.ignoreShadowRoots, crawlerOptions.ignoreIframes);
};
context.infiniteScroll = async (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options);
context.saveSnapshot = async (options?: SaveSnapshotOptions) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,14 @@ export async function injectJQuery(page: Page, options?: { surviveNavigations?:
* @param page Puppeteer [`Page`](https://pptr.dev/api/puppeteer.page) object.
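* @param ignoreShadowRoots Set to `true` to skip the automatic expansion of shadow roots (defaults to `false`).
* @param ignoreIframes Set to `true` to skip the automatic expansion of `iframe` elements (defaults to `false`).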
*/
export async function parseWithCheerio(page: Page, ignoreShadowRoots = false): Promise<CheerioRoot> {
export async function parseWithCheerio(
page: Page,
ignoreShadowRoots = false,
ignoreIframes = false,
): Promise<CheerioRoot> {
ow(page, ow.object.validate(validators.browserPage));

if (page.frames().length > 1) {
if (page.frames().length > 1 && !ignoreIframes) {
const frames = await page.$$('iframe');

await Promise.all(
Expand Down Expand Up @@ -1068,7 +1072,7 @@ export function registerUtilsToContext(
await context.waitForSelector(selector, timeoutMs);
}

return parseWithCheerio(context.page, crawlerOptions.ignoreShadowRoots);
return parseWithCheerio(context.page, crawlerOptions.ignoreShadowRoots, crawlerOptions.ignoreIframes);
};
context.enqueueLinksByClickingElements = async (
options: Omit<EnqueueLinksByClickingElementsOptions, 'page' | 'requestQueue'>,
Expand Down

0 comments on commit 5567411

Please sign in to comment.