Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Decouple HTTP client #2661

Open
wants to merge 25 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
a995220
Introduce BaseHttpClient interface
janbuchar Sep 6, 2024
3c638ef
Add GotScrapingHttpClient
janbuchar Sep 6, 2024
34043cb
Use the http client in BasicCrawler
janbuchar Sep 6, 2024
e3846bc
Finalize using HttpClient in send_request
janbuchar Sep 11, 2024
374e973
Lint
janbuchar Sep 11, 2024
12b7eb5
Format
janbuchar Sep 11, 2024
fb01709
Simplify types in cookie-related code
janbuchar Sep 25, 2024
8e3709a
Merge remote-tracking branch 'origin/master' into decouple-http-client
janbuchar Sep 25, 2024
07da1f9
Decouple got-scraping from HttpCrawler
janbuchar Sep 25, 2024
c037552
Adapt FileDownload class
janbuchar Sep 26, 2024
936d064
Add httpClient to validation schema
janbuchar Sep 26, 2024
1f42f86
Lint
janbuchar Sep 26, 2024
d80641e
Fix type of context.sendRequest
janbuchar Sep 27, 2024
448b1ec
Make BasicHttpClient an interface
janbuchar Sep 30, 2024
66c0dae
Remove niche properties from the response type
janbuchar Sep 30, 2024
9330017
Adjust cookie jar type
janbuchar Oct 1, 2024
b97f5a6
Extract sendRequest from BasicCrawler
janbuchar Oct 1, 2024
d0a1863
Handle searchParams before delegating to http client
janbuchar Oct 1, 2024
c72e81a
Handle json and form directly in sendRequest
janbuchar Oct 1, 2024
8034dab
Refactor sendRequest, handle username/password
janbuchar Oct 2, 2024
0e62fd2
Unused import
janbuchar Oct 2, 2024
12b94c6
Mistake
janbuchar Oct 2, 2024
2ccd065
Merge remote-tracking branch 'origin/master' into decouple-http-client
janbuchar Oct 4, 2024
e97084f
Lint
janbuchar Oct 4, 2024
7f22ad5
Add missing docblocks
janbuchar Oct 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ import type {
StatisticState,
StatisticsOptions,
LoadedContext,
BaseHttpClient,
HttpResponse,
HttpRequest,
ResponseTypes,
} from '@crawlee/core';
import {
AutoscaledPool,
Expand All @@ -49,9 +53,10 @@ import {
SessionPool,
Statistics,
validators,
GotScrapingHttpClient,
} from '@crawlee/core';
import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS, gotScraping } from '@crawlee/utils';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
import { stringify } from 'csv-stringify/sync';
import { ensureDir, writeFile, writeJSON } from 'fs-extra';
// @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood
Expand Down Expand Up @@ -351,6 +356,8 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
* whether to output them to the Key-Value store.
*/
statisticsOptions?: StatisticsOptions;

httpClient?: BaseHttpClient;
}

/**
Expand Down Expand Up @@ -496,6 +503,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected crawlingContexts = new Map<string, Context>();
protected autoscaledPoolOptions: AutoscaledPoolOptions;
protected events: EventManager;
protected httpClient: BaseHttpClient;
protected retryOnBlocked: boolean;
private _closeEvents?: boolean;

Expand Down Expand Up @@ -530,6 +538,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
statusMessageCallback: ow.optional.function,

retryOnBlocked: ow.optional.boolean,
httpClient: ow.optional.object,

// AutoscaledPool shorthands
minConcurrency: ow.optional.number,
Expand Down Expand Up @@ -592,10 +601,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
statusMessageCallback,

statisticsOptions,
httpClient,
} = options;

this.requestList = requestList;
this.requestQueue = requestQueue;
this.httpClient = httpClient ?? new GotScrapingHttpClient();
this.log = log;
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
Expand Down Expand Up @@ -1270,7 +1281,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
addRequests: this.addRequests.bind(this),
pushData: this.pushData.bind(this),
useState: this.useState.bind(this),
sendRequest: async (overrideOptions?: OptionsInit) => {
sendRequest: async <TResponseType extends keyof ResponseTypes = 'text'>(
overrideOptions?: Partial<HttpRequest<TResponseType>>,
): Promise<HttpResponse<TResponseType>> => {
const cookieJar = session
? {
getCookieString: async (url: string) => session!.getCookieString(url),
Expand All @@ -1279,19 +1292,14 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}
: overrideOptions?.cookieJar;

return gotScraping({
return this.httpClient.sendRequest<TResponseType>({
url: request!.url,
method: request!.method as Method, // Narrow type to omit CONNECT
body: request!.payload,
headers: request!.headers,
proxyUrl: crawlingContext.proxyInfo?.url,
sessionToken: session,
responseType: 'text',
...overrideOptions,
retry: {
limit: 0,
...overrideOptions?.retry,
},
cookieJar,
});
},
Expand Down
15 changes: 8 additions & 7 deletions packages/core/src/cookie_utils.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import type { IncomingMessage } from 'node:http';

import type { BrowserLikeResponse, Dictionary, Cookie as CookieObject } from '@crawlee/types';
import type { Cookie as CookieObject } from '@crawlee/types';
import { Cookie, CookieJar } from 'tough-cookie';

import { log } from './log';
import { CookieParseError } from './session_pool/errors';

/**
 * Minimal shape of a response object accepted by `getCookiesFromResponse`.
 *
 * Both members are optional, and each may be supplied either as a plain value
 * or as a zero-argument function that returns the value.
 */
export interface ResponseLike {
/** Response URL, or a function returning it. */
url?: string | (() => string);
/** Response headers keyed by header name, or a function returning them. */
headers?: Record<string, string | string[] | undefined> | (() => Record<string, string | string[] | undefined>);
}

/**
* @internal
*/
export function getCookiesFromResponse(
response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary<string | string[]> },
): Cookie[] {
export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
const cookieHeader = headers['set-cookie'] || '';
const cookieHeader = headers?.['set-cookie'] || '';

try {
return Array.isArray(cookieHeader)
Expand Down
177 changes: 177 additions & 0 deletions packages/core/src/http_clients/base_http_client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import type { Readable } from 'stream';

import type { FormDataLike } from './form_data_like';

/** Keys that can each carry their own limit in the detailed form of {@link Timeout}. */
type TimeoutKey = 'lookup' | 'connect' | 'secureConnect' | 'socket' | 'send' | 'response';

/**
 * Timeout configuration of a request: either one number per {@link TimeoutKey},
 * or a single `request` number.
 *
 * NOTE(review): values are presumably milliseconds, as in got's `timeout` option - confirm.
 */
type Timeout = Record<TimeoutKey, number> | { request: number };

/** HTTP methods in their canonical upper-case spelling. */
type UppercaseMethod = 'GET' | 'POST' | 'PUT' | 'PATCH' | 'HEAD' | 'DELETE' | 'OPTIONS' | 'TRACE';

/**
 * HTTP method accepted in a request, in either all-upper-case or all-lower-case spelling.
 * `CONNECT` is intentionally not part of the list.
 */
type Method = UppercaseMethod | Lowercase<UppercaseMethod>;

/**
 * Maps every supported `responseType` value to the type of the response body it produces.
 * The keys of this interface constrain the `TResponseType` parameters used across the HTTP client types.
 */
export interface ResponseTypes {
    /** Parsed JSON; the shape is not known statically, so callers must narrow it. */
    json: unknown;
    /** Body as a string. */
    text: string;
    /** Body as a `Buffer`. */
    buffer: Buffer;
}

/**
 * Transfer progress snapshot, exposed as `downloadProgress` and `uploadProgress`
 * on `StreamingHttpResponse`.
 *
 * NOTE(review): presumably mirrors got's `Progress` (fraction in `percent`, byte counts in
 * `transferred`/`total`) - confirm against the client implementation.
 */
interface Progress {
percent: number;
transferred: number;
// Optional - may be absent when the total size is not available.
total?: number;
}

/**
 * Callback-style cookie jar.
 *
 * Each member is an intersection of two call signatures: one that takes an `options`
 * object before the callback and one that takes the callback directly.
 *
 * NOTE(review): the shape presumably matches `CookieJar` from `tough-cookie` - confirm.
 */
interface ToughCookieJar {
// Reads the cookie string for a URL and reports it (or an error) through the callback.
getCookieString: ((
currentUrl: string,
options: Record<string, unknown>,
callback: (error: Error | null, cookies: string) => void,
) => void) &
((url: string, callback: (error: Error | null, cookieHeader: string) => void) => void);
// Stores a cookie for a URL and reports the outcome through the callback.
setCookie: ((
cookieOrString: unknown,
currentUrl: string,
options: Record<string, unknown>,
callback: (error: Error | null, cookie: unknown) => void,
) => void) &
((rawCookie: string, url: string, callback: (error: Error | null, result: unknown) => void) => void);
}

/**
 * Promise-based cookie jar: the same two operations as `ToughCookieJar`,
 * but each returns a promise instead of taking a callback.
 */
interface PromiseCookieJar {
/** Resolves with the cookie string for `url`. */
getCookieString: (url: string) => Promise<string>;
/** Stores `rawCookie` for `url`; the resolved value is not typed. */
setCookie: (rawCookie: string, url: string) => Promise<unknown>;
}

/**
 * Header map keyed by header name. A value may be a single string, a list of strings,
 * or `undefined`.
 */
type SimpleHeaders = Record<string, string | string[] | undefined>;

// Omitted (https://github.com/sindresorhus/got/blob/main/documentation/2-options.md):
// - decompress,
// - resolveBodyOnly,
// - allowGetBody,
// - dnsLookup,
// - dnsCache,
// - dnsLookupIpVersion,
// - retry,
// - hooks,
// - parseJson,
// - stringifyJson,
// - request,
// - cache,
// - cacheOptions,
// - http2
// - https
// - agent
// - localAddress
// - createConnection
// - pagination
// - setHost
// - maxHeaderSize
// - methodRewriting
// - enableUnixSockets
// - context
/**
 * Description of a single HTTP request handed to a `BaseHttpClient`.
 *
 * @typeParam TResponseType - Key of `ResponseTypes` selecting how the response body is typed;
 * it is carried by the `responseType` member and defaults to `'text'`.
 */
export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'> {
    // NOTE(review): remark left on this declaration in the pull-request discussion:
    // "We may not want to keep this. If people use responseType in current sendRequest, the type is
    // incorrect anyway. And I'd guess that using e.g., gotScraping({...}).json() is more prevalent anyway."

    [k: string]: unknown; // TODO BC with got - remove in 4.0

    // Target and payload of the request.
    url: string | URL;
    method?: Method;
    searchParams?: string | URLSearchParams | Record<string, string | number | boolean | null | undefined>;
    signal?: AbortSignal;
    headers?: SimpleHeaders;
    body?: string | Buffer | Readable | Generator | AsyncGenerator | FormDataLike;
    form?: Record<string, string>;
    json?: unknown;

    // Credentials.
    username?: string;
    password?: string;

    // Cookie handling and redirects; the jar may be callback-based or promise-based.
    cookieJar?: ToughCookieJar | PromiseCookieJar;
    followRedirect?: boolean | ((response: any) => boolean); // TODO BC with got - specify type better in 4.0
    maxRedirects?: number;

    // Any subset of the timeout keys may be given.
    timeout?: Partial<Timeout>;

    // Response handling; `responseType` determines the type of `HttpResponse.body`.
    encoding?: BufferEncoding;
    responseType?: TResponseType;
    throwHttpErrors?: boolean;

    // from got-scraping Context
    proxyUrl?: string;
    headerGeneratorOptions?: Record<string, unknown>;
    useHeaderGenerator?: boolean;
    headerGenerator?: {
        getHeaders: (options: Record<string, unknown>) => Record<string, string>;
    };
    insecureHTTPParser?: boolean;
    sessionToken?: object;
}

/**
 * Response metadata without a body. It is the common base of `HttpResponse` and
 * `StreamingHttpResponse`, and is also the shape passed to a `RedirectHandler`.
 */
interface BaseHttpResponseData {
// NOTE(review): presumably the chain of URLs followed before reaching `url` - confirm.
redirectUrls: URL[];
url: string;

ip?: string;
statusCode: number;
statusMessage?: string;

headers: SimpleHeaders;
trailers: SimpleHeaders; // Populated after the whole message is processed

complete: boolean;
httpVersion: string;

rawHeaders: string[];
rawTrailers: string[];
}

/**
 * Response metadata together with the request it belongs to, but without a body.
 * Shared base of `HttpResponse` and `StreamingHttpResponse`.
 */
interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = keyof ResponseTypes>
extends BaseHttpResponseData {
/** The request this response belongs to. */
request: HttpRequest<TResponseType>;
}

/**
 * Response returned by `BaseHttpClient.sendRequest`.
 * The type of `body` is looked up in `ResponseTypes` using `TResponseType`.
 */
export interface HttpResponse<TResponseType extends keyof ResponseTypes = keyof ResponseTypes>
extends HttpResponseWithoutBody<TResponseType> {
[k: string]: any; // TODO BC with got - remove in 4.0

/** Response body, typed according to `TResponseType`. */
body: ResponseTypes[TResponseType];
}

/**
 * Response returned by `BaseHttpClient.stream`: response metadata plus a readable
 * stream and read-only progress information, instead of a `body`.
 */
export interface StreamingHttpResponse extends HttpResponseWithoutBody {
/** Readable stream of the response. */
stream: Readable;
readonly downloadProgress: Progress;
readonly uploadProgress: Progress;
}

/**
 * Callback accepted by `BaseHttpClient.stream` as `onRedirect`.
 *
 * It receives the metadata of the redirect response and an object describing the
 * follow-up request (`url` and `headers`).
 *
 * NOTE(review): presumably the handler may mutate `updatedRequest` to alter the
 * follow-up request - confirm against the client implementation.
 */
export type RedirectHandler = (
redirectResponse: BaseHttpResponseData,
updatedRequest: { url?: string | URL; headers: SimpleHeaders },
) => void;

/**
 * Contract of a pluggable HTTP client.
 *
 * `BasicCrawler` accepts an implementation through its `httpClient` option and falls
 * back to `GotScrapingHttpClient` when none is supplied.
 */
export abstract class BaseHttpClient {
    /**
     * Performs the request and resolves with the complete response.
     *
     * @typeParam TResponseType - Key of `ResponseTypes` that determines the type of the response `body`.
     * @param request - Description of the request to send.
     * @returns A promise of the response, including its body.
     */
    abstract sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
        request: HttpRequest<TResponseType>,
    ): Promise<HttpResponse<TResponseType>>;

    /**
     * Performs the request and resolves with a response that exposes a readable stream instead of a body.
     *
     * @param request - Description of the request to send.
     * @param onRedirect - Optional callback that receives the redirect response and the follow-up request data.
     * @returns A promise of the streaming response.
     */
    abstract stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
}
67 changes: 67 additions & 0 deletions packages/core/src/http_clients/form_data_like.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* This is copied from https://github.com/octet-stream/form-data-encoder
*/

/**
 * Minimal shape of a `File` object, used as the non-string variant of `FormDataEntryValue`.
 */
interface FileLike {
/**
* Name of the file referenced by the File object.
*/
readonly name: string;
/**
* Returns the media type ([`MIME`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types)) of the file represented by a `File` object.
*/
readonly type: string;
/**
* Size of the file parts in bytes
*/
readonly size: number;
/**
* The last modified date of the file as the number of milliseconds since the Unix epoch (January 1, 1970 at midnight). Files without a known last modified date return the current date.
*/
readonly lastModified: number;
/**
* Returns a [`ReadableStream`](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStream) which upon reading returns the data contained within the [`File`](https://developer.mozilla.org/en-US/docs/Web/API/File).
*/
stream(): ReadableStream<Uint8Array> | AsyncIterable<Uint8Array>;
/** Optional string tag of the object. */
readonly [Symbol.toStringTag]?: string;
}

/**
* A `string` or a file-like object (`FileLike`) that represents a single value from a set of `FormData` key-value pairs.
*/
type FormDataEntryValue = string | FileLike;
/**
* This interface reflects the minimal shape of `FormData`.
*/
export interface FormDataLike {
/**
* Appends a new value onto an existing key inside a FormData object,
* or adds the key if it does not already exist.
*
* The difference between `set()` and `append()` is that if the specified key already exists, `set()` will overwrite all existing values with the new one, whereas `append()` will append the new value onto the end of the existing set of values.
*
* @param name The name of the field whose data is contained in `value`.
* @param value The field's value. This can be [`Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)
* or [`File`](https://developer.mozilla.org/en-US/docs/Web/API/File). If none of these are specified the value is converted to a string.
* @param fileName The filename reported to the server, when a Blob or File is passed as the second parameter. The default filename for Blob objects is "blob". The default filename for File objects is the file's filename.
*/
append(name: string, value: unknown, fileName?: string): void;
/**
* Returns all the values associated with a given key from within a `FormData` object.
*
* @param {string} name A name of the value you want to retrieve.
*
* @returns An array of `FormDataEntryValue` whose key matches the value passed in the `name` parameter. If the key doesn't exist, the method returns an empty list.
*/
getAll(name: string): FormDataEntryValue[];
/**
* Returns an [`iterator`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Iteration_protocols) allowing to go through the `FormData` key/value pairs.
* The key of each pair is a string; the value is a [`FormDataEntryValue`](https://developer.mozilla.org/en-US/docs/Web/API/FormDataEntryValue).
*/
entries(): IterableIterator<[string, FormDataEntryValue]>;
/**
* An alias for FormDataLike#entries()
*/
[Symbol.iterator](): IterableIterator<[string, FormDataEntryValue]>;
/** Optional string tag of the object. */
readonly [Symbol.toStringTag]?: string;
}
Loading