Skip to content

Commit

Permalink
FB + other dynamic site capture improvements (#249)
Browse files Browse the repository at this point in the history
- update to wabac.js 2.19.6 for improved FB rewriting rules
- update to browsertrix-behaviors 0.6.4 to fix async fetch, even when
  behaviors aren't running on autopilot
- don't truncate POST bodies if the URL is handled by custom rewriting rules
- bump version to 0.12.6
  • Loading branch information
ikreymer committed Aug 13, 2024
1 parent e4932ff commit 63d247a
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 36 deletions.
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@webrecorder/archivewebpage",
"productName": "ArchiveWeb.page",
"version": "0.12.5",
"version": "0.12.6",
"main": "index.js",
"description": "Create Web Archives directly in your browser",
"repository": "https://github.com/webrecorder/archiveweb.page",
Expand All @@ -11,9 +11,9 @@
"@fortawesome/fontawesome-free": "^5.13.0",
"@ipld/car": "^5.3.1",
"@webrecorder/awp-sw": "^0.4.4",
"@webrecorder/wabac": "^2.19.4",
"@webrecorder/wabac": "^2.19.6",
"auto-js-ipfs": "^2.3.0",
"browsertrix-behaviors": "^0.6.0",
"browsertrix-behaviors": "^0.6.4",
"btoa": "^1.2.1",
"bulma": "^0.9.3",
"client-zip": "^2.2.2",
Expand Down
7 changes: 3 additions & 4 deletions src/recorder.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { RequestResponseInfo } from "./requestresponseinfo.js";

import { baseRules as baseDSRules, htmlRules as htmlDSRules } from "@webrecorder/wabac/src/rewrite";
import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite";
import { rewriteDASH, rewriteHLS } from "@webrecorder/wabac/src/rewrite/rewriteVideo";
import { Buffer } from "buffer";

Expand Down Expand Up @@ -1009,10 +1009,9 @@ class Recorder {
case "text/javascript":
case "application/javascript":
case "application/x-javascript": {
const rules = ct === "text/html" ? htmlDSRules : baseDSRules;
const rw = rules.getRewriter(url);
const rw = getCustomRewriter(url, ct === "text/html");

if (rw !== rules.defaultRewriter) {
if (rw) {
string = payload.toString();
newString = rw.rewrite(string, {live: true, save: extraOpts});
}
Expand Down
26 changes: 15 additions & 11 deletions src/requestresponseinfo.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"use strict";

import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite";
import { getStatusText } from "@webrecorder/wabac/src/utils";

import { postToGetUrl } from "warcio";
Expand Down Expand Up @@ -191,20 +192,23 @@ class RequestResponseInfo
postData: this.postData || "",
};
if (postToGetUrl(convData)) {
//this.requestBody = convData.requestBody;
// truncate to avoid extra long URLs
try {
const url = new URL(convData.url);
for (const [key, value] of url.searchParams.entries()) {
if (value && value.length > MAX_ARG_LEN) {
url.searchParams.set(key, value.slice(0, MAX_ARG_LEN));
// if the URL is handled by a custom rewriter, keep it as is; otherwise truncate to avoid extra-long URLs
if (getCustomRewriter(this.url, mime === "text/html")) {
this.url = convData.url;
} else {
try {
const url = new URL(convData.url);
for (const [key, value] of url.searchParams.entries()) {
if (value && value.length > MAX_ARG_LEN) {
url.searchParams.set(key, value.slice(0, MAX_ARG_LEN));
}
}
convData.url = url.href;
} catch (e) {
//ignore
}
convData.url = url.href;
} catch (e) {
//ignore
this.url = convData.url.slice(0, MAX_URL_LENGTH);
}
this.url = convData.url.slice(0, MAX_URL_LENGTH);
}
}

Expand Down
43 changes: 25 additions & 18 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -984,21 +984,21 @@
uuid "^9.0.0"
warcio "^2.2.1"

"@webrecorder/wabac@^2.17.3", "@webrecorder/wabac@^2.18.1", "@webrecorder/wabac@^2.19.4":
version "2.19.4"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe"
integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww==
"@webrecorder/wabac@^2.17.3", "@webrecorder/wabac@^2.18.1", "@webrecorder/wabac@^2.19.6":
version "2.19.6"
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.6.tgz#775078cc752eb29a15fc2835a1484c32f85661cb"
integrity sha512-3DzrASJBfwaFjtpYhISaYidYD8JgOAeGqx1ciFKSWo7cXnTQWtfyNGQfJoPLunfuzZvKBoxakEdcuE+Hl8q4rA==
dependencies:
"@peculiar/asn1-ecc" "^2.3.4"
"@peculiar/asn1-schema" "^2.3.3"
"@peculiar/x509" "^1.9.2"
"@webrecorder/wombat" "^3.7.11"
"@webrecorder/wombat" "^3.7.12"
acorn "^8.10.0"
auto-js-ipfs "^2.1.1"
base64-js "^1.5.1"
brotli "^1.3.3"
buffer "^6.0.3"
fast-xml-parser "^4.4.0"
fast-xml-parser "^4.4.1"
hash-wasm "^4.9.0"
http-link-header "^1.1.3"
http-status-codes "^2.1.4"
Expand All @@ -1013,10 +1013,10 @@
stream-browserify "^3.0.0"
warcio "^2.2.1"

"@webrecorder/wombat@^3.7.11":
version "3.7.11"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96"
integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA==
"@webrecorder/wombat@^3.7.12":
version "3.7.12"
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.12.tgz#b2328ebfcea4f8acafdf1f81dea1d10a576b0357"
integrity sha512-MqSUxzSiapTGuoPeh7FNIe6ZX//KiCIiSydByzFqujin/e1nG7pmw7x2JgGeyWPYH6hYN/RxrpBcqJRBmYtHRg==
dependencies:
warcio "^2.2.0"

Expand Down Expand Up @@ -1415,10 +1415,12 @@ browserslist@^4.21.10:
node-releases "^2.0.14"
update-browserslist-db "^1.0.13"

browsertrix-behaviors@^0.6.0:
version "0.6.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.0.tgz#e16345e4b414b18e6441548d517d01b4316f744e"
integrity sha512-BdfEPHmDjhEIFrn80UKnwGT6HRgnmq2shNybu8BEfAHJQsqZdvP/VVKWvNGnWML1jjUKiwtvtkdFhtHedFQkzA==
browsertrix-behaviors@^0.6.4:
version "0.6.4"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.4.tgz#33fe9a433108f2faac3a03af91aff940433e5b87"
integrity sha512-xaiO/VqqeSd5FnAkIKQINxC/q3Med33Lqw3LGxD4NBtkcMSh1Anz/+830QHVlQbp08nIPUXYV96hDrx1Uv0PmQ==
dependencies:
query-selector-shadow-dom "^1.0.1"

btoa@^1.2.1:
version "1.2.1"
Expand Down Expand Up @@ -2537,10 +2539,10 @@ fast-uri@^2.3.0:
resolved "https://registry.yarnpkg.com/fast-uri/-/fast-uri-2.3.0.tgz#bdae493942483d299e7285dcb4627767d42e2793"
integrity sha512-eel5UKGn369gGEWOqBShmFJWfq/xSJvsgDzgLYC845GneayWvXBf0lJCBn5qTABfewy1ZDPoaR5OZCP+kssfuw==

fast-xml-parser@^4.4.0:
version "4.4.0"
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501"
integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg==
fast-xml-parser@^4.4.1:
version "4.4.1"
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.1.tgz#86dbf3f18edf8739326447bcaac31b4ae7f6514f"
integrity sha512-xkjOecfnKGkSsOwtZ5Pz7Us/T6mrbPQrq0nh+aCO5V9nk5NLWmasAHumTKjiPJPWANe+kAZ84Jc8ooJkzZ88Sw==
dependencies:
strnum "^1.0.5"

Expand Down Expand Up @@ -4611,6 +4613,11 @@ [email protected]:
dependencies:
side-channel "^1.0.4"

query-selector-shadow-dom@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/query-selector-shadow-dom/-/query-selector-shadow-dom-1.0.1.tgz#1c7b0058eff4881ac44f45d8f84ede32e9a2f349"
integrity sha512-lT5yCqEBgfoMYpf3F2xQRK7zEr1rhIIZuceDK6+xRkJQ4NMbHTwXqk4NkwDwQMNqXgG9r9fyHnzwNVs6zV5KRw==

queue-microtask@^1.2.2:
version "1.2.3"
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
Expand Down

0 comments on commit 63d247a

Please sign in to comment.