Prevent empty commits if files did not change #809

Open · wants to merge 16 commits into base: main
packages/hub/package.json (2 changes: 1 addition & 1 deletion)

@@ -18,7 +18,7 @@
 		}
 	},
 	"browser": {
-		"./src/utils/sha256-node.ts": false,
+		"./src/utils/sha-node.ts": false,
 		"./src/utils/FileBlob.ts": false,
 		"./dist/index.js": "./dist/browser/index.js",
 		"./dist/index.mjs": "./dist/browser/index.mjs"
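For context: the `browser` field in package.json remaps module paths for bundlers targeting browsers, and mapping a path to `false` excludes it from browser builds. The rename from `sha256-node.ts` to `sha-node.ts` presumably reflects that the Node-specific hashing module now provides SHA-1 (added in this PR) alongside SHA-256, and it must still be kept out of browser bundles.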
packages/hub/src/lib/commit.spec.ts (93 changes: 92 additions & 1 deletion)

@@ -2,7 +2,7 @@ import { assert, it, describe } from "vitest";
 
 import { TEST_HUB_URL, TEST_ACCESS_TOKEN, TEST_USER } from "../test/consts";
 import type { RepoId } from "../types/public";
-import type { CommitFile } from "./commit";
+import type { CommitFile, CommitOperation } from "./commit";
 import { commit } from "./commit";
 import { createRepo } from "./create-repo";
 import { deleteRepo } from "./delete-repo";

@@ -127,6 +127,97 @@ size ${lfsContent.length}
 		}
 	}, 60_000);
 
+	it("should not commit if nothing to add/update", async function () {
+		const tokenizerJsonUrl = new URL(
+			"https://huggingface.co/spaces/aschen/push-model-from-web/raw/main/mobilenet/model.json"
+		);
+		const repoName = `${TEST_USER}/TEST-${insecureRandomString()}`;
+		const repo: RepoId = {
+			name: repoName,
+			type: "model",
+		};
+
+		await createRepo({
+			credentials: {
+				accessToken: TEST_ACCESS_TOKEN,
+			},
+			hubUrl: TEST_HUB_URL,
+			repo,
+			license: "mit",
+		});
+
+		try {
+			const readme1 = await downloadFile({ repo, path: "README.md", hubUrl: TEST_HUB_URL });
+			assert.strictEqual(readme1?.status, 200);
+
+			const nodeOperation: CommitFile[] = isFrontend
+				? []
+				: [
+						{
+							operation: "addOrUpdate",
+							path: "tsconfig.json",
+							content: (await import("node:url")).pathToFileURL("./tsconfig.json") as URL,
+						},
+					];
+
+			const operations: CommitOperation[] = [
+				{
+					operation: "addOrUpdate",
+					content: new Blob(["This is me"]),
+					path: "test.txt",
+				},
+				{
+					operation: "addOrUpdate",
+					content: new Blob([lfsContent]),
+					path: "test.lfs.txt",
+				},
+				...nodeOperation,
+				{
+					operation: "addOrUpdate",
+					content: tokenizerJsonUrl,
+					path: "lamaral.json",
+				},
+			];
+
+			const firstOutput = await commit({
+				repo,
+				title: "Preparation commit",
+				credentials: {
+					accessToken: TEST_ACCESS_TOKEN,
+				},
+				hubUrl: TEST_HUB_URL,
+				operations,
+				// To test web workers in the front-end
+				useWebWorkers: { minSize: 5_000 },
+			});
+
+			const secondOutput = await commit({
+				repo,
+				title: "Empty commit",
+				credentials: {
+					accessToken: TEST_ACCESS_TOKEN,
+				},
+				hubUrl: TEST_HUB_URL,
+				operations,
+				// To test web workers in the front-end
+				useWebWorkers: { minSize: 5_000 },
+			});
+
+			assert.deepStrictEqual(firstOutput.commit, secondOutput.commit);
+			assert.strictEqual(secondOutput.hookOutput, "Nothing to commit");
+
+			const currentRes: Response = await fetch(`${TEST_HUB_URL}/api/${repo.type}s/${repo.name}`);
+			const current = await currentRes.json();
+			assert.strictEqual(secondOutput.commit.oid, current.sha);
+		} finally {
+			await deleteRepo({
+				repo,
+				hubUrl: TEST_HUB_URL,
+				credentials: { accessToken: TEST_ACCESS_TOKEN },
+			});
+		}
+	}, 60_000);
 
 	it("should commit a full repo from HF with web urls", async function () {
 		const repoName = `${TEST_USER}/TEST-${insecureRandomString()}`;
 		const repo: RepoId = {
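To decide whether a regular (non-LFS) file changed, the new code in `commit.ts` below hashes local content the way git hashes blob objects: SHA-1 over a `blob <byte length>\0` header followed by the raw bytes, compared against the `oid` returned by the preupload endpoint. A minimal standalone sketch of that computation (assuming a WebCrypto-capable runtime such as Node 18+ or a browser; this mirrors the approach in the diff, not the library's actual helper):

```ts
// Compute git's blob oid for a Blob: sha1("blob <size in bytes>\0" + content).
// Reference: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects
async function gitBlobOid(content: Blob): Promise<string> {
	const header = new TextEncoder().encode(`blob ${content.size}\0`);
	const body = new Uint8Array(await content.arrayBuffer());
	const data = new Uint8Array(header.length + body.length);
	data.set(header, 0);
	data.set(body, header.length);
	const digest = await crypto.subtle.digest("SHA-1", data);
	return [...new Uint8Array(digest)].map((b) => b.toString(16).padStart(2, "0")).join("");
}
```

If a file's bytes are unchanged, this oid matches the one the server reports and the operation can be dropped from the commit, which is exactly what the second `commit()` call in the test above exercises.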
packages/hub/src/lib/commit.ts (141 changes: 90 additions & 51 deletions)

@@ -15,6 +15,7 @@ import { checkCredentials } from "../utils/checkCredentials";
 import { chunk } from "../utils/chunk";
 import { promisesQueue } from "../utils/promisesQueue";
 import { promisesQueueStreaming } from "../utils/promisesQueueStreaming";
+import { sha1 } from "../utils/sha1";
 import { sha256 } from "../utils/sha256";
 import { toRepoId } from "../utils/toRepoId";
 import { WebBlob } from "../utils/WebBlob";
@@ -125,8 +126,6 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 	const repoId = toRepoId(params.repo);
 	yield { event: "phase", phase: "preuploading" };
 
-	const lfsShas = new Map<string, string | null>();
-
 	const abortController = new AbortController();
 	const abortSignal = abortController.signal;
 
@@ -168,6 +167,10 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 
 	const gitAttributes = allOperations.filter(isFileOperation).find((op) => op.path === ".gitattributes")?.content;
 
+	let currentCommitOid: string | undefined;
+	const deleteOps = allOperations.filter((op): op is CommitDeletedEntry => !isFileOperation(op));
+	const fileOpsToCommit: (CommitBlob &
+		({ uploadMode: "lfs"; sha: string } | { uploadMode: "regular"; newSha: string | null }))[] = [];
 	for (const operations of chunk(allOperations.filter(isFileOperation), 100)) {
 		const payload: ApiPreuploadRequest = {
 			gitAttributes: gitAttributes && (await gitAttributes.text()),
@@ -202,44 +205,53 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 		}
 
 		const json: ApiPreuploadResponse = await res.json();
+		currentCommitOid = json.commitOid;
 
-		for (const file of json.files) {
-			if (file.uploadMode === "lfs") {
-				lfsShas.set(file.path, null);
-			}
-		}
+		yield* eventToGenerator<{ event: "fileProgress"; state: "hashing"; path: string; progress: number }, void[]>(
+			(yieldCallback, returnCallback, rejectCallack) => {
+				return promisesQueue(
+					json.files.map((file) => async () => {
+						const op = operations.find((op) => op.path === file.path);
+						if (!op) {
+							return; // this should never happen, server-side we always return one entry per operation
+						}
+
+						const iterator =
+							file.uploadMode === "lfs"
+								? sha256(op.content, { useWebWorker: params.useWebWorkers, abortSignal })
+								: sha1(new Blob([`blob ${op.content.size}\0`, await op.content.arrayBuffer()]), { abortSignal }); // https://git-scm.com/book/en/v2/Git-Internals-Git-Objects
+
+						let res: IteratorResult<number, string | null>;
+						do {
+							res = await iterator.next();
+							if (!res.done) {
+								yieldCallback({ event: "fileProgress", path: op.path, progress: res.value, state: "hashing" });
+							}
+						} while (!res.done);
+
+						if (res.value === null || res.value !== file.oid) {
+							fileOpsToCommit.push({
+								...op,
+								uploadMode: file.uploadMode,
+								sha: res.value,
+							} as (typeof fileOpsToCommit)[number]);
+							// ^^ we know `newSha` can only be `null` in the case of `sha1` as expected, but TS doesn't
+						}
+					}),
+					CONCURRENT_SHAS
+				).then(returnCallback, rejectCallack);
+			}
+		);
 
 		abortSignal?.throwIfAborted();
 	}
 
 	yield { event: "phase", phase: "uploadingLargeFiles" };
 
-	for (const operations of chunk(
-		allOperations.filter(isFileOperation).filter((op) => lfsShas.has(op.path)),
+	for (const lfsOpsToCommit of chunk(
+		fileOpsToCommit.filter((op): op is CommitBlob & { uploadMode: "lfs"; sha: string } => op.uploadMode === "lfs"),
 		100
 	)) {
-		const shas = yield* eventToGenerator<
-			{ event: "fileProgress"; state: "hashing"; path: string; progress: number },
-			string[]
-		>((yieldCallback, returnCallback, rejectCallack) => {
-			return promisesQueue(
-				operations.map((op) => async () => {
-					const iterator = sha256(op.content, { useWebWorker: params.useWebWorkers, abortSignal: abortSignal });
-					let res: IteratorResult<number, string>;
-					do {
-						res = await iterator.next();
-						if (!res.done) {
-							yieldCallback({ event: "fileProgress", path: op.path, progress: res.value, state: "hashing" });
-						}
-					} while (!res.done);
-					const sha = res.value;
-					lfsShas.set(op.path, res.value);
-					return sha;
-				}),
-				CONCURRENT_SHAS
-			).then(returnCallback, rejectCallack);
-		});
-
-		abortSignal?.throwIfAborted();
-
 		const payload: ApiLfsBatchRequest = {
 			operation: "upload",
 			// multipart is a custom protocol for HF
@@ -250,8 +262,8 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 					name: params.branch ?? "main",
 				},
 			}),
-			objects: operations.map((op, i) => ({
-				oid: shas[i],
+			objects: lfsOpsToCommit.map((op) => ({
+				oid: op.sha,
 				size: op.content.size,
 			})),
 		};
@@ -279,7 +291,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 		const json: ApiLfsBatchResponse = await res.json();
 		const batchRequestId = res.headers.get("X-Request-Id") || undefined;
 
-		const shaToOperation = new Map(operations.map((op, i) => [shas[i], op]));
+		const shaToOperation = new Map(lfsOpsToCommit.map((op) => [op.sha, op]));
 
 		yield* eventToGenerator<CommitProgressEvent, void>((yieldCallback, returnCallback, rejectCallback) => {
 			return promisesQueueStreaming(
@@ -293,9 +305,9 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 					abortSignal?.throwIfAborted();
 
 					if (obj.error) {
-						const errorMessage = `Error while doing LFS batch call for ${operations[shas.indexOf(obj.oid)].path}: ${
-							obj.error.message
-						}${batchRequestId ? ` - Request ID: ${batchRequestId}` : ""}`;
+						const errorMessage = `Error while doing LFS batch call for ${op.path}: ${obj.error.message}${
+							batchRequestId ? ` - Request ID: ${batchRequestId}` : ""
+						}`;
 						throw new HubApiError(res.url, obj.error.code, batchRequestId, errorMessage);
 					}
 					if (!obj.actions?.upload) {
@@ -367,9 +379,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 							if (!res.ok) {
 								throw await createApiError(res, {
 									requestId: batchRequestId,
-									message: `Error while uploading part ${part} of ${
-										operations[shas.indexOf(obj.oid)].path
-									} to LFS storage`,
+									message: `Error while uploading part ${part} of ${op.path} to LFS storage`,
 								});
 							}
 
@@ -399,9 +409,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 						if (!res.ok) {
 							throw await createApiError(res, {
 								requestId: batchRequestId,
-								message: `Error completing multipart upload of ${
-									operations[shas.indexOf(obj.oid)].path
-								} to LFS storage`,
+								message: `Error completing multipart upload of ${op.path} to LFS storage`,
 							});
 						}
 
@@ -438,7 +446,7 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 						if (!res.ok) {
 							throw await createApiError(res, {
 								requestId: batchRequestId,
-								message: `Error while uploading ${operations[shas.indexOf(obj.oid)].path} to LFS storage`,
+								message: `Error while uploading ${op.path} to LFS storage`,
 							});
 						}
 
@@ -459,6 +467,38 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 
 	yield { event: "phase", phase: "committing" };
 
+	const allOpsToCommit = [...deleteOps, ...fileOpsToCommit];
+	if (allOpsToCommit.length === 0) {
+		if (!currentCommitOid) {
--- Review thread on this block ---

@Pierrci (Member, Author) commented on Jul 25, 2024:

  IIUC, we currently don't return early or throw an error when `operations` is empty, meaning it's possible to get here without having called the preupload endpoint, so `currentCommitOid` can still be undefined at this point.

  So I kept this for now; the alternative would be to return an object with a different shape, or to throw an error when there are no operations, but that would be a breaking change - I'll let you decide @coyotte508

@coyotte508 (Member) commented on Jul 26, 2024:

  It's fine to throw on empty commits IMO; we can do a major version change (and maybe do some of https://github.com/huggingface/huggingface.js/milestone/2 before publishing).

@coyotte508 (Member) commented:

  Actually, if we upgrade to 1.0 we can add the `null` return type and return `null` on an empty commit.

  (And it's better than throwing an error - at least it will be caught by TS.)

--- End review thread ---
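For illustration, a sketch of the `null`-return design floated in this thread (a hypothetical 1.0 API, not what this PR implements; the `CommitParams`/`CommitOutput` exports and the `filterUnchangedOperations` helper are assumed for the example):

```ts
import { commit, type CommitOutput, type CommitParams } from "./commit";

// Hypothetical helper standing in for the preupload + hashing pass this PR adds.
declare function filterUnchangedOperations(params: CommitParams): Promise<CommitParams["operations"]>;

// Hypothetical 1.0-style API: return null on an empty commit so callers are
// forced by the type system to handle the "nothing to commit" case.
export async function commitIfChanged(params: CommitParams): Promise<CommitOutput | null> {
	const operations = await filterUnchangedOperations(params);
	if (operations.length === 0) {
		return null;
	}
	return commit({ ...params, operations });
}
```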

+			const res: Response = await (params.fetch ?? fetch)(
+				`${params?.hubUrl || HUB_URL}/api/${repoId.type}s/${repoId.name}`,
+				{
+					headers: {
+						accept: "application/json",
+						...(params.credentials && { Authorization: `Bearer ${params.credentials.accessToken}` }),
+					},
+				}
+			);
+
+			if (!res.ok) {
+				throw await createApiError(res);
+			}
+
+			const { sha } = await res.json();
+			currentCommitOid = sha as string;
+		}
+
+		return {
+			commit: {
+				oid: currentCommitOid,
+				url: `${params?.hubUrl || HUB_URL}${repoId.type !== "model" ? `/${repoId.type}s` : ""}/${
+					repoId.name
+				}/commit/${currentCommitOid}`,
+			},
+			hookOutput: "Nothing to commit",
+		};
+	}
 
 	return yield* eventToGenerator<CommitProgressEvent, CommitOutput>(
 		async (yieldCallback, returnCallback, rejectCallback) =>
 			(params.fetch ?? fetch)(
@@ -481,17 +521,16 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 							} satisfies ApiCommitHeader,
 						},
 						...((await Promise.all(
-							allOperations.map((operation) => {
+							allOpsToCommit.map((operation) => {
 								if (isFileOperation(operation)) {
-									const sha = lfsShas.get(operation.path);
-									if (sha) {
+									if (operation.uploadMode === "lfs") {
 										return {
 											key: "lfsFile",
 											value: {
 												path: operation.path,
 												algo: "sha256",
 												size: operation.content.size,
-												oid: sha,
+												oid: operation.sha,
 											} satisfies ApiCommitLfsFile,
 										};
 									}
@@ -509,8 +548,8 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitProgressEvent, CommitOutput> {
 					progressCallback: (progress: number) => {
 						// For now, we display equal progress for all files
 						// We could compute the progress based on the size of `convertOperationToNdJson` for each of the files instead
-						for (const op of allOperations) {
-							if (isFileOperation(op) && !lfsShas.has(op.path)) {
+						for (const op of allOpsToCommit) {
+							if (isFileOperation(op) && op.uploadMode === "regular") {
 								yieldCallback({
 									event: "fileProgress",
 									path: op.path,
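The `../utils/sha1` helper imported above is not part of this diff. Judging from its call sites (`sha1(blob, { abortSignal })`, iterated for progress and typed as returning `string | null`), a minimal sketch could look like the following; this is an assumption for illustration, not the actual implementation (which may hash incrementally and return `null` when no SHA-1 backend is available):

```ts
import { webcrypto } from "node:crypto";

// Sketch of a sha1 util matching the call sites in commit.ts: an async
// iterator that yields progress in [0, 1] and returns a lowercase hex digest.
export async function* sha1(
	blob: Blob,
	opts?: { abortSignal?: AbortSignal }
): AsyncGenerator<number, string | null> {
	opts?.abortSignal?.throwIfAborted();
	const buffer = await blob.arrayBuffer(); // the real util may read in chunks instead
	yield 0.5;
	const digest = await webcrypto.subtle.digest("SHA-1", buffer);
	opts?.abortSignal?.throwIfAborted();
	yield 1;
	return [...new Uint8Array(digest)].map((b) => b.toString(16).padStart(2, "0")).join("");
}
```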