diff --git a/apps/app/src/features/page-bulk-export/interfaces/page-bulk-export.ts b/apps/app/src/features/page-bulk-export/interfaces/page-bulk-export.ts index dcf2fae33be..dd78bc855fc 100644 --- a/apps/app/src/features/page-bulk-export/interfaces/page-bulk-export.ts +++ b/apps/app/src/features/page-bulk-export/interfaces/page-bulk-export.ts @@ -34,6 +34,7 @@ export interface IPageBulkExportJob { completedAt?: Date, // the date at which job was completed attachment?: Ref, status: PageBulkExportJobStatus, + revisionListHash?: string, // Hash created from the list of revision IDs. Used to detect existing duplicate uploads. } export interface IPageBulkExportJobHasId extends IPageBulkExportJob, HasObjectId {} diff --git a/apps/app/src/features/page-bulk-export/server/models/page-bulk-export-job.ts b/apps/app/src/features/page-bulk-export/server/models/page-bulk-export-job.ts index 83d97d04fc6..cfb8abd35f0 100644 --- a/apps/app/src/features/page-bulk-export/server/models/page-bulk-export-job.ts +++ b/apps/app/src/features/page-bulk-export/server/models/page-bulk-export-job.ts @@ -21,6 +21,7 @@ const pageBulkExportJobSchema = new Schema({ status: { type: String, enum: Object.values(PageBulkExportJobStatus), required: true, default: PageBulkExportJobStatus.initializing, }, + revisionListHash: { type: String }, }, { timestamps: true }); export default getOrCreateModel('PageBulkExportJob', pageBulkExportJobSchema); diff --git a/apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-cron.ts b/apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-cron.ts index 93bd2893164..bdf8587edfb 100644 --- a/apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-cron.ts +++ b/apps/app/src/features/page-bulk-export/server/service/page-bulk-export-job-cron.ts @@ -55,14 +55,24 @@ class PageBulkExportJobCronService extends CronService { */ async deleteDownloadExpiredExportJobs() { const downloadExpirationSeconds = 
configManager.getConfig('crowi', 'app:bulkExportDownloadExpirationSeconds'); + const thresholdDate = new Date(Date.now() - downloadExpirationSeconds * 1000); const downloadExpiredExportJobs = await PageBulkExportJob.find({ status: PageBulkExportJobStatus.completed, - completedAt: { $lt: new Date(Date.now() - downloadExpirationSeconds * 1000) }, + completedAt: { $lt: thresholdDate }, }); const cleanup = async(job: PageBulkExportJobDocument) => { await pageBulkExportService?.cleanUpExportJobResources(job); - await this.crowi.attachmentService?.removeAttachment(job.attachment); + + const hasSameAttachmentAndDownloadNotExpired = await PageBulkExportJob.findOne({ + attachment: job.attachment, + _id: { $ne: job._id }, + completedAt: { $gte: thresholdDate }, + }); + if (hasSameAttachmentAndDownloadNotExpired == null) { + // delete attachment if no other export job (whose download has not expired) has re-used it + await this.crowi.attachmentService?.removeAttachment(job.attachment); + } }; await this.cleanUpAndDeleteBulkExportJobs(downloadExpiredExportJobs, cleanup); diff --git a/apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts b/apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts index 3794779b107..f5c39772383 100644 --- a/apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts +++ b/apps/app/src/features/page-bulk-export/server/service/page-bulk-export/index.ts @@ -1,10 +1,11 @@ +import { createHash } from 'crypto'; import fs from 'fs'; import path from 'path'; import { Writable } from 'stream'; import { pipeline as pipelinePromise } from 'stream/promises'; import { - getIdForRef, type IPage, isPopulated, SubscriptionStatusType, + getIdForRef, getIdStringForRef, type IPage, isPopulated, SubscriptionStatusType, } from '@growi/core'; import { getParentPath, normalizePath } from '@growi/core/dist/utils/path-utils'; import type { Archiver } from 'archiver'; @@ -110,7 +111,22 @@ class 
PageBulkExportService { if (pageBulkExportJob.status === PageBulkExportJobStatus.initializing) { await this.createPageSnapshots(user, pageBulkExportJob); - pageBulkExportJob.status = PageBulkExportJobStatus.exporting; + + const duplicateExportJob = await PageBulkExportJob.findOne({ + user: pageBulkExportJob.user, + page: pageBulkExportJob.page, + format: pageBulkExportJob.format, + status: PageBulkExportJobStatus.completed, + revisionListHash: pageBulkExportJob.revisionListHash, + }); + if (duplicateExportJob != null) { + // if an upload with the exact same contents exists, re-use the same attachment of that upload + pageBulkExportJob.attachment = duplicateExportJob.attachment; + pageBulkExportJob.status = PageBulkExportJobStatus.completed; + } + else { + pageBulkExportJob.status = PageBulkExportJobStatus.exporting; + } await pageBulkExportJob.save(); } if (pageBulkExportJob.status === PageBulkExportJobStatus.exporting) { @@ -162,7 +178,8 @@ class PageBulkExportService { } /** - * Create a snapshot for each page that is to be exported in the pageBulkExportJob + * Create a snapshot for each page that is to be exported in the pageBulkExportJob. + * Also calculate revisionListHash and save it to the pageBulkExportJob. 
*/ private async createPageSnapshots(user, pageBulkExportJob: PageBulkExportJobDocument): Promise { // if the process of creating snapshots was interrupted, delete the snapshots and create from the start @@ -173,6 +190,8 @@ class PageBulkExportService { throw new Error('Base page not found'); } + const revisionListHash = createHash('sha256'); + // create a Readable for pages to be exported const { PageQueryBuilder } = this.pageModel; const builder = await new PageQueryBuilder(this.pageModel.find()) @@ -188,6 +207,9 @@ class PageBulkExportService { objectMode: true, write: async(page: PageDocument, encoding, callback) => { try { + if (page.revision != null) { + revisionListHash.update(getIdStringForRef(page.revision)); + } await PageBulkExportPageSnapshot.create({ pageBulkExportJob, path: page.path, @@ -205,6 +227,9 @@ class PageBulkExportService { this.pageBulkExportJobStreamManager.addJobStream(pageBulkExportJob._id, pagesReadable); await pipelinePromise(pagesReadable, pageSnapshotsWritable); + + pageBulkExportJob.revisionListHash = revisionListHash.digest('hex'); + await pageBulkExportJob.save(); } /** @@ -267,7 +292,8 @@ class PageBulkExportService { const pageArchiver = this.setUpPageArchiver(); const bufferToPartSizeTransform = getBufferToFixedSizeTransform(this.maxPartSize); - const originalName = `${pageBulkExportJob._id}.${this.compressExtension}`; + if (pageBulkExportJob.revisionListHash == null) throw new Error('revisionListHash is not set'); + const originalName = `${pageBulkExportJob.revisionListHash}.${this.compressExtension}`; const attachment = Attachment.createWithoutSave(null, user, originalName, this.compressExtension, 0, AttachmentType.PAGE_BULK_EXPORT); const uploadKey = `${FilePathOnStoragePrefix.pageBulkExport}/${attachment.fileName}`;