diff --git a/scripts/ai/clean-docs-without-metadata.ts b/scripts/ai/clean-docs-without-metadata.ts
new file mode 100644
index 000000000..76d9e40db
--- /dev/null
+++ b/scripts/ai/clean-docs-without-metadata.ts
@@ -0,0 +1,268 @@
+#!/usr/bin/node
+
+// This script finds documents without metadata in the realtime database.
+// If entries of the deleteTypes array are uncommented, it will delete documents of those types.
+
+// to run this script type the following in the terminal
+// cf. https://stackoverflow.com/a/66626333/16328462
+// $ cd scripts/ai
+// $ npx tsx clean-docs-without-metadata.ts
+
+import admin from "firebase-admin";
+
+import { getFirebaseBasePath, getScriptRootFilePath, prettyDuration,
+  remapFirebaseClassPublications, remapFirebaseProblemDocPublications } from "../lib/script-utils.js";
+
+// Provides getClassKeys, which is used below to look up the class keys to process.
+import { getClassKeys } from "../lib/firebase-classes.js";
+
+// The portal to get documents from. For example, "learn.concord.org".
+const portal = "learn.concord.org";
+// The demo name to use. Make falsy to not use a demo.
+// const demo = "TAGCLUE";
+const demo = false;
+
+// Make falsy to include all documents
+const documentLimit = false;
+// const documentLimit = 10000;
+
+// List of types to delete if they appear safe to do so.
+const deleteTypes: string[] = [
+  // "problem",
+  // "planning",
+  // "learningLog",
+  // "publication"
+];
+
+// If a problem or planning doc doesn't have metadata and there is an offering for
+// the user that also doesn't have type specific metadata for any problem or planning
+// docs, perhaps the doc is supposed to go in this "empty slot".
+// Or if there is a learningLog that doesn't have metadata and there is no learningLog
+// metadata for this user.
+// For the problem and planning docs this hasn't been practical to figure out.
+// There are usually multiple empty slots.
+// For learningLogs there haven't been any cases like this.
+// Additionally these docs would not be accessible to the user
+// after they were created. So there isn't much point in keeping them around.
+const deleteDefaultDocsEvenIfThereIsEmptySlot = true;
+// const deleteDefaultDocsEvenIfThereIsEmptySlot = false;
+
+console.log(`*** Starting to Download Documents ***`);
+
+const startTime = Date.now();
+let documentsProcessed = 0;
+let undefinedDocuments = 0;
+let failedDocuments = 0;
+let emptyDocuments = 0;
+
+const databaseURL = "https://collaborative-learning-ec215.firebaseio.com";
+
+const firebaseBasePath = getFirebaseBasePath(portal, demo);
+
+const {classKeys, accessTime, fetchTime} = await getClassKeys(firebaseBasePath);
+
+// Fetch the service account key JSON file contents; must be in same folder as script
+const serviceAccountFile = getScriptRootFilePath("serviceAccountKey.json");
+const credential = admin.credential.cert(serviceAccountFile);
+// Initialize the app with a service account, granting admin privileges
+admin.initializeApp({
+  credential,
+  databaseURL
+});
+
+const credentialTime = Date.now();
+
+// CHECKME: what about cross class supports?
+// They might be saved as a supportPublication with an offering but then there
+// is a metadata file that has extra information about it.
+
+// Walk every class, then each user of the class, then each document of the user.
+for (const key of Object.keys(classKeys)) {
+  // Reads one child node of this class from the realtime database.
+  const getClassValue = async (prop: string) => {
+    const snapshot = await admin.database().ref(`${firebaseBasePath}/${key}/${prop}`).once("value");
+    return snapshot.val();
+  };
+
+  if (documentLimit && documentsProcessed >= documentLimit) break;
+  const users = await getClassValue("users");
+  const offerings = await getClassValue("offerings");
+  const fbPersonalPublications = await getClassValue("personalPublications");
+  const personalPublications = remapFirebaseClassPublications(fbPersonalPublications);
+  const fbLearningLogPublications = await getClassValue("publications");
+  const learningLogPublications = remapFirebaseClassPublications(fbLearningLogPublications);
+
+  // Problem doc publications are stored under each offering, so collect them per offering id.
+  // A missing node is read back as null, hence the fallback to an empty map.
+  const problemDocPublications: Record<string, any> = {};
+  for (const [offeringId, offering] of Object.entries(offerings || {})) {
+    const fbProblemDocPublications = (offering as any).publications;
+    if (!fbProblemDocPublications) continue;
+    problemDocPublications[offeringId] = remapFirebaseProblemDocPublications(fbProblemDocPublications);
+  }
+
+  for (const [userId, user] of Object.entries(users || {})) {
+    if (documentLimit && documentsProcessed >= documentLimit) break;
+    if (!user.documents) continue;
+    for (const [docId, doc] of Object.entries(user.documents)) {
+      if (documentLimit && documentsProcessed >= documentLimit) break;
+      documentsProcessed++;
+
+      const docPath = `${firebaseBasePath}/${key}/users/${userId}/documents/${docId}`;
+      // Tally content stats for every document, including those that do have metadata.
+      const content = doc.content as string | undefined;
+      let parsedContent;
+      let tiles;
+      if (!content) {
+        undefinedDocuments++;
+      } else {
+        try {
+          parsedContent = JSON.parse(content);
+          tiles = Object.values(parsedContent.tileMap);
+          if (tiles.length === 0) {
+            emptyDocuments++;
+          }
+        } catch (e) {
+          failedDocuments++;
+        }
+      }
+
+      // The user may have no documentMetadata map at all, so guard the lookup.
+      const documentMetadata = user.documentMetadata?.[docId];
+
+      // We only care about docs without metadata
+      if (documentMetadata) continue;
+
+      // Removes the doc when its type is enabled in deleteTypes; otherwise only logs (dry run).
+      const deleteDoc = async () => {
+        if (deleteTypes.includes(doc.type)) {
+          try {
+            await admin.database().ref(docPath).remove();
+            console.log("deleted", docPath);
+          } catch (e) {
+            console.log("failed to delete", docPath, e);
+          }
+        } else {
+          console.log("would delete", docPath);
+        }
+      };
+
+      const personalDocMetadata = user.personalDocs?.[docId];
+      const learningLogMetadata = user.learningLogs?.[docId];
+
+      const hasContent = !!tiles && (tiles.length > 0);
+      // Collect the distinct tile types used by the document, ignoring placeholders.
+      const tools = new Set();
+      if (hasContent) {
+        for (const tile of tiles) {
+          const { type } = tile.content;
+          if (type === "Placeholder") continue;
+          tools.add(type);
+        }
+      }
+
+      console.log(documentsProcessed, "No metadata", `${key}/users/${userId}/documentMetadata/${docId}`,
+        {type: doc.type, hasContent });
+
+      if (tools.size) {
+        console.log("tools", [...tools]);
+      }
+
+      const typeSpecificMetadata = { offerings: {}} as any;
+      if (learningLogMetadata) typeSpecificMetadata.learningLogMetadata = learningLogMetadata;
+      if (personalDocMetadata) typeSpecificMetadata.personalDocMetadata = personalDocMetadata;
+
+      // Look for type specific metadata
+      // We have to search through several places since we don't have an offering id
+      for (const [offeringId, offering] of Object.entries(offerings || {})) {
+        const offeringUser = (offering as any).users?.[userId];
+        const problemMetadata = offeringUser?.documents?.[docId];
+        const planningMetadata = offeringUser?.planning?.[docId];
+        if (problemMetadata || planningMetadata) {
+          typeSpecificMetadata.offerings[offeringId] = {};
+        }
+        if (problemMetadata) {
+          typeSpecificMetadata.offerings[offeringId].problemMetadata = problemMetadata;
+        }
+        if (planningMetadata) {
+          typeSpecificMetadata.offerings[offeringId].planningMetadata = planningMetadata;
+        }
+      }
+
+      for (const [offeringId, problemDocPublicationsOffering] of Object.entries(problemDocPublications)) {
+        const problemDocPublication = problemDocPublicationsOffering?.[docId];
+        if (problemDocPublication) {
+          // Create the offering entry only if the loop above did not already create one,
+          // so problem/planning metadata recorded there is kept.
+          if (!typeSpecificMetadata.offerings[offeringId]) {
+            typeSpecificMetadata.offerings[offeringId] = {};
+          }
+          typeSpecificMetadata.offerings[offeringId].problemDocPublication = problemDocPublication;
+        }
+      }
+
+      const personalPublication = personalPublications?.[docId];
+      if (personalPublication) {
+        typeSpecificMetadata.personalPublication = personalPublication;
+      }
+      const learningLogPublication = learningLogPublications?.[docId];
+      if (learningLogPublication) {
+        typeSpecificMetadata.learningLogPublication = learningLogPublication;
+      }
+
+      if (Object.keys(typeSpecificMetadata).length > 1 || Object.keys(typeSpecificMetadata.offerings).length > 0) {
+        // So far none of the docs without generic metadata have any typeSpecific Metadata
+        console.log("typeSpecific", typeSpecificMetadata);
+
+        // If there is type specific metadata don't try to delete this document
+        continue;
+      }
+
+      // If the doc type is a problem or planning check to see if there is one configured for all
+      // of the offerings of this user. If there isn't one, perhaps this doc should be saved.
+      //
+      if (["problem", "planning"].includes(doc.type)) {
+        const offeringStats: Record<string, number> = {};
+        for (const [offeringId, offering] of Object.entries(offerings || {})) {
+          const offeringUser = (offering as any).users?.[userId];
+          if (doc.type === "problem") {
+            offeringStats[offeringId] = Object.keys(offeringUser?.documents || {}).length;
+          }
+          if (doc.type === "planning") {
+            offeringStats[offeringId] = Object.keys(offeringUser?.planning || {}).length;
+          }
+        }
+        console.log("offeringStats", offeringStats);
+        if (deleteDefaultDocsEvenIfThereIsEmptySlot || !Object.values(offeringStats).includes(0)) {
+          await deleteDoc();
+        }
+
+      }
+      if (doc.type === "learningLog") {
+        // undefined means there isn't even a learningLogs map in the database
+        const numLearningLogs = user.learningLogs && Object.keys(user.learningLogs || {}).length;
+        console.log("num learningLogs", numLearningLogs);
+        if (deleteDefaultDocsEvenIfThereIsEmptySlot || numLearningLogs) {
+          await deleteDoc();
+        }
+      }
+      if (doc.type === "publication") {
+        await deleteDoc();
+      }
+    }
+
+  }
+}
+
+const endTime = Date.now();
+console.log(`***** End script *****`);
+console.log(`- Time to access token: ${prettyDuration(accessTime - startTime)}`);
+console.log(`- Time to fetch documents: ${prettyDuration(fetchTime - startTime)}`);
+console.log(`- Time to get credential: ${prettyDuration(credentialTime - startTime)}`);
+console.log(`- Total Time: ${prettyDuration(endTime - startTime)}`);
+console.log(`Documents downloaded: ${documentsProcessed}`);
+console.log(`Undefined documents: ${undefinedDocuments}`);
+console.log(`Empty documents: ${emptyDocuments}`);
+console.log(`Failed to process: ${failedDocuments}`);
+
+process.exit(0);
diff --git a/scripts/ai/download-documents-with-info.ts b/scripts/ai/download-documents-with-info.ts
index 70819971a..de1846325 100644
--- a/scripts/ai/download-documents-with-info.ts
+++ b/scripts/ai/download-documents-with-info.ts
@@ -14,13 +14,15 @@ import admin from "firebase-admin";
 import stringify
from "json-stringify-pretty-compact"; import { datasetPath, networkFileName } from "./script-constants.js"; -import { getFirebaseBasePath, getScriptRootFilePath, prettyDuration } from "../lib/script-utils.js"; +import { getFirebaseBasePath, getScriptRootFilePath, prettyDuration, + remapFirebaseClassPublications, remapFirebaseProblemDocPublications } from "../lib/script-utils.js"; // Load the service account key JSON file. import { getClassKeys } from "../lib/firebase-classes.js"; // The portal to get documents from. For example, "learn.concord.org". const portal = "learn.concord.org"; +// const portal = "learn.portal.staging.concord.org"; // The demo name to use. Make falsy to not use a demo. // const demo = "TAGCLUE"; const demo = false; @@ -52,45 +54,6 @@ admin.initializeApp({ databaseURL }); -/** - * Firebase publications are stored with different keys than their document - * id for some reason. In some cases the real document id is in self.documentKey - * so we make a map with that documentKey as the key of the map. - * - * @param fbPublications - */ -function remapFirebaseClassPublications(fbPublications: Record) { - if (!fbPublications) return undefined; - const publications = {}; - for (const [fbId, publication] of Object.entries(fbPublications)) { - if (!publication?.self?.documentKey) { - console.log("Invalid publication found: ", fbId); - continue; - } - publications[publication.self.documentKey] = publication; - } - return publications; -} - -/** - * Firebase publications are stored with different keys than their document - * id for some reason. In some cases the real document id is in documentKey - * so we make a map with that documentKey as the key of the map. 
- * @param fbPublications - */ -function remapFirebaseProblemDocPublications(fbPublications: Record) { - if (!fbPublications) return undefined; - const publications = {}; - for (const [fbId, publication] of Object.entries(fbPublications)) { - if (!publication?.documentKey) { - console.log("Invalid publication found: ", fbId); - continue; - } - publications[publication.documentKey] = publication; - } - return publications; -} - const credentialTime = Date.now(); // CHECKME: what about cross class supports? diff --git a/scripts/ai/update-class-metadata.ts b/scripts/ai/update-class-metadata.ts index 23eab7774..6d0e989e4 100644 --- a/scripts/ai/update-class-metadata.ts +++ b/scripts/ai/update-class-metadata.ts @@ -74,7 +74,7 @@ async function processFile() { const documentSnapshots = await documentCollection.where("id", "==", id).get(); - if (documentSnapshots.empty) { + const createClassDoc = async () => { const metaData = { context_id, id, @@ -88,28 +88,38 @@ async function processFile() { await newMetaDataDoc.create(metaData); console.log("Created new class metadata", metaDataDocId); metadataCreated++; - } else { - // There can be multiple class metadata documents for each actual class. Note that the name/path for these - // Firestore documents may be "[network name]_[class hash]" and/or simply "[class hash]". - // For now we just update all of these documents. - documentSnapshots.forEach(doc => { - const requiredMatches = [ - { field: "context_id", expected: context_id, actual: doc.data().context_id }, - { field: "id", expected: id, actual: doc.data().id }, - { field: "uri", expected: uri, actual: doc.data().uri } - ]; - - for (const { field, expected, actual } of requiredMatches) { - if (expected !== actual) { - console.error(`Skipping update due to ${field} mismatch. Expected ${expected}, got ${actual}.`); - return; - } + }; + + // There can be multiple class metadata documents for each actual class. 
Note that the name/path for these + // Firestore documents may be "[network name]_[class hash]" and/or simply "[class hash]". + // For now we just update all of these documents. + + let hasClassDocWithSimpleId = false; + for (const doc of documentSnapshots.docs) { + if (doc.id === context_id) hasClassDocWithSimpleId = true; + + const requiredMatches = [ + { field: "context_id", expected: context_id, actual: doc.data().context_id }, + { field: "id", expected: id, actual: doc.data().id }, + { field: "uri", expected: uri, actual: doc.data().uri } + ]; + + let hasMismatch = false; + for (const { field, expected, actual } of requiredMatches) { + if (expected !== actual) { + console.error(`Skipping update of ${doc.id} due to ${field} mismatch. Expected ${expected}, got ${actual}.`); + hasMismatch = true; } + } + if (hasMismatch) continue; - doc.ref.update({ name, networks, teachers } as any); - console.log(context_id, doc.id, "Updated existing class metadata with", { name, networks, teachers }); - metadataUpdated++; - }); + await doc.ref.update({ name, networks, teachers } as any); + console.log(context_id, doc.id, "Updated existing class metadata with", { name, networks, teachers }); + metadataUpdated++; + } + + if (!hasClassDocWithSimpleId) { + await createClassDoc(); } } } diff --git a/scripts/ai/update-metadata.ts b/scripts/ai/update-metadata.ts index c18b3a68e..0baa6593b 100644 --- a/scripts/ai/update-metadata.ts +++ b/scripts/ai/update-metadata.ts @@ -60,6 +60,12 @@ const collectionUrl = getFirestoreBasePath(portal, demo); console.log(`*** Updating docs in ${collectionUrl} ***`); const documentCollection = admin.firestore().collection(collectionUrl); +const offeringInfoFile = `${sourcePath}/offering-info.json`; +let offeringInfo; +if (!demo) { + offeringInfo = JSON.parse(fs.readFileSync(offeringInfoFile, "utf8")); +} + let processedFiles = 0; let metadataUpdated = 0; let metadataCreated = 0; @@ -116,19 +122,26 @@ async function processFile(file: string) { }; if 
(!demo) { - const offeringInfoFile = `${sourcePath}/offering-info.json`; - const offeringInfo = JSON.parse(fs.readFileSync(offeringInfoFile, "utf8")); - const offering = offeringInfo[offeringId]; if (offering) { const { activity_url } = offering; - const { investigation, problem, unit } = getProblemDetails(activity_url); - - unitFields = { - problem, - investigation, - unit - }; + try { + const { investigation, problem, unit } = getProblemDetails(activity_url); + + if (unit && !problem) { + console.log("Found unit but not problem in activity_url", activity_url); + } + + unitFields = { + problem, + investigation, + unit + }; + } catch (e) { + console.error(e, {offeringId, offering}); + console.log("Skipping document because it has an invalid offering", {offeringId, offering}); + return; + } } } else { if (offeringId) { @@ -213,11 +226,6 @@ async function processFile(file: string) { properties: {}, strategies, - // For now we just handle demo documents where the teachers are hardcoded. - // To support Portal launches we'll either have to get the list of teachers from the offering - // info, or refactor the code so this teacher list isn't needed here. 
See: - // https://docs.google.com/document/d/1VDr-nkthu333eVD0BQXPYPVD8kt60qkMYq2jRkXza9c/edit#heading=h.pw87siu4ztwo - teachers: ["1001", "1002", "1003"], tools, title: documentTitle || null, type: documentType, @@ -225,15 +233,19 @@ async function processFile(file: string) { visibility }; + if (!documentType) { + console.log("Skipping document because it has no documentType", documentId); + return; + } + // Use a prefix of `uid:[owner_uid]` for metadata documents that we create for more // info see: // https://docs.google.com/document/d/1VDr-nkthu333eVD0BQXPYPVD8kt60qkMYq2jRkXza9c/edit#heading=h.5t2tt6igiiou const metaDataDocId = `uid:${userId}_${documentId}`; - console.log(documentId, "Created new metadata", metaDataDocId); const newMetaDataDoc = documentCollection.doc(metaDataDocId); await newMetaDataDoc.create(metaData); - console.log(documentId, "Created new metadata", metaDataDocId); + console.log(processedFiles, documentId, "Created new metadata", metaDataDocId); metadataCreated++; } else { // There can be multiple metadata documents for each actual document. @@ -241,12 +253,14 @@ async function processFile(file: string) { // stops creating multiple copies. 
See: // https://docs.google.com/document/d/1VDr-nkthu333eVD0BQXPYPVD8kt60qkMYq2jRkXza9c/edit#heading=h.5t2tt6igiiou documentSnapshots.forEach(doc => { - doc.ref.update(unitFields as any); - console.log(documentId, doc.id, "Updated metadata with", unitFields); - doc.ref.update({ strategies, tools } as any); - console.log(documentId, doc.id, "Updated metadata with", { strategies, tools }); - doc.ref.update({ visibility } as any); - console.log(documentId, doc.id, "Updated metadata with", { visibility }); + const newMetadata = { + ...unitFields, + strategies, + tools, + visibility + }; + doc.ref.update(newMetadata as any); + console.log(processedFiles, documentId, doc.id, "Updated metadata with", newMetadata); metadataUpdated++; }); } @@ -269,6 +283,7 @@ const fileBatchSize = 8; await new Promise((resolve) => { // Process every file in the source directory fs.readdir(sourcePath, async (_error, files) => { + console.log(`*** Processing ${files.length} documents ***`); for (const file of files) { checkedFiles++; fileBatch.push(file); diff --git a/scripts/lib/script-utils.ts b/scripts/lib/script-utils.ts index b5f08470f..b155dc356 100644 --- a/scripts/lib/script-utils.ts +++ b/scripts/lib/script-utils.ts @@ -80,7 +80,8 @@ export function getUnitCode(unitParam: string | undefined) { } export function getProblemDetails(url: string) { - const urlParams = new URLSearchParams(url); + const activityURL = new URL(url); + const urlParams = activityURL.searchParams; const unitParam = urlParams.get("unit"); // The unit param's value may be a unit code or a full url, so we make sure to get just the unit code const unit = getUnitCode(unitParam); @@ -88,3 +89,51 @@ export function getProblemDetails(url: string) { const [investigation, problem] = investigationAndProblem ? investigationAndProblem.split(".") : [null, null]; return { investigation, problem, unit }; } + +/** + * Create a new Record based on a passed in Record. 
The keys in the new Record + * are computed by the passed in getNewKey function. + * If getNewKey returns a falsy value the entry is skipped and its original key and value are logged + * to the console. + * If two values produce the same new key, the later one replaces the earlier one. + * @param originalMap the Record to re-key; when it is falsy, undefined is returned + * @param getNewKey computes the new key from a value (it is not given the original key) + * @returns the re-keyed Record, or undefined when originalMap is falsy + */ +export function remap( + originalMap: Record, + getNewKey: (value: any) => string | undefined +) { + if (!originalMap) return undefined; + const newMap = {}; + for (const [originalKey, value] of Object.entries(originalMap)) { + const newKey = getNewKey(value); + if (!newKey) { + console.log("Invalid value found: ", originalKey, value); + continue; + } + newMap[newKey] = value; + } + return newMap; +} + +/** + * Firebase publications are stored with different keys than their document + * id for some reason. In some cases the real document id is in self.documentKey + * so we make a map with that documentKey as the key of the map. + * + * @param fbPublications publications as stored in Firebase; falsy input yields undefined + */ +export function remapFirebaseClassPublications(fbPublications: Record) { + return remap(fbPublications, (metadata) => metadata?.self?.documentKey); +} + +/** + * Firebase publications are stored with different keys than their document + * id for some reason. In some cases the real document id is in documentKey + * so we make a map with that documentKey as the key of the map. + * @param fbPublications publications as stored in Firebase; falsy input yields undefined + */ +export function remapFirebaseProblemDocPublications(fbPublications: Record) { + return remap(fbPublications, (metadata) => metadata?.documentKey); +}