diff --git a/src/packages/database/postgres-server-queries.coffee b/src/packages/database/postgres-server-queries.coffee
index ce739226c86..aae46a79646 100644
--- a/src/packages/database/postgres-server-queries.coffee
+++ b/src/packages/database/postgres-server-queries.coffee
@@ -2591,8 +2591,8 @@ exports.extend_PostgreSQL = (ext) -> class PostgreSQL extends ext
         return await unlink_old_deleted_projects(@)

     # async function
-    cleanup_old_projects_data: () =>
-        return await cleanup_old_projects_data(@)
+    cleanup_old_projects_data: (max_run_m) =>
+        return await cleanup_old_projects_data(@, max_run_m)

     # async function
     unlist_all_public_paths: (account_id, is_owner) =>
diff --git a/src/packages/database/postgres/bulk-delete.test.ts b/src/packages/database/postgres/bulk-delete.test.ts
index 423c10b0091..acbbfff0bbc 100644
--- a/src/packages/database/postgres/bulk-delete.test.ts
+++ b/src/packages/database/postgres/bulk-delete.test.ts
@@ -3,6 +3,9 @@
  * License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
  */

+// see packages/database/pool/pool.ts for where this name is also hard coded:
+process.env.PGDATABASE = "smc_ephemeral_testing_database";
+
 import getPool, { initEphemeralDatabase } from "@cocalc/database/pool";
 import { uuid } from "@cocalc/util/misc";
 import { bulk_delete } from "./bulk-delete";
diff --git a/src/packages/database/postgres/bulk-delete.ts b/src/packages/database/postgres/bulk-delete.ts
index b6b38490a40..098f73cafd1 100644
--- a/src/packages/database/postgres/bulk-delete.ts
+++ b/src/packages/database/postgres/bulk-delete.ts
@@ -1,14 +1,17 @@
-// see packages/database/pool/pool.ts for where this name is also hard coded:
-process.env.PGDATABASE = "smc_ephemeral_testing_database";
-
 import { escapeIdentifier } from "pg";

 import getPool from "@cocalc/database/pool";
 import { SCHEMA } from "@cocalc/util/schema";

+type Field =
+  | "project_id"
+  | "account_id"
+  | "target_project_id"
+  | "source_project_id";
+
 interface Opts {
   table: string; // e.g. project_log, etc.
-  field: "project_id" | "account_id"; // for now, we only support a few
+  field: Field; // for now, we only support a few
   id?: string; // default "id", the ID field in the table, which identifies each row uniquely
   value: string; // a UUID
   limit?: number; // default 1024
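The implementation of bulk_delete itself is not part of this diff; only its widened Opts interface is shown above. For context, here is a minimal sketch of the batched-delete pattern such a helper typically implements. The name bulkDeleteSketch, the plain pg Pool, and the exact query shape are assumptions for illustration, not the actual bulk-delete.ts code:

// Hypothetical sketch, not the real bulk-delete.ts: delete all rows with
// field = value in LIMIT-sized batches. Identifiers are escaped with pg's
// escapeIdentifier; the value is passed as a bind parameter.
import { Pool, escapeIdentifier } from "pg";

async function bulkDeleteSketch(
  pool: Pool,
  table: string,
  field: string,
  value: string,
  id = "id",
  limit = 1024,
): Promise<{ rowsDeleted: number }> {
  const t = escapeIdentifier(table);
  const f = escapeIdentifier(field);
  const i = escapeIdentifier(id);
  // delete a bounded chunk per statement, selected via the unique id column
  const q = `DELETE FROM ${t} WHERE ${i} IN (SELECT ${i} FROM ${t} WHERE ${f} = $1 LIMIT ${limit})`;
  let rowsDeleted = 0;
  while (true) {
    const res = await pool.query(q, [value]);
    rowsDeleted += res.rowCount ?? 0;
    if (!res.rowCount) break; // nothing left to delete
  }
  return { rowsDeleted };
}

Deleting through an id IN (SELECT ... LIMIT n) subquery keeps each statement and transaction short, at the cost of issuing several queries per table.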
diff --git a/src/packages/database/postgres/delete-projects.ts b/src/packages/database/postgres/delete-projects.ts
index 8e88c8bf66b..293af2fe72f 100644
--- a/src/packages/database/postgres/delete-projects.ts
+++ b/src/packages/database/postgres/delete-projects.ts
@@ -9,11 +9,12 @@ Code related to permanently deleting projects.

 import getLogger from "@cocalc/backend/logger";
 import getPool from "@cocalc/database/pool";
-import { callback2 } from "@cocalc/util/async-utils";
-import { PostgreSQL } from "./types";
-import { minutes_ago } from "@cocalc/util/misc";
 import { getServerSettings } from "@cocalc/database/settings";
+import { callback2 } from "@cocalc/util/async-utils";
 import { KUCALC_ON_PREMISES } from "@cocalc/util/db-schema/site-defaults";
+import { minutes_ago } from "@cocalc/util/misc";
+import { bulk_delete } from "./bulk-delete";
+import { PostgreSQL } from "./types";

 const log = getLogger("db:delete-projects");

@@ -59,8 +60,9 @@ async function get_account_id(
 }

 /*
-This deletes all projects older than the given number of days, from the perspective of a user.
-Another task has to run to actually get rid of the data, etc.
+This removes all users from all projects that are marked as deleted and whose last_edited is older than the given number of days.
+In particular, users are no longer able to access those projects.
+The "cleanup_old_projects_data" function has to run to actually get rid of the data, etc.
 */
 export async function unlink_old_deleted_projects(
   db: PostgreSQL,
@@ -70,7 +72,7 @@
     query: "UPDATE projects",
     set: { users: null },
     where: [
-      "deleted = true",
+      "deleted = true", "users IS NOT NULL",
       `last_edited <= NOW() - '${age_d} days'::INTERVAL`,
     ],
@@ -83,27 +85,32 @@
 FROM projects as p
   INNER JOIN syncstrings as s
     ON p.project_id = s.project_id
 WHERE p.deleted = true
+  AND users IS NULL
   AND p.state ->> 'state' != 'deleted'
+ORDER BY
+  p.project_id, s.string_id
 `;

 /*
-  This is more thorough than the above. It issues actual delete operations on data of projects marked as deleted.
+  This more thorough delete procedure comes after the above.
+  It issues actual delete operations on data of projects marked as deleted.
   When done, it sets the state.state to "deleted".

   The operations involves deleting all syncstrings of that project (and associated with that, patches),
-  and only for on-prem setups, it also deletes all the data stored in the project on disk.
+  and only for on-prem setups, it also deletes all the data stored in the project on disk and in various tables.

-  This function is called every couple of hours. Hence ensure it does not run longer than the given max_run_m time (minutes)
+  This function is called every couple of hours. Hence it checks that it does not run longer than the given max_run_m time (minutes).
 */
 export async function cleanup_old_projects_data(
   db: PostgreSQL,
-  delay_ms = 50,
   max_run_m = 60,
 ) {
   const settings = await getServerSettings();
   const on_prem = settings.kucalc === KUCALC_ON_PREMISES;
+  const L0 = log.extend("cleanup_old_projects_data");
+  const L = L0.debug;

-  log.debug("cleanup_old_projects_data", { delay_ms, max_run_m, on_prem });
+  log.debug("cleanup_old_projects_data", { max_run_m, on_prem });
   const start_ts = new Date();

   const pool = getPool();
@@ -115,34 +122,95 @@ export async function cleanup_old_projects_data(
   for (const row of rows) {
     const { project_id, string_id } = row;
     if (start_ts < minutes_ago(max_run_m)) {
-      log.debug(
-        `cleanup_old_projects_data: too much time elapsed, breaking after ${num} syncstrings`,
-      );
+      L(`too much time elapsed, breaking after ${num} syncstrings`);
       break;
     }

-    log.debug(
-      `cleanup_old_projects_data: deleting syncstring ${project_id}/${string_id}`,
-    );
+    L(`deleting syncstring ${project_id}/${string_id}`);
     num += 1;
     await callback2(db.delete_syncstring, { string_id });

-    // wait for the given amount of delay_ms milliseconds
-    await new Promise((done) => setTimeout(done, delay_ms));
+    // wait a bit after deleting each syncstring, e.g. to let the standby db catch up
+    await new Promise((done) => setTimeout(done, 100));

+    // Q_CLEANUP_SYNCSTRINGS orders by project_id, hence we trigger project-specific actions when the id changes
     if (pid != project_id) {
       pid = project_id;
+      const L2 = L0.extend(project_id).debug;
+
       if (on_prem) {
-        log.debug(
-          `cleanup_old_projects_data: deleting project data in ${project_id}`,
-        );
+        L2(`cleanup_old_projects_data for project_id=${project_id}`);
         // TODO: this only works on-prem, and requires the project files to be mounted
-        log.debug(`deleting all shared files in project ${project_id}`);
+        L2(`deleting all shared files in project ${project_id}`);
         // TODO: do it directly like above, and also get rid of all those shares in the database
+
+        const delPublicPaths = await bulk_delete({
+          table: "public_paths",
+          field: "project_id",
+          value: project_id,
+        });
+        L2(`deleted public_paths ${delPublicPaths.rowsDeleted} entries`);
+
+        const delProjectLog = await bulk_delete({
+          table: "project_log",
+          field: "project_id",
+          value: project_id,
+        });
+        L2(`deleted project_log ${delProjectLog.rowsDeleted} entries`);
+
+        const delFileUse = await bulk_delete({
+          table: "file_use",
+          field: "project_id",
+          value: project_id,
+        });
+        L2(`deleted file_use ${delFileUse.rowsDeleted} entries`);
+
+        const delAccessLog = await bulk_delete({
+          table: "file_access_log",
+          field: "project_id",
+          value: project_id,
+        });
+        L2(`deleted file_access_log ${delAccessLog.rowsDeleted} entries`);
+
+        const delJupyterApiLog = await bulk_delete({
+          table: "jupyter_api_log",
+          field: "project_id",
+          value: project_id,
+        });
+        L2(`deleted jupyter_api_log ${delJupyterApiLog.rowsDeleted} entries`);
+
+        for (const field of [
+          "target_project_id",
+          "source_project_id",
+        ] as const) {
+          const delCopyPaths = await bulk_delete({
+            table: "copy_paths",
+            field,
+            value: project_id,
+          });
+          L2(`deleted copy_paths/${field} ${delCopyPaths.rowsDeleted} entries`);
+        }
+
+        const delListings = await bulk_delete({
+          table: "listings",
+          field: "project_id",
+          id: "project_id", // TODO listings has a more complex ID, is this a problem?
+          value: project_id,
+        });
+        L2(`deleted ${delListings.rowsDeleted} listings`);
+
+        const delInviteTokens = await bulk_delete({
+          table: "project_invite_tokens",
+          field: "project_id",
+          value: project_id,
+          id: "token",
+        });
+        L2(`deleted project_invite_tokens ${delInviteTokens.rowsDeleted} entries`);
       }

       // now, that we're done with that project, mark it as state.state ->> 'deleted'
+      // in addition to the flag "deleted = true"
       await callback2(db.set_project_state, {
         project_id,
         state: "deleted",
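One detail worth noting in cleanup_old_projects_data above: start_ts is captured once, before the loop, and every iteration compares it against minutes_ago(max_run_m); as soon as the start time falls behind that cutoff, the loop breaks and the next scheduled run continues with the remaining syncstrings. A self-contained sketch of this time-budget pattern follows, where minutesAgo and boundedLoop are stand-ins with the assumed semantics of minutes_ago from @cocalc/util/misc and the loop above:

// stand-in for minutes_ago from @cocalc/util/misc (assumed semantics):
// the Date that was "m minutes ago" relative to now
function minutesAgo(m: number): Date {
  return new Date(Date.now() - m * 60 * 1000);
}

async function boundedLoop(
  items: string[],
  max_run_m: number,
  handle: (item: string) => Promise<void>,
) {
  const start_ts = new Date(); // captured once, before the loop
  for (const item of items) {
    // if we started more than max_run_m minutes ago, stop now;
    // the next scheduled run picks up the remaining items
    if (start_ts < minutesAgo(max_run_m)) break;
    await handle(item);
  }
}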
diff --git a/src/packages/hub/run/delete-projects.js b/src/packages/hub/run/delete-projects.js
index dbef215ed90..1b0c8c833eb 100755
--- a/src/packages/hub/run/delete-projects.js
+++ b/src/packages/hub/run/delete-projects.js
@@ -16,7 +16,8 @@ async function update() {
   console.log("unlinking old deleted projects...");
   try {
     await db.unlink_old_deleted_projects();
-    await db.cleanup_old_projects_data();
+    const max_run_m = (INTERVAL_MS / 2) / (1000 * 60);
+    await db.cleanup_old_projects_data(max_run_m);
   } catch (err) {
     if (err !== null) {
       throw Error(`failed to unlink projects -- ${err}`);
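The max_run_m passed above is half the scheduling interval, converted from milliseconds to minutes. A worked example, assuming a value for INTERVAL_MS (the real constant is defined earlier in this file and does not appear in this diff):

// assumption: the hub job runs every 2 hours; the real INTERVAL_MS is
// defined elsewhere in delete-projects.js and is not shown in this diff
const INTERVAL_MS = 2 * 60 * 60 * 1000; // 7,200,000 ms
const max_run_m = (INTERVAL_MS / 2) / (1000 * 60); // 3,600,000 / 60,000 = 60 minutes
// so cleanup_old_projects_data gets at most half the interval,
// leaving the other half as headroom before the next run starts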