Skip to content

Commit

Permalink
database/delete-projects: expand scope
Browse files Browse the repository at this point in the history
  • Loading branch information
haraldschilly committed Jul 10, 2024
1 parent 4126e2c commit 37d39b0
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 30 deletions.
4 changes: 2 additions & 2 deletions src/packages/database/postgres-server-queries.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -2591,8 +2591,8 @@ exports.extend_PostgreSQL = (ext) -> class PostgreSQL extends ext
return await unlink_old_deleted_projects(@)

# async function
cleanup_old_projects_data: () =>
return await cleanup_old_projects_data(@)
cleanup_old_projects_data: (max_run_m) =>
return await cleanup_old_projects_data(@, max_run_m)

# async function
unlist_all_public_paths: (account_id, is_owner) =>
Expand Down
3 changes: 3 additions & 0 deletions src/packages/database/postgres/bulk-delete.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
* License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
*/

// see packages/database/pool/pool.ts for where this name is also hard coded:
process.env.PGDATABASE = "smc_ephemeral_testing_database";

import getPool, { initEphemeralDatabase } from "@cocalc/database/pool";
import { uuid } from "@cocalc/util/misc";
import { bulk_delete } from "./bulk-delete";
Expand Down
11 changes: 7 additions & 4 deletions src/packages/database/postgres/bulk-delete.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
// see packages/database/pool/pool.ts for where this name is also hard coded:
process.env.PGDATABASE = "smc_ephemeral_testing_database";

import { escapeIdentifier } from "pg";

import getPool from "@cocalc/database/pool";
import { SCHEMA } from "@cocalc/util/schema";

/**
 * Column names accepted for `Opts.field`.
 * Deliberately a short, closed list; add a member here to support another column.
 */
type Field =
  | "account_id"
  | "project_id"
  | "source_project_id"
  | "target_project_id";

interface Opts {
table: string; // e.g. project_log, etc.
field: "project_id" | "account_id"; // for now, we only support a few
field: Field; // for now, we only support a few
id?: string; // default "id", the ID field in the table, which identifies each row uniquely
value: string; // a UUID
limit?: number; // default 1024
Expand Down
114 changes: 91 additions & 23 deletions src/packages/database/postgres/delete-projects.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ Code related to permanently deleting projects.

import getLogger from "@cocalc/backend/logger";
import getPool from "@cocalc/database/pool";
import { callback2 } from "@cocalc/util/async-utils";
import { PostgreSQL } from "./types";
import { minutes_ago } from "@cocalc/util/misc";
import { getServerSettings } from "@cocalc/database/settings";
import { callback2 } from "@cocalc/util/async-utils";
import { KUCALC_ON_PREMISES } from "@cocalc/util/db-schema/site-defaults";
import { minutes_ago } from "@cocalc/util/misc";
import { bulk_delete } from "./bulk-delete";
import { PostgreSQL } from "./types";

const log = getLogger("db:delete-projects");

Expand Down Expand Up @@ -59,8 +60,9 @@ async function get_account_id(
}

/*
This deletes all projects older than the given number of days, from the perspective of a user.
Another task has to run to actually get rid of the data, etc.
This removes all users from all projects that are marked as deleted and are older than the given number of days.
In particular, users are no longer able to access that project.
The "cleanup_old_projects_data" function has to run to actually get rid of the data, etc.
*/
export async function unlink_old_deleted_projects(
db: PostgreSQL,
Expand All @@ -70,7 +72,7 @@ export async function unlink_old_deleted_projects(
query: "UPDATE projects",
set: { users: null },
where: [
"deleted = true",
"deleted = true",
"users IS NOT NULL",
`last_edited <= NOW() - '${age_d} days'::INTERVAL`,
],
Expand All @@ -83,27 +85,32 @@ FROM projects as p
INNER JOIN syncstrings as s
ON p.project_id = s.project_id
WHERE p.deleted = true
AND users IS NULL
AND p.state ->> 'state' != 'deleted'
ORDER BY
p.project_id, s.string_id
`;

/*
This is more thorough than the above. It issues actual delete operations on data of projects marked as deleted.
This more thorough delete procedure comes after the above.
It issues actual delete operations on data of projects marked as deleted.
When done, it sets the state.state to "deleted".
The operation involves deleting all syncstrings of that project (and, associated with that, patches),
and only for on-prem setups, it also deletes all the data stored in the project on disk.
and only for on-prem setups, it also deletes all the data stored in the project on disk and various tables.
This function is called every couple of hours. Hence ensure it does not run longer than the given max_run_m time (minutes)
This function is called every couple of hours. Hence it stops early once it has been running longer than the given max_run_m time (minutes).
*/
export async function cleanup_old_projects_data(
db: PostgreSQL,
delay_ms = 50,
max_run_m = 60,
) {
const settings = await getServerSettings();
const on_prem = settings.kucalc === KUCALC_ON_PREMISES;
const L0 = log.extend("cleanup_old_projects_data");
const L = L0.debug;

log.debug("cleanup_old_projects_data", { delay_ms, max_run_m, on_prem });
log.debug("cleanup_old_projects_data", { max_run_m, on_prem });
const start_ts = new Date();

const pool = getPool();
Expand All @@ -115,34 +122,95 @@ export async function cleanup_old_projects_data(
for (const row of rows) {
const { project_id, string_id } = row;
if (start_ts < minutes_ago(max_run_m)) {
log.debug(
`cleanup_old_projects_data: too much time elapsed, breaking after ${num} syncstrings`,
);
L(`too much time elapsed, breaking after ${num} syncstrings`);
break;
}

log.debug(
`cleanup_old_projects_data: deleting syncstring ${project_id}/${string_id}`,
);
L(`deleting syncstring ${project_id}/${string_id}`);
num += 1;
await callback2(db.delete_syncstring, { string_id });

// wait for the given amount of delay_ms milliseconds
await new Promise((done) => setTimeout(done, delay_ms));
// wait a bit after deleting syncstrings, e.g. to let the standby db catch up
await new Promise((done) => setTimeout(done, 100));

// Q_CLEANUP_SYNCSTRINGS orders by project_id, hence we trigger project specific actions when the id changes
if (pid != project_id) {
pid = project_id;
const L2 = L0.extend(project_id).debug;

if (on_prem) {
log.debug(
`cleanup_old_projects_data: deleting project data in ${project_id}`,
);
L2(`cleanup_old_projects_data for project_id=${project_id}`);
// TODO: this only works on-prem, and requires the project files to be mounted

log.debug(`deleting all shared files in project ${project_id}`);
L2(`deleting all shared files in project ${project_id}`);
// TODO: do it directly like above, and also get rid of all those shares in the database

const delPublicPaths = await bulk_delete({
table: "public_paths",
field: "project_id",
value: project_id,
});
L2(`deleted public_paths ${delPublicPaths.rowsDeleted} entries`);

const delProjectLog = await bulk_delete({
table: "project_log",
field: "project_id",
value: project_id,
});
L2(`deleted project_log ${delProjectLog.rowsDeleted} entries`);

const delFileUse = await bulk_delete({
table: "file_use",
field: "project_id",
value: project_id,
});
L2(`deleted file_use ${delFileUse.rowsDeleted} entries`);

const delAccessLog = await bulk_delete({
table: "file_access_log",
field: "project_id",
value: project_id,
});
L2(`deleted file_access_log ${delAccessLog.rowsDeleted} entries`);

const delJupyterApiLog = await bulk_delete({
table: "jupyter_api_log",
field: "project_id",
value: project_id,
});
L2(`deleted jupyter_api_log ${delJupyterApiLog.rowsDeleted} entries`);

for (const field of [
"target_project_id",
"source_project_id",
] as const) {
const delCopyPaths = await bulk_delete({
table: "copy_paths",
field,
value: project_id,
});
L2(`deleted copy_paths/${field} ${delCopyPaths.rowsDeleted} entries`);
}

const delListings = await bulk_delete({
table: "listings",
field: "project_id",
id: "project_id", // TODO listings has a more complex ID, is this a problem?
value: project_id,
});
L2(`deleted ${delListings.rowsDeleted} listings`);

const delInviteTokens = await bulk_delete({
table: "project_invite_tokens",
field: "project_id",
value: project_id,
id: "token",
});
L2(`deleted ${delInviteTokens.rowsDeleted} entries`);
}

// now, that we're done with that project, mark it as state.state ->> 'deleted'
// in addition to the flag "deleted = true"
await callback2(db.set_project_state, {
project_id,
state: "deleted",
Expand Down
3 changes: 2 additions & 1 deletion src/packages/hub/run/delete-projects.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ async function update() {
console.log("unlinking old deleted projects...");
try {
await db.unlink_old_deleted_projects();
await db.cleanup_old_projects_data();
const max_run_m = (INTERVAL_MS / 2) / (1000 * 60)
await db.cleanup_old_projects_data(max_run_m);
} catch (err) {
if (err !== null) {
throw Error(`failed to unlink projects -- ${err}`);
Expand Down

0 comments on commit 37d39b0

Please sign in to comment.