Add CI workflow for checkpoint garbage collection
Implemented a GitHub Actions workflow to test the checkpoint
restore operator's ability to correctly delete checkpoints
through garbage collection.

- Added `wait_for_checkpoint_reduction.sh` script to check the
  number of checkpoint tar files and ensure it is reduced to 2
  within a 60-second timeout.
- Added `generate_checkpoint_tar.sh` script to correctly
  create and move checkpoint tar files.

Signed-off-by: Parthiba-Hazra <[email protected]>
Parthiba-Hazra committed Jul 8, 2024
1 parent 832b08e commit 6674bbe
Showing 6 changed files with 118 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/tests.yaml
@@ -46,3 +46,14 @@ jobs:

      - name: Check resources
        run: kubectl get all -n checkpoint-restore-operator-system

      - name: Generate Checkpoint Tar Files
        run: |
          sudo mkdir -p /var/lib/kubelet/checkpoints
          sudo chmod 777 /var/lib/kubelet/checkpoints
          ./test/generate_checkpoint_tar.sh
      - name: Wait for Checkpoint Tar Files to Reduce
        run: |
          kubectl apply -f ./test/test_checkpointrestoreoperator.yaml
          ./test/wait_for_checkpoint_reduction.sh
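
The two new steps can also be reproduced outside CI against a cluster that already has the operator deployed; a minimal sketch using the same paths and commands as the workflow above (run from the repository root, requires sudo):

sudo mkdir -p /var/lib/kubelet/checkpoints
sudo chmod 777 /var/lib/kubelet/checkpoints
./test/generate_checkpoint_tar.sh                              # seeds five dummy checkpoint archives
kubectl apply -f ./test/test_checkpointrestoreoperator.yaml    # applies the retention policy shown below
./test/wait_for_checkpoint_reduction.sh                        # succeeds once only two archives remain
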
12 changes: 12 additions & 0 deletions test/data/config.dump
@@ -0,0 +1,12 @@
{
  "id": "6924be1bd90c23f10e2667102b0ee0f74f09bba78b6661871e733cb3b1737821",
  "name": "k8s_container-name_deployment-name_default_6975ee47-6765-45dc-9a2b-1e38d51031f7_0",
  "rootfsImage": "docker.io/library/nginx@sha256:161ef4b1bf7effb350a2a9625cb2b59f69d54ec6059a8a155a1438d0439c593c",
  "rootfsImageRef": "a8758716bb6aa4d90071160d27028fe4eaee7ce8166221a97d30440c8eac2be6",
  "rootfsImageName": "docker.io/library/nginx:latest",
  "runtime": "runc",
  "createdTime": "2024-01-27T14:45:26.083444055Z",
  "checkpointedTime": "2024-01-28T00:10:45.673538606+05:30",
  "restoredTime": "0001-01-01T00:00:00Z",
  "restored": false
}
10 changes: 10 additions & 0 deletions test/data/spec.dump
@@ -0,0 +1,10 @@
{
  "annotations": {
    "io.container.manager": "cri-o",
    "io.kubernetes.container.hash": "1511917a",
    "io.kubernetes.container.name": "containername",
    "io.kubernetes.pod.name": "podname",
    "io.kubernetes.pod.namespace": "namespace",
    "io.kubernetes.cri-o.Metadata": "{\"name\":\"containername\"}"
  }
}
37 changes: 37 additions & 0 deletions test/generate_checkpoint_tar.sh
@@ -0,0 +1,37 @@
#!/bin/bash

DATA_DIR="./test/data"
if [ ! -d "$DATA_DIR" ]; then
    echo "Data directory '$DATA_DIR' does not exist."
    exit 1
fi

CONFIG_DUMP="$DATA_DIR/config.dump"
SPEC_DUMP="$DATA_DIR/spec.dump"
if [ ! -f "$CONFIG_DUMP" ] || [ ! -f "$SPEC_DUMP" ]; then
    echo "config.dump and/or spec.dump files are missing in the data directory."
    exit 1
fi

TEMP_DIR=$(mktemp -d)

cp "$CONFIG_DUMP" "$TEMP_DIR"
cp "$SPEC_DUMP" "$TEMP_DIR"

# Pack the dump files into a tar archive and move it into the kubelet checkpoint directory
create_checkpoint_tar() {
    local tar_name=$1
    local original_name=$2
    tar -cf "$tar_name" -C "$TEMP_DIR" .
    sudo mv "$tar_name" "/var/lib/kubelet/checkpoints/$original_name"
    echo "Checkpoint tar file created at /var/lib/kubelet/checkpoints/$original_name"
}

# Create five archives named after the same pod/namespace/container;
# sleep 1s between iterations so each gets a unique per-second timestamp
for _ in {1..5}; do
    TIMESTAMP=$(date +%Y-%m-%dT%H:%M:%S)
    TAR_NAME="checkpoint.tar"
    ORIGINAL_NAME="checkpoint-podname_namespace-containername-$TIMESTAMP.tar"
    create_checkpoint_tar "$TAR_NAME" "$ORIGINAL_NAME"
    sleep 1
done

rm -rf "$TEMP_DIR"
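
After a run, the script leaves five archives in /var/lib/kubelet/checkpoints, all for the same pod, namespace, and container but with distinct timestamps; the listing would look roughly like this (timestamps illustrative):

$ ls /var/lib/kubelet/checkpoints
checkpoint-podname_namespace-containername-2024-07-08T10:00:01.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:02.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:03.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:04.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:05.tar
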
22 changes: 22 additions & 0 deletions test/test_checkpointrestoreoperator.yaml
@@ -0,0 +1,22 @@
apiVersion: criu.org/v1
kind: CheckpointRestoreOperator
metadata:
  labels:
    app.kubernetes.io/name: checkpointrestoreoperator
    app.kubernetes.io/instance: checkpointrestoreoperator-sample
    app.kubernetes.io/part-of: checkpoint-restore-operator
    app.kubernetes.io/managed-by: kustomize
    app.kubernetes.io/created-by: checkpoint-restore-operator
  name: checkpointrestoreoperator-sample
spec:
  checkpointDirectory: /var/lib/kubelet/checkpoints
  applyPoliciesImmediately: true
  globalPolicy:
    maxCheckpointsPerNamespace: 10
    maxCheckpointsPerPod: 10
    maxCheckpointsPerContainer: 10
  containerPolicies:
    - namespace: namespace
      pod: podname
      container: containername
      maxCheckpoints: 2
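
The per-container policy is what the CI test exercises: with maxCheckpoints: 2 for namespace/podname/containername, the five archives seeded by generate_checkpoint_tar.sh should be garbage-collected down to two, which is exactly the EXPECTED_COUNT the wait script below checks for. A quick manual check after the operator has reconciled, using the same count logic as that script:

find /var/lib/kubelet/checkpoints -maxdepth 1 -name '*.tar' | wc -l    # expected to print 2
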
26 changes: 26 additions & 0 deletions test/wait_for_checkpoint_reduction.sh
@@ -0,0 +1,26 @@
#!/bin/bash

CHECKPOINTS_DIR="/var/lib/kubelet/checkpoints"
EXPECTED_COUNT=2
TIMEOUT=60
start_time=$(date +%s)

count_tar_files() {
    find "$CHECKPOINTS_DIR" -maxdepth 1 -name "*.tar" -print | wc -l
}

# Wait for the checkpoint tar files to be reduced from 5 to 2
while true; do
    current_count=$(count_tar_files)
    if [ "$current_count" -le "$EXPECTED_COUNT" ]; then
        echo "Checkpoint tar files reduced to $current_count (<= $EXPECTED_COUNT)"
        break
    fi
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
        echo "Timeout reached: Checkpoint tar files count is still $current_count (should be $EXPECTED_COUNT)"
        exit 1
    fi
    sleep 5
done
