Add CI workflow for checkpoint garbage collection
Implemented a GitHub Actions workflow to test the checkpoint
restore operator's ability to correctly delete checkpoints
through garbage collection.

- Added `wait_for_checkpoint_reduction.sh` script to check the
  number of checkpoint tar files and ensure it is reduced to 2
  within a 60-second timeout.
- Added `generate_checkpoint_tar.sh` script to correctly
  create and move checkpoint tar files.

Signed-off-by: Parthiba-Hazra <[email protected]>
Parthiba-Hazra committed Jul 8, 2024
1 parent 832b08e commit 6674bbe
Showing 6 changed files with 118 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/tests.yaml
@@ -46,3 +46,14 @@ jobs:

      - name: Check resources
        run: kubectl get all -n checkpoint-restore-operator-system

      - name: Generate Checkpoint Tar Files
        run: |
          sudo mkdir -p /var/lib/kubelet/checkpoints
          sudo chmod 777 /var/lib/kubelet/checkpoints
          ./test/generate_checkpoint_tar.sh
      - name: Wait for Checkpoint Tar Files to Reduce
        run: |
          kubectl apply -f ./test/test_checkpointrestoreoperator.yaml
          ./test/wait_for_checkpoint_reduction.sh
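
The two new steps can also be reproduced outside CI against a cluster that already has the operator deployed; a minimal sketch using the same paths and commands as the workflow above (run from the repository root, requires sudo):

sudo mkdir -p /var/lib/kubelet/checkpoints
sudo chmod 777 /var/lib/kubelet/checkpoints
./test/generate_checkpoint_tar.sh                              # seeds five dummy checkpoint archives
kubectl apply -f ./test/test_checkpointrestoreoperator.yaml    # applies the retention policy shown below
./test/wait_for_checkpoint_reduction.sh                        # succeeds once only two archives remain
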
12 changes: 12 additions & 0 deletions test/data/config.dump
@@ -0,0 +1,12 @@
{
  "id": "6924be1bd90c23f10e2667102b0ee0f74f09bba78b6661871e733cb3b1737821",
  "name": "k8s_container-name_deployment-name_default_6975ee47-6765-45dc-9a2b-1e38d51031f7_0",
  "rootfsImage": "docker.io/library/nginx@sha256:161ef4b1bf7effb350a2a9625cb2b59f69d54ec6059a8a155a1438d0439c593c",
  "rootfsImageRef": "a8758716bb6aa4d90071160d27028fe4eaee7ce8166221a97d30440c8eac2be6",
  "rootfsImageName": "docker.io/library/nginx:latest",
  "runtime": "runc",
  "createdTime": "2024-01-27T14:45:26.083444055Z",
  "checkpointedTime": "2024-01-28T00:10:45.673538606+05:30",
  "restoredTime": "0001-01-01T00:00:00Z",
  "restored": false
}
10 changes: 10 additions & 0 deletions test/data/spec.dump
@@ -0,0 +1,10 @@
{
  "annotations": {
    "io.container.manager": "cri-o",
    "io.kubernetes.container.hash": "1511917a",
    "io.kubernetes.container.name": "containername",
    "io.kubernetes.pod.name": "podname",
    "io.kubernetes.pod.namespace": "namespace",
    "io.kubernetes.cri-o.Metadata": "{\"name\":\"containername\"}"
  }
}
37 changes: 37 additions & 0 deletions test/generate_checkpoint_tar.sh
@@ -0,0 +1,37 @@
#!/bin/bash

DATA_DIR="./test/data"
if [ ! -d "$DATA_DIR" ]; then
    echo "Data directory '$DATA_DIR' does not exist."
    exit 1
fi

CONFIG_DUMP="$DATA_DIR/config.dump"
SPEC_DUMP="$DATA_DIR/spec.dump"
if [ ! -f "$CONFIG_DUMP" ] || [ ! -f "$SPEC_DUMP" ]; then
    echo "config.dump and/or spec.dump files are missing in the data directory."
    exit 1
fi

TEMP_DIR=$(mktemp -d)

cp "$CONFIG_DUMP" "$TEMP_DIR"
cp "$SPEC_DUMP" "$TEMP_DIR"

# Pack the dump files into a tar archive and move it into the kubelet checkpoint directory
create_checkpoint_tar() {
    local tar_name=$1
    local original_name=$2
    tar -cf "$tar_name" -C "$TEMP_DIR" .
    sudo mv "$tar_name" "/var/lib/kubelet/checkpoints/$original_name"
    echo "Checkpoint tar file created at /var/lib/kubelet/checkpoints/$original_name"
}

# Create five archives named after the same pod/namespace/container;
# sleep 1s between iterations so each gets a unique per-second timestamp
for _ in {1..5}; do
    TIMESTAMP=$(date +%Y-%m-%dT%H:%M:%S)
    TAR_NAME="checkpoint.tar"
    ORIGINAL_NAME="checkpoint-podname_namespace-containername-$TIMESTAMP.tar"
    create_checkpoint_tar "$TAR_NAME" "$ORIGINAL_NAME"
    sleep 1
done

rm -rf "$TEMP_DIR"
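
After a run, the script leaves five archives in /var/lib/kubelet/checkpoints, all for the same pod, namespace, and container but with distinct timestamps; the listing would look roughly like this (timestamps illustrative):

$ ls /var/lib/kubelet/checkpoints
checkpoint-podname_namespace-containername-2024-07-08T10:00:01.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:02.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:03.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:04.tar
checkpoint-podname_namespace-containername-2024-07-08T10:00:05.tar
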
22 changes: 22 additions & 0 deletions test/test_checkpointrestoreoperator.yaml
@@ -0,0 +1,22 @@
apiVersion: criu.org/v1
kind: CheckpointRestoreOperator
metadata:
  labels:
    app.kubernetes.io/name: checkpointrestoreoperator
    app.kubernetes.io/instance: checkpointrestoreoperator-sample
    app.kubernetes.io/part-of: checkpoint-restore-operator
    app.kubernetes.io/managed-by: kustomize
    app.kubernetes.io/created-by: checkpoint-restore-operator
  name: checkpointrestoreoperator-sample
spec:
  checkpointDirectory: /var/lib/kubelet/checkpoints
  applyPoliciesImmediately: true
  globalPolicy:
    maxCheckpointsPerNamespace: 10
    maxCheckpointsPerPod: 10
    maxCheckpointsPerContainer: 10
  containerPolicies:
    - namespace: namespace
      pod: podname
      container: containername
      maxCheckpoints: 2
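
The per-container policy is what the CI test exercises: with maxCheckpoints: 2 for namespace/podname/containername, the five archives seeded by generate_checkpoint_tar.sh should be garbage-collected down to two, which is exactly the EXPECTED_COUNT the wait script below checks for. A quick manual check after the operator has reconciled, using the same count logic as that script:

find /var/lib/kubelet/checkpoints -maxdepth 1 -name '*.tar' | wc -l    # expected to print 2
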
26 changes: 26 additions & 0 deletions test/wait_for_checkpoint_reduction.sh
@@ -0,0 +1,26 @@
#!/bin/bash

CHECKPOINTS_DIR="/var/lib/kubelet/checkpoints"
EXPECTED_COUNT=2
TIMEOUT=60
start_time=$(date +%s)

count_tar_files() {
    find "$CHECKPOINTS_DIR" -maxdepth 1 -name "*.tar" -print | wc -l
}

# Wait for the checkpoint tar files to be reduced from 5 to 2
while true; do
    current_count=$(count_tar_files)
    if [ "$current_count" -le "$EXPECTED_COUNT" ]; then
        echo "Checkpoint tar files reduced to $current_count (<= $EXPECTED_COUNT)"
        break
    fi
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
        echo "Timeout reached: Checkpoint tar files count is still $current_count (should be $EXPECTED_COUNT)"
        exit 1
    fi
    sleep 5
done
