Skip to content

Commit

Permalink
Update reading/writing without interruption after pod restart e2e test
Browse files Browse the repository at this point in the history
Signed-off-by: torredil <[email protected]>

(cherry picked from commit b5f141d)
  • Loading branch information
torredil authored and mskanth972 committed Aug 17, 2023
1 parent a681ab0 commit 163f545
Show file tree
Hide file tree
Showing 13 changed files with 149 additions and 75 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ ARG EFSUTILSSOURCE=github
RUN mkdir -p /tmp/rpms && \
if [ "$EFSUTILSSOURCE" = "yum" ]; \
then echo "Installing efs-utils from Amazon Linux 2 yum repo" && \
yum -y install --downloadonly --downloaddir=/tmp/rpms amazon-efs-utils-1.34.4-1.amzn2.noarch; \
yum -y install --downloadonly --downloaddir=/tmp/rpms amazon-efs-utils-1.35.0-1.amzn2.noarch; \
else echo "Installing efs-utils from github using the latest git tag" && \
yum -y install git rpm-build make && \
git clone https://github.com/aws/efs-utils && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ spec:
annotations: {{- toYaml . | nindent 8 }}
{{- end }}
spec:
hostNetwork: true
{{- if .Values.imagePullSecrets }}
imagePullSecrets:
{{- range .Values.imagePullSecrets }}
Expand Down
20 changes: 10 additions & 10 deletions charts/aws-efs-csi-driver/templates/node-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,9 @@ spec:
{{- with .Values.node.nodeSelector }}
{{- toYaml . | nindent 8 }}
{{- end }}
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: eks.amazonaws.com/compute-type
operator: NotIn
values:
- fargate
hostNetwork: true
{{- with .Values.node.affinity }}
affinity: {{- toYaml . | nindent 8 }}
{{- end }}
dnsPolicy: {{ .Values.node.dnsPolicy }}
{{- with .Values.node.dnsConfig }}
dnsConfig: {{- toYaml . | nindent 8 }}
Expand All @@ -77,9 +70,16 @@ spec:
- --endpoint=$(CSI_ENDPOINT)
- --logtostderr
- --v={{ .Values.node.logLevel }}
- --vol-metrics-opt-in={{ hasKey .Values.node "volMetricsOptIn" | ternary .Values.node.volMetricsOptIn false }}
- --vol-metrics-refresh-period={{ hasKey .Values.node "volMetricsRefreshPeriod" | ternary .Values.node.volMetricsRefreshPeriod 240 }}
- --vol-metrics-fs-rate-limit={{ hasKey .Values.node "volMetricsFsRateLimit" | ternary .Values.node.volMetricsFsRateLimit 5 }}
env:
- name: CSI_ENDPOINT
value: unix:/csi/csi.sock
- name: CSI_NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
{{- if .Values.useFIPS }}
- name: AWS_USE_FIPS_ENDPOINT
value: "true"
Expand Down
27 changes: 27 additions & 0 deletions charts/aws-efs-csi-driver/templates/node-serviceaccount.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,30 @@ metadata:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: efs-csi-node-role
labels:
app.kubernetes.io/name: {{ include "aws-efs-csi-driver.name" . }}
rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: efs-csi-node-binding
labels:
app.kubernetes.io/name: {{ include "aws-efs-csi-driver.name" . }}
subjects:
- kind: ServiceAccount
name: {{ .Values.node.serviceAccount.name }}
namespace: {{ .Release.Namespace }}
roleRef:
kind: ClusterRole
name: efs-csi-node-role
apiGroup: rbac.authorization.k8s.io

1 change: 0 additions & 1 deletion deploy/kubernetes/base/controller-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ spec:
app.kubernetes.io/name: aws-efs-csi-driver
app.kubernetes.io/instance: kustomize
spec:
hostNetwork: true
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: efs-csi-controller-sa
Expand Down
11 changes: 5 additions & 6 deletions deploy/kubernetes/base/node-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,16 @@ spec:
operator: NotIn
values:
- fargate
securityContext:
fsGroup: 0
runAsGroup: 0
runAsNonRoot: false
runAsUser: 0
hostNetwork: true
dnsPolicy: ClusterFirst
serviceAccountName: efs-csi-node-sa
priorityClassName: system-node-critical
tolerations:
- operator: Exists
securityContext:
fsGroup: 0
runAsGroup: 0
runAsNonRoot: false
runAsUser: 0
containers:
- name: efs-plugin
securityContext:
Expand Down
27 changes: 27 additions & 0 deletions deploy/kubernetes/base/node-serviceaccount.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,30 @@ metadata:
name: efs-csi-node-sa
labels:
app.kubernetes.io/name: aws-efs-csi-driver
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: efs-csi-node-role
labels:
app.kubernetes.io/name: aws-efs-csi-driver
rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: efs-csi-node-binding
labels:
app.kubernetes.io/name: aws-efs-csi-driver
subjects:
- kind: ServiceAccount
name: efs-csi-node-sa
namespace: default
roleRef:
kind: ClusterRole
name: efs-csi-node-role
apiGroup: rbac.authorization.k8s.io

8 changes: 4 additions & 4 deletions hack/e2e/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,15 @@ IMAGE_TAG=${IMAGE_TAG:-${TEST_ID}}

# kops: must include patch version (e.g. 1.19.1)
# eksctl: mustn't include patch version (e.g. 1.19)
K8S_VERSION_KOPS=${K8S_VERSION_KOPS:-${K8S_VERSION:-1.26.2}}
K8S_VERSION_EKSCTL=${K8S_VERSION_EKSCTL:-${K8S_VERSION:-1.25}}
K8S_VERSION_KOPS=${K8S_VERSION_KOPS:-${K8S_VERSION:-1.27.3}}
K8S_VERSION_EKSCTL=${K8S_VERSION_EKSCTL:-${K8S_VERSION:-1.27}}

KOPS_VERSION=${KOPS_VERSION:-1.26.2}
KOPS_VERSION=${KOPS_VERSION:-1.27.0-beta.3}
KOPS_STATE_FILE=${KOPS_STATE_FILE:-s3://k8s-kops-csi-e2e}
KOPS_PATCH_FILE=${KOPS_PATCH_FILE:-./hack/kops-patch.yaml}
KOPS_PATCH_NODE_FILE=${KOPS_PATCH_NODE_FILE:-./hack/kops-patch-node.yaml}

EKSCTL_VERSION=${EKSCTL_VERSION:-0.133.0}
EKSCTL_VERSION=${EKSCTL_VERSION:-0.151.0}
EKSCTL_PATCH_FILE=${EKSCTL_PATCH_FILE:-./hack/eksctl-patch.yaml}
EKSCTL_ADMIN_ROLE=${EKSCTL_ADMIN_ROLE:-}
# Creates a windows node group. The windows ami doesn't (yet) install csi-proxy
Expand Down
5 changes: 5 additions & 0 deletions hack/eksctl-patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ iam:
vpcResourceControllerPolicy: true
withOIDC: true
serviceAccounts:
- metadata:
name: efs-csi-node-sa
namespace: kube-system
wellKnownPolicies:
efsCSIController: true
- metadata:
name: efs-csi-controller-sa
namespace: kube-system
Expand Down
3 changes: 3 additions & 0 deletions hack/kops-patch-node.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
spec:
instanceMetadata:
httpTokens: optional
8 changes: 7 additions & 1 deletion hack/kops-patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,14 @@ spec:
"elasticfilesystem:CreateAccessPoint",
"elasticfilesystem:DeleteAccessPoint",
"elasticfilesystem:DescribeFileSystems",
"elasticfilesystem:DescribeAccessPoints"
"elasticfilesystem:DescribeAccessPoints",
"elasticfilesystem:DescribeMountTargets",
"ec2:DescribeAvailabilityZones"
],
"Resource": "*"
}
]
cloudConfig:
awsEBSCSIDriver:
managed: true
2 changes: 2 additions & 0 deletions hack/values_eksctl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ controller:
create: false # let eksctl create it
node:
logLevel: 5
serviceAccount:
create: false
109 changes: 58 additions & 51 deletions test/e2e/e2e.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,23 @@ package e2e
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/util/rand"
"os"
"strconv"
"strings"
"time"

"k8s.io/apimachinery/pkg/util/rand"

"github.com/onsi/ginkgo/v2"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2evolume "k8s.io/kubernetes/test/e2e/framework/volume"
storageframework "k8s.io/kubernetes/test/e2e/storage/framework"
"k8s.io/kubernetes/test/e2e/storage/testsuites"
"k8s.io/kubernetes/test/e2e/storage/utils"
Expand Down Expand Up @@ -274,56 +273,36 @@ var _ = ginkgo.Describe("[efs-csi] EFS CSI", func() {
framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, pod.Name, f.Namespace.Name), "waiting for pod running")
})

ginkgo.It("should continue reading/writing without hanging after the driver pod is restarted", func() {
ginkgo.By(fmt.Sprintf("Creating efs pvc & pv"))
pvc, pv, err := createEFSPVCPV(f.ClientSet, f.Namespace.Name, f.Namespace.Name, "", map[string]string{})
framework.ExpectNoError(err, "creating efs pvc & pv")
defer func() {
_ = f.ClientSet.CoreV1().PersistentVolumes().Delete(context.TODO(), pv.Name, metav1.DeleteOptions{})
}()
ginkgo.It("should continue reading/writing without interruption after the driver pod is restarted", func() {
const FilePath = "/mnt/testfile.txt"
const TestDuration = 30 * time.Second

node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
framework.ExpectNoError(err, "getting random ready schedulable node")
command := fmt.Sprintf("touch /mnt/volume1/%s-%s && trap exit TERM; while true; do sleep 1; done", f.Namespace.Name, time.Now().Format(time.RFC3339))
ginkgo.By("Creating EFS PVC and associated PV")
pvc, pv, err := createEFSPVCPV(f.ClientSet, f.Namespace.Name, f.Namespace.Name, "", map[string]string{})
framework.ExpectNoError(err)
defer f.ClientSet.CoreV1().PersistentVolumes().Delete(context.TODO(), pv.Name, metav1.DeleteOptions{})

ginkgo.By(fmt.Sprintf("Creating pod on node %q to mount pvc %q and run %q", node.Name, pvc.Name, command))
pod := e2epod.MakePod(f.Namespace.Name, nil, []*v1.PersistentVolumeClaim{pvc}, false, command)
pod.Spec.NodeName = node.Name
ginkgo.By("Deploying a pod to write data")
writeCommand := fmt.Sprintf("while true; do date +%%s >> %s; sleep 1; done", FilePath)
pod := e2epod.MakePod(f.Namespace.Name, nil, []*v1.PersistentVolumeClaim{pvc}, false, writeCommand)
pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(context.TODO(), pod, metav1.CreateOptions{})
framework.ExpectNoError(err, "creating pod")
framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, pod.Name, f.Namespace.Name), "waiting for pod running")
framework.ExpectNoError(err)
framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, pod.Name, f.Namespace.Name))
defer f.ClientSet.CoreV1().Pods(f.Namespace.Name).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{})

ginkgo.By(fmt.Sprintf("Getting driver pod on node %q", node.Name))
labelSelector := labels.SelectorFromSet(EfsDriverLabelSelectors).String()
fieldSelector := fields.SelectorFromSet(fields.Set{"spec.nodeName": node.Name}).String()
podList, err := f.ClientSet.CoreV1().Pods(EfsDriverNamespace).List(
context.TODO(),
metav1.ListOptions{
LabelSelector: labelSelector,
FieldSelector: fieldSelector,
})
framework.ExpectNoError(err, "getting driver pod")
framework.ExpectEqual(len(podList.Items), 1, "expected 1 efs csi node pod but got %d", len(podList.Items))
driverPod := podList.Items[0]

ginkgo.By(fmt.Sprintf("Deleting driver pod %q on node %q", driverPod.Name, node.Name))
err = e2epod.DeletePodWithWaitByName(f.ClientSet, driverPod.Name, EfsDriverNamespace)
framework.ExpectNoError(err, "deleting driver pod")

ginkgo.By(fmt.Sprintf("Execing a write via the pod on node %q", node.Name))
command = fmt.Sprintf("touch /mnt/volume1/%s-%s", f.Namespace.Name, time.Now().Format(time.RFC3339))
done := make(chan bool)
go func() {
defer ginkgo.GinkgoRecover()
e2evolume.VerifyExecInPodSucceed(f, pod, command)
done <- true
}()
select {
case <-done:
framework.Logf("verified exec in pod succeeded")
case <-time.After(30 * time.Second):
framework.Failf("timed out verifying exec in pod succeeded")
}
ginkgo.By("Triggering a restart for the EFS CSI Node DaemonSet")
_, err = framework.RunKubectl("kube-system", "rollout", "restart", "daemonset", "efs-csi-node")
framework.ExpectNoError(err)

time.Sleep(TestDuration)

ginkgo.By("Validating no interruption")
readCommand := fmt.Sprintf("cat %s", FilePath)
content, err := framework.RunKubectl(f.Namespace.Name, "exec", pod.Name, "--", "/bin/sh", "-c", readCommand)
framework.ExpectNoError(err)

timestamps := strings.Split(strings.TrimSpace(content), "\n")
checkInterruption(timestamps)
})

testEncryptInTransit := func(f *framework.Framework, encryptInTransit *bool) {
Expand Down Expand Up @@ -462,3 +441,31 @@ func makeDir(path string) error {
}
return nil
}

// checkInterruption verifies that the given timestamps — each a string
// containing an integer number of seconds — form a sequence in which no two
// consecutive entries differ by more than 1.
//
// It is used to assert that reading/writing to a file was never stalled for
// more than 1 second at a time, even while the driver pod is being restarted.
func checkInterruption(timestamps []string) {
	var prev int64

	for i, raw := range timestamps {
		ts, err := strconv.ParseInt(raw, 10, 64)
		framework.ExpectNoError(err)

		// Compare only from the second entry onward; the first has no predecessor.
		if i > 0 && ts-prev > 1 {
			framework.Failf("Detected an interruption. Time gap: %d seconds.", ts-prev)
		}

		prev = ts
	}
}

0 comments on commit 163f545

Please sign in to comment.