diff --git a/Dockerfile b/Dockerfile
index 325e9ec20..15b34de7c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -36,7 +36,7 @@ ARG EFSUTILSSOURCE=github
 RUN mkdir -p /tmp/rpms && \
   if [ "$EFSUTILSSOURCE" = "yum" ]; \
   then echo "Installing efs-utils from Amazon Linux 2 yum repo" && \
-    yum -y install --downloadonly --downloaddir=/tmp/rpms amazon-efs-utils-1.34.4-1.amzn2.noarch; \
+    yum -y install --downloadonly --downloaddir=/tmp/rpms amazon-efs-utils-1.35.0-1.amzn2.noarch; \
   else echo "Installing efs-utils from github using the latest git tag" && \
     yum -y install git rpm-build make && \
     git clone https://github.com/aws/efs-utils && \
diff --git a/charts/aws-efs-csi-driver/templates/controller-deployment.yaml b/charts/aws-efs-csi-driver/templates/controller-deployment.yaml
index 26b663ffd..90e219f4b 100644
--- a/charts/aws-efs-csi-driver/templates/controller-deployment.yaml
+++ b/charts/aws-efs-csi-driver/templates/controller-deployment.yaml
@@ -27,7 +27,6 @@ spec:
       annotations: {{- toYaml . | nindent 8 }}
       {{- end }}
     spec:
-      hostNetwork: true
       {{- if .Values.imagePullSecrets }}
       imagePullSecrets:
       {{- range .Values.imagePullSecrets }}
diff --git a/charts/aws-efs-csi-driver/templates/node-daemonset.yaml b/charts/aws-efs-csi-driver/templates/node-daemonset.yaml
index 0443f0a57..acaebe22a 100644
--- a/charts/aws-efs-csi-driver/templates/node-daemonset.yaml
+++ b/charts/aws-efs-csi-driver/templates/node-daemonset.yaml
@@ -44,16 +44,9 @@ spec:
       {{- with .Values.node.nodeSelector }}
        {{- toYaml . | nindent 8 }}
       {{- end }}
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: eks.amazonaws.com/compute-type
-                operator: NotIn
-                values:
-                - fargate
-      hostNetwork: true
+      {{- with .Values.node.affinity }}
+      affinity: {{- toYaml . | nindent 8 }}
+      {{- end }}
       dnsPolicy: {{ .Values.node.dnsPolicy }}
       {{- with .Values.node.dnsConfig }}
       dnsConfig: {{- toYaml . | nindent 8 }}
@@ -77,9 +70,16 @@ spec:
             - --endpoint=$(CSI_ENDPOINT)
             - --logtostderr
            - --v={{ .Values.node.logLevel }}
+            - --vol-metrics-opt-in={{ hasKey .Values.node "volMetricsOptIn" | ternary .Values.node.volMetricsOptIn false }}
+            - --vol-metrics-refresh-period={{ hasKey .Values.node "volMetricsRefreshPeriod" | ternary .Values.node.volMetricsRefreshPeriod 240 }}
+            - --vol-metrics-fs-rate-limit={{ hasKey .Values.node "volMetricsFsRateLimit" | ternary .Values.node.volMetricsFsRateLimit 5 }}
           env:
             - name: CSI_ENDPOINT
               value: unix:/csi/csi.sock
+            - name: CSI_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
            {{- if .Values.useFIPS }}
            - name: AWS_USE_FIPS_ENDPOINT
              value: "true"
diff --git a/charts/aws-efs-csi-driver/templates/node-serviceaccount.yaml b/charts/aws-efs-csi-driver/templates/node-serviceaccount.yaml
index 9fd3c7a0f..a63383bf0 100644
--- a/charts/aws-efs-csi-driver/templates/node-serviceaccount.yaml
+++ b/charts/aws-efs-csi-driver/templates/node-serviceaccount.yaml
@@ -10,3 +10,30 @@ metadata:
     {{- toYaml . | nindent 4 }}
   {{- end }}
 {{- end }}
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: efs-csi-node-role
+  labels:
+    app.kubernetes.io/name: {{ include "aws-efs-csi-driver.name" . }}
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: efs-csi-node-binding
+  labels:
+    app.kubernetes.io/name: {{ include "aws-efs-csi-driver.name" . }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ .Values.node.serviceAccount.name }}
+    namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: ClusterRole
+  name: efs-csi-node-role
+  apiGroup: rbac.authorization.k8s.io
+ 
\ No newline at end of file
diff --git a/deploy/kubernetes/base/controller-deployment.yaml b/deploy/kubernetes/base/controller-deployment.yaml
index 49e9ec262..08ad518eb 100644
--- a/deploy/kubernetes/base/controller-deployment.yaml
+++ b/deploy/kubernetes/base/controller-deployment.yaml
@@ -21,7 +21,6 @@ spec:
         app.kubernetes.io/name: aws-efs-csi-driver
         app.kubernetes.io/instance: kustomize
     spec:
-      hostNetwork: true
       nodeSelector:
         kubernetes.io/os: linux
       serviceAccountName: efs-csi-controller-sa
diff --git a/deploy/kubernetes/base/node-daemonset.yaml b/deploy/kubernetes/base/node-daemonset.yaml
index 3ec601538..8d5261e7b 100644
--- a/deploy/kubernetes/base/node-daemonset.yaml
+++ b/deploy/kubernetes/base/node-daemonset.yaml
@@ -31,17 +31,16 @@ spec:
                 operator: NotIn
                 values:
                 - fargate
-      securityContext:
-        fsGroup: 0
-        runAsGroup: 0
-        runAsNonRoot: false
-        runAsUser: 0
-      hostNetwork: true
       dnsPolicy: ClusterFirst
       serviceAccountName: efs-csi-node-sa
       priorityClassName: system-node-critical
       tolerations:
         - operator: Exists
+      securityContext:
+        fsGroup: 0
+        runAsGroup: 0
+        runAsNonRoot: false
+        runAsUser: 0
       containers:
         - name: efs-plugin
           securityContext:
diff --git a/deploy/kubernetes/base/node-serviceaccount.yaml b/deploy/kubernetes/base/node-serviceaccount.yaml
index 9840fb6ad..b51a610fe 100644
--- a/deploy/kubernetes/base/node-serviceaccount.yaml
+++ b/deploy/kubernetes/base/node-serviceaccount.yaml
@@ -6,3 +6,30 @@ metadata:
   name: efs-csi-node-sa
   labels:
     app.kubernetes.io/name: aws-efs-csi-driver
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: efs-csi-node-role
+  labels:
+    app.kubernetes.io/name: aws-efs-csi-driver
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: efs-csi-node-binding
+  labels:
+    app.kubernetes.io/name: aws-efs-csi-driver
+subjects:
+  - kind: ServiceAccount
+    name: efs-csi-node-sa
+    namespace: default
+roleRef:
+  kind: ClusterRole
+  name: efs-csi-node-role
+  apiGroup: rbac.authorization.k8s.io
+ 
\ No newline at end of file
diff --git a/hack/e2e/run.sh b/hack/e2e/run.sh
index 28cc74ed9..8431bab8b 100755
--- a/hack/e2e/run.sh
+++ b/hack/e2e/run.sh
@@ -49,15 +49,15 @@ IMAGE_TAG=${IMAGE_TAG:-${TEST_ID}}
 # kops: must include patch version (e.g. 1.19.1)
 # eksctl: mustn't include patch version (e.g. 1.19)
-K8S_VERSION_KOPS=${K8S_VERSION_KOPS:-${K8S_VERSION:-1.26.2}}
-K8S_VERSION_EKSCTL=${K8S_VERSION_EKSCTL:-${K8S_VERSION:-1.25}}
+K8S_VERSION_KOPS=${K8S_VERSION_KOPS:-${K8S_VERSION:-1.27.3}}
+K8S_VERSION_EKSCTL=${K8S_VERSION_EKSCTL:-${K8S_VERSION:-1.27}}
 
-KOPS_VERSION=${KOPS_VERSION:-1.26.2}
+KOPS_VERSION=${KOPS_VERSION:-1.27.0-beta.3}
 KOPS_STATE_FILE=${KOPS_STATE_FILE:-s3://k8s-kops-csi-e2e}
 KOPS_PATCH_FILE=${KOPS_PATCH_FILE:-./hack/kops-patch.yaml}
 KOPS_PATCH_NODE_FILE=${KOPS_PATCH_NODE_FILE:-./hack/kops-patch-node.yaml}
 
-EKSCTL_VERSION=${EKSCTL_VERSION:-0.133.0}
+EKSCTL_VERSION=${EKSCTL_VERSION:-0.151.0}
 EKSCTL_PATCH_FILE=${EKSCTL_PATCH_FILE:-./hack/eksctl-patch.yaml}
 EKSCTL_ADMIN_ROLE=${EKSCTL_ADMIN_ROLE:-}
 
 # Creates a windows node group. The windows ami doesn't (yet) install csi-proxy
diff --git a/hack/eksctl-patch.yaml b/hack/eksctl-patch.yaml
index 052eb6756..9ceb66314 100644
--- a/hack/eksctl-patch.yaml
+++ b/hack/eksctl-patch.yaml
@@ -2,6 +2,11 @@ iam:
   vpcResourceControllerPolicy: true
   withOIDC: true
   serviceAccounts:
+  - metadata:
+      name: efs-csi-node-sa
+      namespace: kube-system
+    wellKnownPolicies:
+      efsCSIController: true
   - metadata:
       name: efs-csi-controller-sa
       namespace: kube-system
diff --git a/hack/kops-patch-node.yaml b/hack/kops-patch-node.yaml
new file mode 100644
index 000000000..c793cd8ab
--- /dev/null
+++ b/hack/kops-patch-node.yaml
@@ -0,0 +1,3 @@
+spec:
+  instanceMetadata:
+    httpTokens: optional
diff --git a/hack/kops-patch.yaml b/hack/kops-patch.yaml
index 05c8ee5cc..39d183239 100644
--- a/hack/kops-patch.yaml
+++ b/hack/kops-patch.yaml
@@ -8,8 +8,14 @@ spec:
         "elasticfilesystem:CreateAccessPoint",
         "elasticfilesystem:DeleteAccessPoint",
         "elasticfilesystem:DescribeFileSystems",
-        "elasticfilesystem:DescribeAccessPoints"
+        "elasticfilesystem:DescribeAccessPoints",
+        "elasticfilesystem:DescribeMountTargets",
+        "ec2:DescribeAvailabilityZones"
       ],
       "Resource": "*"
     }
   ]
+
+  cloudConfig:
+    awsEBSCSIDriver:
+      managed: true
diff --git a/hack/values_eksctl.yaml b/hack/values_eksctl.yaml
index 5b1e27d56..f99eec38d 100644
--- a/hack/values_eksctl.yaml
+++ b/hack/values_eksctl.yaml
@@ -4,3 +4,5 @@ controller:
     create: false # let eksctl create it
 node:
   logLevel: 5
+  serviceAccount:
+    create: false
diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go
index a3d5ea3d2..ac228a7a9 100644
--- a/test/e2e/e2e.go
+++ b/test/e2e/e2e.go
@@ -3,24 +3,23 @@ package e2e
 import (
 	"context"
 	"fmt"
-	"k8s.io/apimachinery/pkg/util/rand"
 	"os"
+	"strconv"
+	"strings"
 	"time"
 
+	"k8s.io/apimachinery/pkg/util/rand"
+
 	"github.com/onsi/ginkgo/v2"
 	v1 "k8s.io/api/core/v1"
 	storagev1 "k8s.io/api/storage/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/fields"
-	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/sets"
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/test/e2e/framework"
-	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
-	e2evolume "k8s.io/kubernetes/test/e2e/framework/volume"
 	storageframework "k8s.io/kubernetes/test/e2e/storage/framework"
 	"k8s.io/kubernetes/test/e2e/storage/testsuites"
 	"k8s.io/kubernetes/test/e2e/storage/utils"
@@ -274,56 +273,36 @@ var _ = ginkgo.Describe("[efs-csi] EFS CSI", func() {
 			framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, pod.Name, f.Namespace.Name), "waiting for pod running")
 		})
 
-		ginkgo.It("should continue reading/writing without hanging after the driver pod is restarted", func() {
-			ginkgo.By(fmt.Sprintf("Creating efs pvc & pv"))
-			pvc, pv, err := createEFSPVCPV(f.ClientSet, f.Namespace.Name, f.Namespace.Name, "", map[string]string{})
-			framework.ExpectNoError(err, "creating efs pvc & pv")
-			defer func() {
-				_ = f.ClientSet.CoreV1().PersistentVolumes().Delete(context.TODO(), pv.Name, metav1.DeleteOptions{})
-			}()
+		ginkgo.It("should continue reading/writing without interruption after the driver pod is restarted", func() {
+			const FilePath = "/mnt/testfile.txt"
+			const TestDuration = 30 * time.Second
 
-			node, err := e2enode.GetRandomReadySchedulableNode(f.ClientSet)
-			framework.ExpectNoError(err, "getting random ready schedulable node")
-			command := fmt.Sprintf("touch /mnt/volume1/%s-%s && trap exit TERM; while true; do sleep 1; done", f.Namespace.Name, time.Now().Format(time.RFC3339))
+			ginkgo.By("Creating EFS PVC and associated PV")
+			pvc, pv, err := createEFSPVCPV(f.ClientSet, f.Namespace.Name, f.Namespace.Name, "", map[string]string{})
+			framework.ExpectNoError(err)
+			defer f.ClientSet.CoreV1().PersistentVolumes().Delete(context.TODO(), pv.Name, metav1.DeleteOptions{})
 
-			ginkgo.By(fmt.Sprintf("Creating pod on node %q to mount pvc %q and run %q", node.Name, pvc.Name, command))
-			pod := e2epod.MakePod(f.Namespace.Name, nil, []*v1.PersistentVolumeClaim{pvc}, false, command)
-			pod.Spec.NodeName = node.Name
+			ginkgo.By("Deploying a pod to write data")
+			writeCommand := fmt.Sprintf("while true; do date +%%s >> %s; sleep 1; done", FilePath)
+			pod := e2epod.MakePod(f.Namespace.Name, nil, []*v1.PersistentVolumeClaim{pvc}, false, writeCommand)
 			pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(context.TODO(), pod, metav1.CreateOptions{})
-			framework.ExpectNoError(err, "creating pod")
-			framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, pod.Name, f.Namespace.Name), "waiting for pod running")
+			framework.ExpectNoError(err)
+			framework.ExpectNoError(e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, pod.Name, f.Namespace.Name))
+			defer f.ClientSet.CoreV1().Pods(f.Namespace.Name).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{})
 
-			ginkgo.By(fmt.Sprintf("Getting driver pod on node %q", node.Name))
-			labelSelector := labels.SelectorFromSet(EfsDriverLabelSelectors).String()
-			fieldSelector := fields.SelectorFromSet(fields.Set{"spec.nodeName": node.Name}).String()
-			podList, err := f.ClientSet.CoreV1().Pods(EfsDriverNamespace).List(
-				context.TODO(),
-				metav1.ListOptions{
-					LabelSelector: labelSelector,
-					FieldSelector: fieldSelector,
-				})
-			framework.ExpectNoError(err, "getting driver pod")
-			framework.ExpectEqual(len(podList.Items), 1, "expected 1 efs csi node pod but got %d", len(podList.Items))
-			driverPod := podList.Items[0]
-
-			ginkgo.By(fmt.Sprintf("Deleting driver pod %q on node %q", driverPod.Name, node.Name))
-			err = e2epod.DeletePodWithWaitByName(f.ClientSet, driverPod.Name, EfsDriverNamespace)
-			framework.ExpectNoError(err, "deleting driver pod")
-
-			ginkgo.By(fmt.Sprintf("Execing a write via the pod on node %q", node.Name))
-			command = fmt.Sprintf("touch /mnt/volume1/%s-%s", f.Namespace.Name, time.Now().Format(time.RFC3339))
-			done := make(chan bool)
-			go func() {
-				defer ginkgo.GinkgoRecover()
-				e2evolume.VerifyExecInPodSucceed(f, pod, command)
-				done <- true
-			}()
-			select {
-			case <-done:
-				framework.Logf("verified exec in pod succeeded")
-			case <-time.After(30 * time.Second):
-				framework.Failf("timed out verifying exec in pod succeeded")
-			}
+			ginkgo.By("Triggering a restart for the EFS CSI Node DaemonSet")
+			_, err = framework.RunKubectl("kube-system", "rollout", "restart", "daemonset", "efs-csi-node")
+			framework.ExpectNoError(err)
+
+			time.Sleep(TestDuration)
+
+			ginkgo.By("Validating no interruption")
+			readCommand := fmt.Sprintf("cat %s", FilePath)
+			content, err := framework.RunKubectl(f.Namespace.Name, "exec", pod.Name, "--", "/bin/sh", "-c", readCommand)
+			framework.ExpectNoError(err)
+
+			timestamps := strings.Split(strings.TrimSpace(content), "\n")
+			checkInterruption(timestamps)
 		})
 
 		testEncryptInTransit := func(f *framework.Framework, encryptInTransit *bool) {
@@ -462,3 +441,31 @@ func makeDir(path string) error {
 	}
 	return nil
 }
+
+// checkInterruption takes a slice of strings, where each string is expected to
+// be an integer representing a timestamp. It checks that the difference between each successive
+// pair of integers is not greater than 1.
+//
+// This function is used to check that reading/writing to a file was not
+// interrupted for more than 1 second at a time, even when the driver pod is
+// restarted.
+func checkInterruption(timestamps []string) {
+	var curr int64
+	var err error
+
+	for i, t := range timestamps {
+		if i == 0 {
+			curr, err = strconv.ParseInt(t, 10, 64)
+			framework.ExpectNoError(err)
+			continue
+		}
+
+		next, err := strconv.ParseInt(t, 10, 64)
+		framework.ExpectNoError(err)
+		if next-curr > 1 {
+			framework.Failf("Detected an interruption. Time gap: %d seconds.", next-curr)
+		}
+
+		curr = next
+	}
+}
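
For anyone exercising the chart changes above, a minimal values.yaml override might look like the sketch below. It is illustrative, not part of the patch: the node.affinity block simply restores the Fargate exclusion that the node DaemonSet template previously hardcoded, and the volMetrics* keys are the ones the new container arguments read, shown here with opt-in enabled and the other two set to the template's fallback defaults.

node:
  # Keep the node DaemonSet off EKS Fargate, as the template used to do unconditionally.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: eks.amazonaws.com/compute-type
            operator: NotIn
            values:
            - fargate
  # Wired through to the efs-plugin container args by the template above.
  volMetricsOptIn: true
  volMetricsRefreshPeriod: 240
  volMetricsFsRateLimit: 5

If these keys are left unset, the hasKey/ternary pipeline in the template falls back to false, 240, and 5 respectively, so existing installs keep the current behavior.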