KernelBug.md

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: centos
spec:
  containers:
  - name: centos
    image: centos
    ports:
    - containerPort: 80
    command:
    - sleep
    - "3600"
EOF

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: debian
spec:
  containers:
  - name: debian
    image: debian
    ports:
    - containerPort: 80
    command:
    - sleep
    - "3600"
EOF



A known workaround for the DNS timeouts is to append 'options single-request-reopen' to /etc/resolv.conf in each pod, for example via a postStart lifecycle hook:

postStart:
  exec:
    command:
    - /bin/sh
    - -c
    - "/bin/echo 'options single-request-reopen' >> /etc/resolv.conf"

cat <<EOF | kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
  name: centoss-deployment
  labels:
    app: centos
spec:
  replicas: 1
  selector:
    matchLabels:
      app: centoss
  template:
    metadata:
      labels:
        app: centoss
    spec:
      containers:
      - name: centoss
        image: centos
        ports:
        - containerPort: 80
        command:
        - sleep
        - "3600"
EOF


for i in {1..1000}; do curl -s -w "%{time_total}\n" -o /dev/null http://www.microsoft.com/; done
for i in {1..1000}; do curl -s -w "%{time_total}\n" -o /dev/null http://137.117.245.83; done
kubectl delete pod -n kube-system --selector="k8s-app=kube-dns"
kubectl delete pod -n kube-system --selector="component=azure-cni-networkmonitor"
kubectl delete pod -n kube-system --selector="component=kube-svc-redirect"
kubectl delete pod centos
kubectl delete deployment centoss-deployment

while(true); do sleep 1; kubectl delete pod -n kube-system --selector "component=azure-cni-networkmonitor"; done
(the same loop can be run with the selector "component=kube-svc-redirect")

sudo apt-get update
sudo apt-get install linux-image-4.15.0-1030-azure 
sudo systemctl reboot
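
After the reboot it is worth confirming the node actually runs the patched kernel (quick check on each node; 4.15.0-1030-azure is the image installed above):

uname -r
# should print 4.15.0-1030-azure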


Repro on the Azure CNI cluster:
kubectl exec -ti centos -- /bin/bash
[root@centos /]# for i in {1..100}; do curl -s -w "%{time_total}\n" -o /dev/null http://www.bing.com/; done
5.593
5.596
0.096
0.101
5.602
0.110

for i in `seq 1 1000`;do time nslookup kubernetes.default; done

We are still getting customer feedback complaining about DNS timeouts after applying the kernel patch. What is different is that the timeouts are no longer visible in empty clusters, so here is the set of things I did to generate some load.

I used a CentOS pod for testing:



cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: centos
spec:
  containers:
  - name: centos
    image: centos
    ports:
    - containerPort: 80
    command:
    - sleep
    - "3600"
EOF


cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: debian
spec:
  containers:
  - name: debian
    image: debian
    ports:
    - containerPort: 80
    command:
    - sleep
    - "3600"
EOF

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: dnstools
spec:
  containers:
  - name: dnstools
    image: ngayall/dnstools
    ports:
    - containerPort: 80
    command:
    - sleep
    - "3600"
EOF

dnstools# dig +short @169.254.20.10  google.com
216.58.204.238

dnstools# dig +short @10.0.0.10  google.com
;; connection timed out; no servers could be reached

dnstools# dig +short google.com
;; connection timed out; no servers could be reached
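
When queries through the kube-dns service IP (10.0.0.10) time out, a quick sanity check is whether the cluster DNS service still has endpoints behind its ClusterIP (assuming the service is named kube-dns, matching the k8s-app=kube-dns selector used earlier):

kubectl get svc -n kube-system kube-dns
kubectl get endpoints -n kube-system kube-dns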


cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: redis
  labels:
    name: redis
spec:
  containers:
  - name: redis
    image: redis:4.0.11-alpine
    args: ["--requirepass", "MySuperSecretRedis"]
    ports:
    - containerPort: 6379
---
apiVersion: v1
kind: Pod
metadata:
  name: rediscli
  labels:
    name: rediscli
spec:
  containers:
  - name: redis
    image: redis
---
apiVersion: v1
kind: Service
metadata:
  name: redis-svc
  labels:
    name: redis-svc
spec:
  selector:
    name: redis
  type: ClusterIP
  ports:
   - port: 6379
     targetPort: 6379
     protocol: TCP
EOF

redis-cli -h redis-svc -p 6379 -a MySuperSecretRedis ping
redis-cli -h redis-svc -p 6379 ping

I also used Bing for testing the DNS timeout (just in case Google was messing with us ;)
kubectl exec -ti centos -- /bin/bash
for i in {1..100}; do curl -s -w "%{time_total}\n" -o /dev/null http://www.google.com/; done

Without the kernel patch, the DNS timeout occurs like clockwork, roughly every 5-10 calls:
[root@centos /]# for i in {1..100}; do curl -s -w "%{time_total}\n" -o /dev/null http://www.bing.com/; done
5.593
5.596
0.096
0.101
5.602

After applying the kernel patch and rebooting all nodes I can no longer repro the DNS timeout on kubenet-based clusters (that is promising). However, I can still repro it on my Azure CNI based cluster with the following setup (and some load in the cluster).

I need some load inside the cluster, so I launch the Azure voting sample:
kubectl apply -f https://raw.githubusercontent.com/Azure-Samples/azure-voting-app-redis/master/azure-vote-all-in-one-redis.yaml
Scale the deployments to generate some load:
kubectl scale --replicas=20 deployment/azure-vote-front
kubectl scale --replicas=3 deployment/azure-vote-back

I use chaoskube to kill random pods at a short interval (one per second with the manifest below):
https://github.com/linki/chaoskube



cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: chaoskube
spec:
  containers:
  - name: chaoskube
    image: quay.io/linki/chaoskube:v0.11.0
    args:
    # kill a pod every second
    - --interval=0m1s
    # only target the azure-vote-front pods
    - --labels=app=azure-vote-front
    # exclude all pods in the kube-system namespace
    - --namespaces=!kube-system
    - --no-dry-run
EOF

In addition, generate some random traffic hitting the frontend svc of the voting app:
SERVICE_IP=$(kubectl get svc azure-vote-front --template="{{range .status.loadBalancer.ingress}}{{.ip}}{{end}}")
for i in {1..1000}; do curl -s -w "%{time_total}\n" -o /dev/null http://$SERVICE_IP; done

Now, running the following from the centos pod, I get a DNS timeout roughly once in 300 curls to Bing.

var=1;
while true ; do
  res=$( { curl -o /dev/null -s -w %{time_namelookup}\\n  http://www.bing.com; } 2>&1 )
  var=$((var+1))
  now=$(date +"%T")
  if [[ $res =~ ^[1-9] ]]; then
    now=$(date +"%T")
    echo "$var slow: $res $now"
    break
  fi
done


[root@centos /]# while true ; do   res=$( { curl -o /dev/null -s -w %{time_namelookup}\\n  http://www.bing.com; } 2>&1 );   var=$((var+1));   now=$(date +"%T");   if [[ $res =~ ^[1-9] ]]; then     now=$(date +"%T");     echo "$var slow: $res $now";     break;   fi; done
183 slow: 10.517 22:09:27
[root@centos /]# while true ; do   res=$( { curl -o /dev/null -s -w %{time_namelookup}\\n  http://www.bing.com; } 2>&1 );   var=$((var+1));   now=$(date +"%T");   if [[ $res =~ ^[1-9] ]]; then     now=$(date +"%T");     echo "$var slow: $res $now";     break;   fi; done
204 slow: 10.519 22:10:28

while true ; do res=$( { time nslookup kubernetes.default; } 2>&1 ); var=$((var+1)); now=$(date +"%T"); if [[ $res =~ ^[1-9] ]]; then now=$(date +"%T"); echo "$var slow: $res $now"; break; fi; done

for i in $(seq 1 100000); do time nslookup kubernetes.default; sleep 100; done
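
While these loops run, the node itself can be checked for the conntrack insert failures that are the usual signature of the race behind these DNS timeouts (a sketch; assumes the conntrack-tools package is installed on the node):

# run on the node, not inside the pod
sudo conntrack -S   # watch the insert_failed counter per CPU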