I've tried following various guides on installing HA Kubernetes and I can't get anything to work.
My basic setup is three master nodes, with Keepalived holding a virtual IP (10.10.20.30) and HAProxy forwarding traffic on that VIP to whichever kube-apiserver is up. I run this script on one fresh Ubuntu install, and I'm planning on using the control-plane join command it prints to join the other masters.
I'm under the impression that I can have HAProxy listen on port 8443 and forward to 6443 on the master nodes.
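For reference, this is roughly what I expect the control-plane join command printed by kubeadm init --upload-certs to look like (the token, CA hash, and certificate key below are placeholders, not real values):
sudo kubeadm join kube.sk.stringking.com:8443 --token <token> \
    --discovery-token-ca-cert-hash sha256:<ca-cert-hash> \
    --control-plane --certificate-key <certificate-key>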
Here is the script I've created, going through all the steps. I'm running it on a fresh install of Ubuntu 20.04.
export KUBE_VERSION="1.24.6-00"
# SWAP
## this session
sudo swapoff -a
## permanently
sudo sed -i '/swap/d' /etc/fstab
# NETWORKING
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF
sudo modprobe overlay
sudo modprobe br_netfilter
sudo systemctl disable --now ufw || echo "already disabled"
## sysctl params required by setup, params persist across reboots
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
#Apply sysctl params without reboot
sudo sysctl --system
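## optional sanity check that the modules are loaded and the sysctls applied
lsmod | grep -e overlay -e br_netfilter
sysctl net.bridge.bridge-nf-call-iptables net.bridge.bridge-nf-call-ip6tables net.ipv4.ip_forward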
# INSTALLS
sudo apt-get update
sudo apt-get install -y apt-transport-https ca-certificates curl
sudo curl -fsSLo /usr/share/keyrings/kubernetes-archive-keyring.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg
echo "deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
sudo apt-mark unhold kubelet kubeadm kubectl
sudo apt-get install -y kubelet=${KUBE_VERSION} kubeadm=${KUBE_VERSION} kubectl=${KUBE_VERSION} --allow-downgrades
sudo apt-mark hold kubelet kubeadm kubectl
## Make sure no kube clusters are joined/created
sudo kubeadm reset -f
rm /home/skadmin/.kube/config || echo "already removed"
sudo systemctl stop kubelet || echo "kubelet service doesn't exist"
# DISABLE apparmor
sudo systemctl stop apparmor
sudo systemctl disable apparmor
sudo systemctl restart containerd.service || echo "containerd service doesn't exist"
# CONTAINER RUNTIME
sudo apt install -y containerd
sudo mkdir -p /etc/containerd || echo "already exists"
containerd config default | sudo tee /etc/containerd/config.toml
sudo systemctl restart containerd
sudo systemctl enable containerd
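## optional: confirm containerd's CRI endpoint answers before kubeadm needs it
sudo crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock info > /dev/null && echo "containerd CRI is up"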
# KEEPALIVED
## /etc/keepalived/keepalived.conf
sudo apt-get install -y keepalived
export STATE="BACKUP"
export INTERFACE="ens18"
export ROUTER_ID=51
export PRIORITY=100
export AUTH_PASS=42
export APISERVER_VIP=10.10.20.30
cat <<EOF | sudo tee /etc/keepalived/keepalived.conf
global_defs {
    router_id LVS_DEVEL
}

vrrp_script check_apiserver {
    script "/etc/keepalived/check_apiserver.sh"
    interval 3
    weight -2
    fall 5
    rise 2
}

vrrp_instance VI_1 {
    state ${STATE}
    interface ${INTERFACE}
    virtual_router_id ${ROUTER_ID}
    priority ${PRIORITY}
    advert_int 5
    authentication {
        auth_type PASS
        auth_pass ${AUTH_PASS}
    }
    virtual_ipaddress {
        ${APISERVER_VIP}
    }
    track_script {
        check_apiserver
    }
}
EOF
## /etc/keepalived/check_apiserver.sh
export APISERVER_DEST_PORT=8443
cat <<EOF | sudo tee /etc/keepalived/check_apiserver.sh
#!/bin/sh
errorExit() {
    echo "*** \$*" 1>&2
    exit 1
}

curl --silent --max-time 2 --insecure https://localhost:${APISERVER_DEST_PORT}/ -o /dev/null || errorExit "Error GET https://localhost:${APISERVER_DEST_PORT}/"
if ip addr | grep -q ${APISERVER_VIP}; then
    curl --silent --max-time 2 --insecure https://${APISERVER_VIP}:${APISERVER_DEST_PORT}/ -o /dev/null || errorExit "Error GET https://${APISERVER_VIP}:${APISERVER_DEST_PORT}/"
fi
EOF
sudo chmod +x /etc/keepalived/check_apiserver.sh
sudo systemctl enable --now keepalived
sudo systemctl restart keepalived
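## optional: check whether keepalived has brought the VIP up on this node
ip addr show ${INTERFACE} | grep -q ${APISERVER_VIP} && echo "VIP ${APISERVER_VIP} is on ${INTERFACE}" || echo "VIP is held by another node"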
# HA PROXY
sudo apt-get install -y haproxy
## /etc/haproxy/haproxy.cfg
export APISERVER_SRC_PORT=6443
export HOST1_ID=k8-1
export HOST1_ADDRESS=10.10.20.21
export HOST2_ID=k8-2
export HOST2_ADDRESS=10.10.20.22
export HOST3_ID=k8-3
export HOST3_ADDRESS=10.10.20.23
cat <<EOF | sudo tee /etc/haproxy/haproxy.cfg
frontend kubernetes-frontend
    bind *:${APISERVER_DEST_PORT}
    mode tcp
    option tcplog
    default_backend kubernetes-backend

backend kubernetes-backend
    option httpchk GET /healthz
    http-check expect status 200
    mode tcp
    option ssl-hello-chk
    #option httpchk HEAD /
    balance roundrobin
    server ${HOST2_ID} ${HOST2_ADDRESS}:${APISERVER_SRC_PORT} check fall 3 rise 2
    # server ${HOST3_ID} ${HOST3_ADDRESS}:${APISERVER_SRC_PORT} check fall 3 rise 2
    # server ${HOST4_ID} ${HOST4_ADDRESS}:${APISERVER_SRC_PORT} check fall 3 rise 2
EOF
sudo systemctl enable haproxy
sudo systemctl restart haproxy
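## optional: haproxy should now be listening on the frontend port; the backend stays down until an apiserver exists
sudo ss -tlnp | grep ${APISERVER_DEST_PORT}
curl -k --max-time 2 https://${APISERVER_VIP}:${APISERVER_DEST_PORT}/healthz || echo "no healthy apiserver behind the VIP yet"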
# INIT CLUSTER
cat <<EOF | tee cluster_config.yaml
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: stable
controlPlaneEndpoint: "kube.sk.stringking.com:${APISERVER_DEST_PORT}"
kubernetesVersion: v1.24.6
networking:
  podSubnet: 10.0.0.0/16
localAPIEndpoint:
  advertiseAddress: 10.10.20.21
  bindPort: 8443
cgroupDriver: cgroupfs
EOF
sudo kubeadm init --upload-certs --config cluster_config.yaml
mkdir /home/skadmin/.kube/ || echo "can't create"
sudo cp /etc/kubernetes/admin.conf /home/skadmin/.kube/config
sudo chown skadmin /home/skadmin/.kube/config
sudo chgrp skadmin /home/skadmin/.kube/config
sudo chmod 700 /home/skadmin/.kube/config
# INSTALL CALICO
kubectl apply -f https://docs.projectcalico.org/manifests/calico-typha.yaml
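## once calico is running the node should eventually report Ready
kubectl get nodes -o wide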
After running it I get:
$ sudo kubeadm init --upload-certs --config cluster_config.yaml
W1019 05:30:57.257489 18686 initconfiguration.go:306] error unmarshaling configuration schema.GroupVersionKind{Group:"kubeadm.k8s.io", Version:"v1beta3", Kind:"ClusterConfiguration"}: strict decoding error: yaml: unmarshal errors:
line 5: key "kubernetesVersion" already set in map, unknown field "cgroupDriver", unknown field "localAPIEndpoint"
[init] Using Kubernetes version: v1.24.6
[preflight] Running pre-flight checks
[WARNING SystemVerification]: missing optional cgroups: blkio
[preflight] Pulling images required for setting up a Kubernetes cluster
[preflight] This might take a minute or two, depending on the speed of your internet connection
[preflight] You can also perform this action in beforehand using 'kubeadm config images pull'
[certs] Using certificateDir folder "/etc/kubernetes/pki"
[certs] Generating "ca" certificate and key
[certs] Generating "apiserver" certificate and key
[certs] apiserver serving cert is signed for DNS names [k8-1 kube.sk.stringking.com kubernetes kubernetes.default kubernetes.default.svc kubernetes.default.svc.cluster.local] and IPs [10.96.0.1 10.10.20.21]
[certs] Generating "apiserver-kubelet-client" certificate and key
[certs] Generating "front-proxy-ca" certificate and key
[certs] Generating "front-proxy-client" certificate and key
[certs] Generating "etcd/ca" certificate and key
[certs] Generating "etcd/server" certificate and key
[certs] etcd/server serving cert is signed for DNS names [k8-1 localhost] and IPs [10.10.20.21 127.0.0.1 ::1]
[certs] Generating "etcd/peer" certificate and key
[certs] etcd/peer serving cert is signed for DNS names [k8-1 localhost] and IPs [10.10.20.21 127.0.0.1 ::1]
[certs] Generating "etcd/healthcheck-client" certificate and key
[certs] Generating "apiserver-etcd-client" certificate and key
[certs] Generating "sa" key and public key
[kubeconfig] Using kubeconfig folder "/etc/kubernetes"
W1019 05:30:59.451551 18686 endpoint.go:57] [endpoint] WARNING: port specified in controlPlaneEndpoint overrides bindPort in the controlplane address
[kubeconfig] Writing "admin.conf" kubeconfig file
W1019 05:30:59.544831 18686 endpoint.go:57] [endpoint] WARNING: port specified in controlPlaneEndpoint overrides bindPort in the controlplane address
[kubeconfig] Writing "kubelet.conf" kubeconfig file
W1019 05:30:59.646279 18686 endpoint.go:57] [endpoint] WARNING: port specified in controlPlaneEndpoint overrides bindPort in the controlplane address
[kubeconfig] Writing "controller-manager.conf" kubeconfig file
W1019 05:31:00.043652 18686 endpoint.go:57] [endpoint] WARNING: port specified in controlPlaneEndpoint overrides bindPort in the controlplane address
[kubeconfig] Writing "scheduler.conf" kubeconfig file
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Starting the kubelet
[control-plane] Using manifest folder "/etc/kubernetes/manifests"
[control-plane] Creating static Pod manifest for "kube-apiserver"
[control-plane] Creating static Pod manifest for "kube-controller-manager"
[control-plane] Creating static Pod manifest for "kube-scheduler"
[etcd] Creating static Pod manifest for local etcd in "/etc/kubernetes/manifests"
[wait-control-plane] Waiting for the kubelet to boot up the control plane as static Pods from directory "/etc/kubernetes/manifests". This can take up to 4m0s
[kubelet-check] Initial timeout of 40s passed.
Unfortunately, an error has occurred:
timed out waiting for the condition
This error is likely caused by:
- The kubelet is not running
- The kubelet is unhealthy due to a misconfiguration of the node in some way (required cgroups disabled)
If you are on a systemd-powered system, you can try to troubleshoot the error with the following commands:
- 'systemctl status kubelet'
- 'journalctl -xeu kubelet'
Additionally, a control plane component may have crashed or exited when started by the container runtime.
To troubleshoot, list all containers using your preferred container runtimes CLI.
Here is one example how you may list all running Kubernetes containers by using crictl:
- 'crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock ps -a | grep kube | grep -v pause'
Once you have found the failing container, you can inspect its logs with:
- 'crictl --runtime-endpoint unix:///var/run/containerd/containerd.sock logs CONTAINERID'
error execution phase wait-control-plane: couldn't initialize a Kubernetes cluster
To see the stack trace of this error execute with --v=5 or higher
Additionally, in my journalctl:
$ journalctl -flu kubelet
Oct 19 05:50:45 k8-1 kubelet[18783]: E1019 05:50:45.963575 18783 event.go:276] Unable to write event: '&v1.Event{TypeMeta:v1.TypeMeta{Kind:"", APIVersion:""}, ObjectMeta:v1.ObjectMeta{Name:"k8-1.171f61621a018184", GenerateName:"", Namespace:"default", SelfLink:"", UID:"", ResourceVersion:"", Generation:0, CreationTimestamp:time.Date(1, time.January, 1, 0, 0, 0, 0, time.UTC), DeletionTimestamp:<nil>, DeletionGracePeriodSeconds:(*int64)(nil), Labels:map[string]string(nil), Annotations:map[string]string(nil), OwnerReferences:[]v1.OwnerReference(nil), Finalizers:[]string(nil), ZZZ_DeprecatedClusterName:"", ManagedFields:[]v1.ManagedFieldsEntry(nil)}, InvolvedObject:v1.ObjectReference{Kind:"Node", Namespace:"", Name:"k8-1", UID:"k8-1", APIVersion:"", ResourceVersion:"", FieldPath:""}, Reason:"NodeHasSufficientPID", Message:"Node k8-1 status is now: NodeHasSufficientPID", Source:v1.EventSource{Component:"kubelet", Host:"k8-1"}, FirstTimestamp:time.Date(2022, time.October, 19, 5, 31, 1, 121368452, time.Local), LastTimestamp:time.Date(2022, time.October, 19, 5, 31, 1, 249038541, time.Local), Count:3, Type:"Normal", EventTime:time.Date(1, time.January, 1, 0, 0, 0, 0, time.UTC), Series:(*v1.EventSeries)(nil), Action:"", Related:(*v1.ObjectReference)(nil), ReportingController:"", ReportingInstance:""}': 'Patch "https://kube.sk.stringking.com:8443/api/v1/namespaces/default/events/k8-1.171f61621a018184": EOF'(may retry after sleeping)
Oct 19 05:50:46 k8-1 kubelet[18783]: E1019 05:50:46.057764 18783 kubelet.go:2424] "Error getting node" err="node \"k8-1\" not found"
Oct 19 05:50:46 k8-1 kubelet[18783]: E1019 05:50:46.159048 18783 kubelet.go:2424]