
CoreDNS pods are not reachable for resolution from other nodes #3388

Open
@maslo64

Description

RKE version:
rke version v1.4.8
Docker version: (docker version, docker info preferred)

#docker version
Client: Docker Engine - Community
 Version:           20.10.24
 API version:       1.41
 Go version:        go1.19.7
 Git commit:        297e128
 Built:             Tue Apr  4 18:20:23 2023
 OS/Arch:           linux/amd64
 Context:           default
 Experimental:      true

Server: Docker Engine - Community
 Engine:
  Version:          20.10.24
  API version:      1.41 (minimum version 1.12)
  Go version:       go1.19.7
  Git commit:       5d6db84
  Built:            Tue Apr  4 18:18:29 2023
  OS/Arch:          linux/amd64
  Experimental:     false
 containerd:
  Version:          1.6.22
  GitCommit:        8165feabfdfe38c65b599c4993d227328c231fca
 runc:
  Version:          1.1.8
  GitCommit:        v1.1.8-0-g82f18fe
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0
server.subdom.company.com:(/docker/rke)(root)#docker info
Client:
 Context:    default
 Debug Mode: false
 Plugins:
  app: Docker App (Docker Inc., v0.9.1-beta3)
  buildx: Docker Buildx (Docker Inc., v0.10.4-docker)
  compose: Docker Compose (Docker Inc., v2.20.2)
  scan: Docker Scan (Docker Inc., v0.23.0)

Server:
 Containers: 43
  Running: 23
  Paused: 0
  Stopped: 20
 Images: 20
 Server Version: 20.10.24
 Storage Driver: overlay2
  Backing Filesystem: xfs
  Supports d_type: true
  Native Overlay Diff: true
  userxattr: false
 Logging Driver: json-file
 Cgroup Driver: cgroupfs
 Cgroup Version: 1
 Plugins:
  Volume: local
  Network: bridge host ipvlan macvlan null overlay
  Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog
 Swarm: inactive
 Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux runc
 Default Runtime: runc
 Init Binary: docker-init
 containerd version: 8165feabfdfe38c65b599c4993d227328c231fca
 runc version: v1.1.8-0-g82f18fe
 init version: de40ad0
 Security Options:
  seccomp
   Profile: default
 Kernel Version: 4.18.0-477.21.1.el8_8.x86_64
 Operating System: Red Hat Enterprise Linux 8.8 (Ootpa)
 OSType: linux
 Architecture: x86_64
 CPUs: 8
 Total Memory: 15.11GiB
 Name: server.subdom.company.com
 ID: QTBU:SEGF:ILNV:PIO6:W4DF:PKWK:QRQL:K3F4:GSYL:VXAZ:7WWE:NW5U
 Docker Root Dir: /docker/docker-data
 Debug Mode: false
 HTTP Proxy: http://proxy.subdom.company.com:8081
 HTTPS Proxy: http://proxy.subdom.company.com:8081
 No Proxy: .company.com,localhost,server.subdom.company.com,127.0.0.1
 Registry: https://index.docker.io/v1/
 Labels:
 Experimental: false
 Insecure Registries:
  127.0.0.0/8
 Live Restore Enabled: false

server.subdom.company.com:(/docker/rke)(root)#

Operating system and kernel: (cat /etc/os-release, uname -r preferred)

server.subdom.company.com:(/docker/rke)(root)#cat /etc/os-release
NAME="Red Hat Enterprise Linux"
VERSION="8.8 (Ootpa)"
ID="rhel"
ID_LIKE="fedora"
VERSION_ID="8.8"
PLATFORM_ID="platform:el8"
PRETTY_NAME="Red Hat Enterprise Linux 8.8 (Ootpa)"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:redhat:enterprise_linux:8::baseos"
HOME_URL="https://www.redhat.com/"
DOCUMENTATION_URL="https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8"
BUG_REPORT_URL="https://bugzilla.redhat.com/"

REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 8"
REDHAT_BUGZILLA_PRODUCT_VERSION=8.8
REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux"
REDHAT_SUPPORT_PRODUCT_VERSION="8.8"
server.subdom.company.com:(/docker/rke)(root)#uname -r
4.18.0-477.21.1.el8_8.x86_64
server.subdom.company.com:(/docker/rke)(root)#

Type/provider of hosts: (VirtualBox/Bare-metal/AWS/GCE/DO)
VMware
cluster.yml file:

server.subdom.company.com:(/docker/rke)(root)#cat cluster.yml
# If you intended to deploy Kubernetes in an air-gapped environment,
# please consult the documentation on how to configure custom RKE images.
nodes:
- address: server.subdom.company.com
  port: "22"
  internal_address: ""
  role:
  - controlplane
  - worker
  - etcd
  hostname_override: ""
  user: rkeadm
  docker_socket: /var/run/docker.sock
  ssh_key: ""
  ssh_key_path: /home/rkeadm/.ssh/id_rsa
  ssh_cert: ""
  ssh_cert_path: ""
  labels: {}
  taints: []
- address: agent.subdom.company.com
  port: "22"
  internal_address: ""
  role:
  - worker
  hostname_override: ""
  user: rkeadm
  docker_socket: /var/run/docker.sock
  ssh_key: ""
  ssh_key_path: /home/rkeadm/.ssh/id_rsa
  ssh_cert: ""
  ssh_cert_path: ""
  labels: {}
  taints: []
services:
  etcd:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds: []
    win_extra_env: []
    external_urls: []
    ca_cert: ""
    cert: ""
    key: ""
    path: ""
    uid: 0
    gid: 0
    snapshot: null
    retention: ""
    creation: ""
    backup_config: null
  kube-api:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds: []
    win_extra_env: []
    service_cluster_ip_range: 10.43.0.0/16
    service_node_port_range: ""
    pod_security_policy: false
    pod_security_configuration: ""
    always_pull_images: false
    secrets_encryption_config: null
    audit_log: null
    admission_configuration: null
    event_rate_limit: null
  kube-controller:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds: []
    win_extra_env: []
    cluster_cidr: 10.42.0.0/16
    service_cluster_ip_range: 10.43.0.0/16
  scheduler:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds: []
    win_extra_env: []
  kubelet:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds: []
    win_extra_env: []
    cluster_domain: cluster.local
    infra_container_image: ""
    cluster_dns_server: 10.43.0.10
    fail_swap_on: false
    generate_serving_certificate: false
  kubeproxy:
    image: ""
    extra_args: {}
    extra_args_array: {}
    extra_binds: []
    extra_env: []
    win_extra_args: {}
    win_extra_args_array: {}
    win_extra_binds: []
    win_extra_env: []
network:
  plugin: canal
  options: {}
  mtu: 0
  node_selector: {}
  update_strategy: null
  tolerations: []
authentication:
  strategy: x509
  sans: []
  webhook: null
addons: ""
addons_include: []
system_images:
  etcd: rancher/mirrored-coreos-etcd:v3.5.6
  alpine: rancher/rke-tools:v0.1.89
  nginx_proxy: rancher/rke-tools:v0.1.89
  cert_downloader: rancher/rke-tools:v0.1.89
  kubernetes_services_sidecar: rancher/rke-tools:v0.1.89
  kubedns: rancher/mirrored-k8s-dns-kube-dns:1.22.20
  dnsmasq: rancher/mirrored-k8s-dns-dnsmasq-nanny:1.22.20
  kubedns_sidecar: rancher/mirrored-k8s-dns-sidecar:1.22.20
  kubedns_autoscaler: rancher/mirrored-cluster-proportional-autoscaler:1.8.6
  coredns: rancher/mirrored-coredns-coredns:1.9.4
  coredns_autoscaler: rancher/mirrored-cluster-proportional-autoscaler:1.8.6
  nodelocal: rancher/mirrored-k8s-dns-node-cache:1.22.20
  kubernetes: rancher/hyperkube:v1.26.7-rancher1
  flannel: rancher/mirrored-flannel-flannel:v0.21.4
  flannel_cni: rancher/flannel-cni:v0.3.0-rancher8
  calico_node: rancher/mirrored-calico-node:v3.25.0
  calico_cni: rancher/calico-cni:v3.25.0-rancher1
  calico_controllers: rancher/mirrored-calico-kube-controllers:v3.25.0
  calico_ctl: rancher/mirrored-calico-ctl:v3.25.0
  calico_flexvol: rancher/mirrored-calico-pod2daemon-flexvol:v3.25.0
  canal_node: rancher/mirrored-calico-node:v3.25.0
  canal_cni: rancher/calico-cni:v3.25.0-rancher1
  canal_controllers: rancher/mirrored-calico-kube-controllers:v3.25.0
  canal_flannel: rancher/mirrored-flannel-flannel:v0.21.4
  canal_flexvol: rancher/mirrored-calico-pod2daemon-flexvol:v3.25.0
  weave_node: weaveworks/weave-kube:2.8.1
  weave_cni: weaveworks/weave-npc:2.8.1
  pod_infra_container: rancher/mirrored-pause:3.7
  ingress: rancher/nginx-ingress-controller:nginx-1.7.0-rancher1
  ingress_backend: rancher/mirrored-nginx-ingress-controller-defaultbackend:1.5-rancher1
  ingress_webhook: rancher/mirrored-ingress-nginx-kube-webhook-certgen:v20230312-helm-chart-4.5.2-28-g66a760794
  metrics_server: rancher/mirrored-metrics-server:v0.6.3
  windows_pod_infra_container: rancher/mirrored-pause:3.7
  aci_cni_deploy_container: noiro/cnideploy:5.2.7.1.81c2369
  aci_host_container: noiro/aci-containers-host:5.2.7.1.81c2369
  aci_opflex_container: noiro/opflex:5.2.7.1.81c2369
  aci_mcast_container: noiro/opflex:5.2.7.1.81c2369
  aci_ovs_container: noiro/openvswitch:5.2.7.1.81c2369
  aci_controller_container: noiro/aci-containers-controller:5.2.7.1.81c2369
  aci_gbp_server_container: noiro/gbp-server:5.2.7.1.81c2369
  aci_opflex_server_container: noiro/opflex-server:5.2.7.1.81c2369
ssh_key_path: /home/rkeadm/.ssh/id_rsa
ssh_cert_path: ""
ssh_agent_auth: false
authorization:
  mode: rbac
  options: {}
ignore_docker_version: null
enable_cri_dockerd: null
kubernetes_version: ""
private_registries: []
ingress:
  provider: ""
  options: {}
  node_selector: {}
  extra_args: {}
  dns_policy: ""
  extra_envs: []
  extra_volumes: []
  extra_volume_mounts: []
  update_strategy: null
  http_port: 0
  https_port: 0
  network_mode: ""
  tolerations: []
  default_backend: null
  default_http_backend_priority_class_name: ""
  nginx_ingress_controller_priority_class_name: ""
  default_ingress_class: null
cluster_name: ""
cloud_provider:
  name: ""
prefix_path: ""
win_prefix_path: ""
addon_job_timeout: 0
bastion_host:
  address: ""
  port: ""
  user: ""
  ssh_key: ""
  ssh_key_path: ""
  ssh_cert: ""
  ssh_cert_path: ""
  ignore_proxy_env_vars: false
monitoring:
  provider: ""
  options: {}
  node_selector: {}
  update_strategy: null
  replicas: null
  tolerations: []
  metrics_server_priority_class_name: ""
restore:
  restore: false
  snapshot_name: ""
rotate_encryption_key: false
dns: null
server.subdom.company.com:(/docker/rke)(root)#
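
The settings that matter for DNS here are network.plugin: canal, cluster_cidr: 10.42.0.0/16, service_cluster_ip_range: 10.43.0.0/16, and cluster_dns_server: 10.43.0.10. As a quick sanity check (a sketch, not output captured from this cluster), the kube-dns service and endpoints can be compared against those values:

# ClusterIP should match cluster_dns_server (10.43.0.10)
kubectl -n kube-system get svc kube-dns
# Both CoreDNS pod IPs should be listed as endpoints
kubectl -n kube-system get endpoints kube-dns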

Steps to Reproduce:

  • Install an RKE1 cluster with the cluster.yml above

Results:

  • DNS resolution takes a long time in containers when using the cluster service IP
  • DNS resolution does not work across nodes
  • DNS resolution works against the CoreDNS pod on the local node
#kubectl get pods -o wide -n kube-system -l k8s-app=kube-dns
NAME                       READY   STATUS    RESTARTS       AGE    IP            NODE                        NOMINATED NODE   READINESS GATES
coredns-747b8d9ccc-fhz9w   1/1     Running   5 (127m ago)   23h    10.42.0.133   server.subdom.company.com   <none>           <none>
coredns-747b8d9ccc-zdwjl   1/1     Running   0              121m   10.42.1.75    agent.subdom.company.com    <none>           <none>
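
With the two CoreDNS pods on different nodes (10.42.0.133 on server, 10.42.1.75 on agent), the failure pattern points at cross-node pod traffic rather than at CoreDNS itself. A minimal sketch to separate the two (not run as part of this report) is to probe the remote pod with plain ICMP instead of DNS:

# From a pod pinned to server.subdom.company.com, ping the CoreDNS pod on the agent node
kubectl run -it --rm --restart=Never pingtest --image=busybox:1.28 --overrides='{"spec": {"nodeSelector": {"kubernetes.io/hostname": "server.subdom.company.com"}}}' -- ping -c 3 10.42.1.75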

Firewalld is disabled on both nodes:

server.subdom.company.com:(/root)(root)#firewall-cmd --state
not running
server.subdom.company.com:(/root)(root)#
agent.subdom.company.com:(/root)(root)#firewall-cmd --state
not running
agent.subdom.company.com:(/root)(root)#
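
Even with firewalld stopped, canal's flannel VXLAN traffic (UDP 8472 by default) can still be dropped elsewhere, for example by the VMware virtual switch. A capture sketch, assuming canal's default VXLAN backend:

# On agent.subdom.company.com: watch for encapsulated pod traffic arriving from the server node
tcpdump -ni any udp port 8472
# Then repeat one of the failing cross-node lookups below on server.subdom.company.com
# and check whether packets show up on the agent side.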

Resolution works fine locally on the same node, fails against the CoreDNS pod on another node, and takes a long time via the service IP:

server.subdom.company.com:(/root)(root)#kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 --overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": "agent.subdom.company.com"}}}' -- time nslookup kube-dns.kube-system.svc.cluster.local 10.42.1.75
Server:    10.42.1.75
Address 1: 10.42.1.75 10-42-1-75.kube-dns.kube-system.svc.cluster.local

Name:      kube-dns.kube-system.svc.cluster.local
Address 1: 10.43.0.10 kube-dns.kube-system.svc.cluster.local
real    0m 0.00s
user    0m 0.00s
sys     0m 0.00s
pod "busybox" deleted
server.subdom.company.com:(/root)(root)#kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 --overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": "server.subdom.company.com"}}}' -- time nslookup kube-dns.kube-system.svc.cluster.local 10.42.1.75
If you don't see a command prompt, try pressing enter.
Address 1: 10.42.1.75

nslookup: can't resolve 'kube-dns.kube-system.svc.cluster.local'
Command exited with non-zero status 1
real    1m 0.05s
user    0m 0.00s
sys     0m 0.00s
pod "busybox" deleted
pod default/busybox terminated (Error)
server.subdom.company.com:(/root)(root)#kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 --overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": "server.subdom.company.com"}}}' -- time nslookup kube-dns.kube-system.svc.cluster.local 10.42.0.133
Server:    10.42.0.133
Address 1: 10.42.0.133 10-42-0-133.kube-dns.kube-system.svc.cluster.local

Name:      kube-dns.kube-system.svc.cluster.local
Address 1: 10.43.0.10 kube-dns.kube-system.svc.cluster.local
real    0m 0.00s
user    0m 0.00s
sys     0m 0.00s
pod "busybox" deleted
server.subdom.company.com:(/root)(root)#kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 --overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": "server.subdom.company.com"}}}' -- time nslookup kube-dns.kube-system.svc.cluster.local
If you don't see a command prompt, try pressing enter.
Address 1: 10.43.0.10 kube-dns.kube-system.svc.cluster.local

Name:      kube-dns.kube-system.svc.cluster.local
Address 1: 10.43.0.10 kube-dns.kube-system.svc.cluster.local
real    0m 10.01s
user    0m 0.00s
sys     0m 0.00s
pod "busybox" deleted
server.subdom.company.com:(/root)(root)#
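
The pattern above (direct queries to the remote CoreDNS pod time out, while service-IP lookups succeed after roughly 10 seconds of retries, presumably once a retry lands on the local endpoint) resembles the known VXLAN TX checksum-offload kernel bug reported with flannel/canal on RHEL 8 and VMware. That cause is an assumption here, not something confirmed in this report. The commonly cited workaround is to disable checksum offload on the flannel VXLAN interface on every node:

# flannel.1 is canal's default VXLAN interface; this setting does not persist across reboots
ethtool -K flannel.1 tx-checksum-ip-generic off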

SURE-6954
