Skip to content

Commit 84ba4a6

Browse files
authored
add support for diagnosing crashing pod (#11)
* add support for diagnosing crashing pod Signed-off-by: Yeh-lei Wu <rayingecho@gmail.com> * lint codes Signed-off-by: Yeh-lei Wu <rayingecho@gmail.com>
1 parent 2ff8469 commit 84ba4a6

1 file changed

Lines changed: 92 additions & 18 deletions

File tree

pkg/plugin/cmd.go

Lines changed: 92 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package plugin
22

33
import (
4+
"context"
45
"encoding/json"
56
"fmt"
67
"github.com/aylei/kubectl-debug/pkg/util"
@@ -9,14 +10,19 @@ import (
910
"io"
1011
corev1 "k8s.io/api/core/v1"
1112
"k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/util/uuid"
1214
"k8s.io/cli-runtime/pkg/genericclioptions"
1315
"k8s.io/client-go/kubernetes"
1416
coreclient "k8s.io/client-go/kubernetes/typed/core/v1"
1517
restclient "k8s.io/client-go/rest"
1618
"k8s.io/client-go/tools/remotecommand"
19+
"k8s.io/client-go/tools/watch"
20+
"k8s.io/kubernetes/pkg/client/conditions"
21+
"k8s.io/kubernetes/pkg/util/interrupt"
1722
"log"
1823
"net/url"
1924
"os/user"
25+
"time"
2026
)
2127

2228
const (
@@ -54,17 +60,17 @@ type DebugOptions struct {
5460
PodName string
5561

5662
// Debug options
57-
RetainContainer bool
58-
Image string
59-
ContainerName string
60-
Command []string
61-
AgentPort int
62-
ConfigLocation string
63-
64-
Flags *genericclioptions.ConfigFlags
65-
PodClient coreclient.PodsGetter
66-
Args []string
67-
Config *restclient.Config
63+
Image string
64+
ContainerName string
65+
Command []string
66+
AgentPort int
67+
ConfigLocation string
68+
Fork bool
69+
70+
Flags *genericclioptions.ConfigFlags
71+
CoreClient coreclient.CoreV1Interface
72+
Args []string
73+
Config *restclient.Config
6874

6975
genericclioptions.IOStreams
7076
}
@@ -106,6 +112,8 @@ func NewDebugCmd(streams genericclioptions.IOStreams) *cobra.Command {
106112
fmt.Sprintf("Agent port for debug cli to connect, default to %d", defaultAgentPort))
107113
cmd.Flags().StringVar(&opts.ConfigLocation, "debug-config", "",
108114
fmt.Sprintf("Debug config file, default to ~%s", defaultConfigLocation))
115+
cmd.Flags().BoolVar(&opts.Fork, "fork", false,
116+
"Fork a new pod for debugging (useful if the pod status is CrashLoopBackoff)")
109117
opts.Flags.AddFlags(cmd.Flags())
110118

111119
return cmd
@@ -173,7 +181,7 @@ func (o *DebugOptions) Complete(cmd *cobra.Command, args []string, argsLenAtDash
173181
if err != nil {
174182
return err
175183
}
176-
o.PodClient = clientset.CoreV1()
184+
o.CoreClient = clientset.CoreV1()
177185

178186
return nil
179187
}
@@ -190,14 +198,10 @@ func (o *DebugOptions) Validate() error {
190198

191199
func (o *DebugOptions) Run() error {
192200

193-
pod, err := o.PodClient.Pods(o.Namespace).Get(o.PodName, v1.GetOptions{})
201+
pod, err := o.CoreClient.Pods(o.Namespace).Get(o.PodName, v1.GetOptions{})
194202
if err != nil {
195203
return err
196204
}
197-
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
198-
return fmt.Errorf("cannot debug in a completed pod; current phase is %s", pod.Status.Phase)
199-
}
200-
hostIP := pod.Status.HostIP
201205

202206
containerName := o.ContainerName
203207
if len(containerName) == 0 {
@@ -208,6 +212,35 @@ func (o *DebugOptions) Run() error {
208212
containerName = pod.Spec.Containers[0].Name
209213
}
210214

215+
// in fork mode, we launch a new pod as a copy of the target pod
216+
// and hack the entry point of the target container with sleep command
217+
// which keeps the container running.
218+
if o.Fork {
219+
pod = copyAndStripPod(pod, containerName)
220+
pod, err = o.CoreClient.Pods(pod.Namespace).Create(pod)
221+
if err != nil {
222+
return err
223+
}
224+
watcher, err := o.CoreClient.Pods(pod.Namespace).Watch(v1.SingleObject(pod.ObjectMeta))
225+
if err != nil {
226+
return err
227+
}
228+
// FIXME: hard code -> config
229+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
230+
defer cancel()
231+
log.Println("waiting for forked container running...")
232+
event, err := watch.UntilWithoutRetry(ctx, watcher, conditions.PodRunning)
233+
if err != nil {
234+
return err
235+
}
236+
pod = event.Object.(*corev1.Pod)
237+
}
238+
239+
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
240+
return fmt.Errorf("cannot debug in a completed pod; current phase is %s", pod.Status.Phase)
241+
}
242+
hostIP := pod.Status.HostIP
243+
211244
containerId, err := o.getContainerIdByName(pod, containerName)
212245
if err != nil {
213246
return err
@@ -244,7 +277,20 @@ func (o *DebugOptions) Run() error {
244277
return o.remoteExecute("POST", uri, o.Config, o.In, o.Out, o.ErrOut, t.Raw, sizeQueue)
245278
}
246279

247-
if err := t.Safe(fn); err != nil {
280+
// ensure forked pod is deleted on cancelation
281+
withCleanUp := func() error {
282+
return interrupt.Chain(nil, func() {
283+
if o.Fork {
284+
err := o.CoreClient.Pods(pod.Namespace).Delete(pod.Name, v1.NewDeleteOptions(0))
285+
if err != nil {
286+
// we may leak the pod here, but there is nothing we can do except notify the user
287+
log.Printf("failed to delete pod %s, consider manual deletion.", pod.Name)
288+
}
289+
}
290+
}).Run(fn)
291+
}
292+
293+
if err := t.Safe(withCleanUp); err != nil {
248294
fmt.Printf("error execute remote, %v\n", err)
249295
return err
250296
}
@@ -308,3 +354,31 @@ func (o *DebugOptions) setupTTY() term.TTY {
308354
}
309355
return t
310356
}
357+
358+
// copyAndStripPod copy the given pod template, strip the probes and labels,
359+
// and replace the entry point
360+
func copyAndStripPod(pod *corev1.Pod, targetContainer string) *corev1.Pod {
361+
copied := &corev1.Pod{
362+
ObjectMeta: *pod.ObjectMeta.DeepCopy(),
363+
Spec: *pod.Spec.DeepCopy(),
364+
}
365+
copied.Name = fmt.Sprintf("%s-%s-debug", pod.Name, uuid.NewUUID())
366+
copied.Labels = nil
367+
copied.Spec.RestartPolicy = corev1.RestartPolicyNever
368+
for i, c := range copied.Spec.Containers {
369+
copied.Spec.Containers[i].LivenessProbe = nil
370+
copied.Spec.Containers[i].ReadinessProbe = nil
371+
if c.Name == targetContainer {
372+
// Hack, infinite sleep command to keep the container running
373+
copied.Spec.Containers[i].Command = []string{"sh", "-c", "--"}
374+
copied.Spec.Containers[i].Args = []string{"while true; do sleep 30; done;"}
375+
}
376+
}
377+
copied.ResourceVersion = ""
378+
copied.UID = ""
379+
copied.SelfLink = ""
380+
copied.CreationTimestamp = v1.Time{}
381+
copied.OwnerReferences = []v1.OwnerReference{}
382+
383+
return copied
384+
}

0 commit comments

Comments
 (0)