Skip to content

Commit b5d5525

Browse files
authored
Merge pull request #346 from mook-as/robust-startup
Make startup more robust
2 parents 146280f + 2f23e97 commit b5d5525

File tree

4 files changed

+144
-132
lines changed

4 files changed

+144
-132
lines changed

src/k8s-engine/client.ts

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -218,19 +218,8 @@ export class KubeClient extends events.EventEmitter {
218218
console.log(`Waited more than ${ maxWaitTime / 1000 } secs for kubernetes to fully start up. Giving up.`);
219219
break;
220220
}
221-
try {
222-
if (await this.getServiceListWatch()) {
223-
break;
224-
}
225-
} catch (ex) {
226-
if (ex?.code === 'ERR_TLS_CERT_ALTNAME_INVALID') {
227-
// If the VM restarted and the IP address changed, the certificate
228-
// will need to be regenerated (to include the new IP address).
229-
// Wait for K3s to do so.
230-
console.log('Incorrect TLS cert when waitin for services; retrying.');
231-
} else {
232-
throw ex;
233-
}
221+
if (await this.getServiceListWatch()) {
222+
break;
234223
}
235224
await util.promisify(setTimeout)(waitTime);
236225
}

src/k8s-engine/hyperkit.ts

Lines changed: 3 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,8 @@ import fs from 'fs';
66
import os from 'os';
77
import path from 'path';
88
import timers from 'timers';
9-
import util from 'util';
109

1110
import Electron from 'electron';
12-
import fetch from 'node-fetch';
1311
import semver from 'semver';
1412
import XDGAppPaths from 'xdg-app-paths';
1513
import { exec as sudo } from 'sudo-prompt';
@@ -446,34 +444,9 @@ export default class HyperkitBackend extends events.EventEmitter implements K8s.
446444
}
447445
});
448446

449-
// Wait for k3s server; note that we're delibrately sending a HTTP request
450-
// to an HTTPS server, and expecting an error response back.
451-
while (true) {
452-
try {
453-
const resp = await fetch(`http://${ await this.ipAddress }:6443`);
454-
455-
if (resp.status === 400) {
456-
break;
457-
}
458-
} catch (e) {
459-
if (e.code !== 'ECONNREFUSED') {
460-
throw e;
461-
}
462-
}
463-
await util.promisify(setTimeout)(500);
464-
}
465-
466-
try {
467-
await this.k3sHelper.updateKubeconfig(
468-
resources.executable('docker-machine-driver-hyperkit'),
469-
'--storage-path', path.join(paths.state(), 'driver'),
470-
'ssh', '--', 'sudo', `${ cacheDir }/kubeconfig`,
471-
);
472-
} catch (e) {
473-
console.error(e);
474-
console.error(e.stack);
475-
throw e;
476-
}
447+
await this.k3sHelper.waitForServerReady(() => this.ipAddress);
448+
await this.k3sHelper.updateKubeconfig(
449+
() => this.hyperkitWithCapture('ssh', '--', 'sudo', `${ cacheDir }/kubeconfig`));
477450
this.setState(K8s.State.STARTED);
478451
this.setProgress(Progress.DONE);
479452
this.client = new K8s.Client();

src/k8s-engine/k3sHelper.ts

Lines changed: 73 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
import childProcess from 'child_process';
21
import { Console } from 'console';
32
import crypto from 'crypto';
43
import events from 'events';
54
import fs from 'fs';
65
import os from 'os';
76
import path from 'path';
87
import stream from 'stream';
8+
import tls from 'tls';
99
import util from 'util';
1010

1111
import fetch from 'node-fetch';
@@ -14,6 +14,7 @@ import XDGAppPaths from 'xdg-app-paths';
1414
import { KubeConfig } from '@kubernetes/client-node';
1515
import yaml from 'yaml';
1616

17+
import * as childProcess from '../utils/childProcess';
1718
import Logging from '../utils/logging';
1819
import resources from '../resources';
1920
import DownloadProgressListener from '../utils/DownloadProgressListener';
@@ -356,6 +357,69 @@ export default class K3sHelper extends events.EventEmitter {
356357
}
357358
}
358359

360+
/**
361+
* Wait the K3s server to be ready after starting up.
362+
*
363+
* This will check that the proper TLS certificate is generated by K3s; this
364+
* is required as if the VM IP address changes, K3s will use a certificate
365+
* that is only valid for the old address for a short while. If we attempt to
366+
* communicate with the cluster at this point, things will fail.
367+
*
368+
* @param getHost A function to return the IP address that K3s will listen on
369+
* internally. This may be called multiple times, as the
370+
* address may not be ready yet.
371+
* @param port The port that K3s will listen on.
372+
*/
373+
async waitForServerReady(getHost: () => Promise<string|undefined>, port = 6443): Promise<void> {
374+
let host: string | undefined;
375+
376+
console.log(`Waiting for K3s server to be ready on port ${ port }...`);
377+
while (true) {
378+
try {
379+
host = await getHost();
380+
381+
if (typeof host === 'undefined') {
382+
await util.promisify(setTimeout)(500);
383+
continue;
384+
}
385+
386+
await new Promise<void>((resolve, reject) => {
387+
const socket = tls.connect(
388+
{
389+
host, port, rejectUnauthorized: false
390+
},
391+
() => {
392+
const { subjectaltname } = socket.getPeerCertificate();
393+
const names = subjectaltname.split(',').map( s => s.trim());
394+
const acceptable = [`IP Address:${ host }`, `DNS:${ host }`];
395+
396+
if (names.some(name => acceptable.includes(name))) {
397+
// We got a certificate with a SubjectAltName that includes the
398+
// host we're looking for.
399+
resolve();
400+
}
401+
reject({ code: 'ENOHOST' });
402+
});
403+
404+
socket.on('error', reject);
405+
});
406+
break;
407+
} catch (error) {
408+
switch (error.code) {
409+
case 'ENOHOST':
410+
case 'ECONNREFUSED':
411+
case 'ECONNRESET':
412+
break;
413+
default:
414+
// Unrecognized error; log but continue waiting.
415+
console.error(error);
416+
}
417+
await util.promisify(setTimeout)(1_000);
418+
}
419+
}
420+
console.log(`The K3s server is ready on ${ host }:${ port }.`);
421+
}
422+
359423
/**
360424
* Find the home directory, in a way that is compatible with the
361425
* @kubernetes/client-node package.
@@ -426,58 +490,23 @@ export default class K3sHelper extends events.EventEmitter {
426490
/**
427491
* Update the user's kubeconfig such that the K3s context is available and
428492
* set as the current context. This assumes that K3s is already running.
493+
*
494+
* @param configReader A function that returns the kubeconfig from the K3s VM.
429495
*/
430-
async updateKubeconfig(spawnExecutable: string, ...spawnArgs: string[]): Promise<void> {
496+
async updateKubeconfig(configReader: () => Promise<string>): Promise<void> {
431497
const contextName = 'rancher-desktop';
432498
const workDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'rancher-desktop-kubeconfig-'));
433499

434500
try {
435501
const workPath = path.join(workDir, 'kubeconfig');
436-
const workFile = await fs.promises.open(workPath, 'w+', 0o600);
437-
438-
try {
439-
const k3sOptions: childProcess.SpawnOptions = { stdio: ['ignore', workFile.fd, 'inherit'] };
440-
const k3sChild = childProcess.spawn(spawnExecutable, spawnArgs, k3sOptions);
441-
442-
console.log('Fetching K3s kubeconfig...');
443-
await new Promise<void>((resolve, reject) => {
444-
k3sChild.on('error', reject);
445-
k3sChild.on('exit', (status, signal) => {
446-
if (status === 0) {
447-
return resolve();
448-
}
449-
const message = status ? `status ${ status }` : `signal ${ signal }`;
450-
451-
reject(new Error(`Error getting kubeconfig: exited with ${ message }`));
452-
});
453-
});
454-
} finally {
455-
await workFile.close();
456-
}
457-
458-
// On Windows repeat until the kubeconfig file is readable
459-
let delay = 0; // msec
460-
const delayIncrement = 200;
461-
const maxDelay = 10_000;
462-
463-
while (delay < maxDelay) {
464-
try {
465-
await fs.promises.readFile(workPath, { encoding: 'utf-8' });
466-
break;
467-
} catch (err) {
468-
console.log(`Error reading ${ workPath }: ${ err }`);
469-
console.log(`Waiting for ${ delay / 1000.0 } sec`);
470-
delay += delayIncrement;
471-
await util.promisify(setTimeout)(delay);
472-
}
473-
}
474502

475503
// For some reason, using KubeConfig.loadFromFile presents permissions
476504
// errors; doing the same ourselves seems to work better. Since the file
477505
// comes from the WSL container, it must not contain any paths, so there
478-
// is no need to fix it up.
506+
// is no need to fix it up. This also lets us use an external function to
507+
// read the kubeconfig.
479508
const workConfig = new KubeConfig();
480-
const workContents = await fs.promises.readFile(workPath, { encoding: 'utf-8' });
509+
const workContents = await configReader();
481510

482511
workConfig.loadFromString(workContents);
483512
// @kubernetes/client-node deosn't have an API to modify the configs...
@@ -534,20 +563,8 @@ export default class K3sHelper extends events.EventEmitter {
534563
// The config file we modified might not be the top level one.
535564
// Update the current context.
536565
console.log('Setting default context...');
537-
await new Promise<void>((resolve, reject) => {
538-
const child = childProcess.spawn(
539-
resources.executable('kubectl'),
540-
['config', 'use-context', contextName],
541-
{ stdio: 'inherit' });
542-
543-
child.on('error', reject);
544-
child.on('exit', (status, signal) => {
545-
if (status !== 0 || signal !== null) {
546-
reject(new Error(`kubectl set-context returned with ${ [status, signal] }`));
547-
}
548-
resolve();
549-
});
550-
});
566+
await childProcess.spawnFile(
567+
resources.executable('kubectl'), ['config', 'use-context', contextName]);
551568
} finally {
552569
await fs.promises.rmdir(workDir, { recursive: true, maxRetries: 10 });
553570
}

src/k8s-engine/wsl.ts

Lines changed: 66 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,53 @@ export default class WSLBackend extends events.EventEmitter implements K8s.Kuber
215215
}
216216
}
217217

218+
/**
219+
* execCommand runs the given command in the K3s WSL environment and returns
220+
* the standard output.
221+
* @param command The command to execute.
222+
* @returns The output of the command.
223+
*/
224+
protected async execCommand(...command: string[]): Promise<string> {
225+
const args = ['--distribution', INSTANCE_NAME, '--exec'].concat(command);
226+
const { stdout } = await childProcess.spawnFile('wsl.exe', args,
227+
{
228+
stdio: ['ignore', 'pipe', await Logging.wsl.fdStream],
229+
windowsHide: true
230+
});
231+
232+
return stdout;
233+
}
234+
235+
/** Get the IPv4 address of the VM, assuming it's already up. */
236+
protected get ipAddress(): Promise<string | undefined> {
237+
return (async() => {
238+
// Get the routing map structure
239+
const state = await this.execCommand('cat', '/proc/net/fib_trie');
240+
241+
// We look for the IP address by:
242+
// 1. Convert the structure (text) into lines.
243+
// 2. Look for lines followed by "/32 host LOCAL".
244+
// This gives interface addresses.
245+
const lines = state
246+
.split(/\r?\n+/)
247+
.filter((_, i, array) => (array[i + 1] || '').includes('/32 host LOCAL'));
248+
// 3. Filter for lines with the shortest prefix; this is needed to reject
249+
// the CNI interfaces.
250+
const lengths: [number, string][] = lines.map(line => [line.length - line.trimStart().length, line]);
251+
const minLength = Math.min(...lengths.map(([length]) => length));
252+
// 4. Drop the tree formatting (" |-- "). The result are IP addresses.
253+
// 5. Reject loopback addresses.
254+
const addresses = lengths
255+
.filter(([length]) => length === minLength)
256+
.map(([_, address]) => address.replace(/^\s+\|--/, '').trim())
257+
.filter(address => !address.startsWith('127.'));
258+
259+
// Assume the first address is what we want, as the WSL VM only has one
260+
// (non-loopback, non-CNI) interface.
261+
return addresses[0];
262+
})();
263+
}
264+
218265
async getBackendInvalidReason(): Promise<K8s.KubernetesError | null> {
219266
// Check if wsl.exe is available
220267
try {
@@ -274,6 +321,22 @@ export default class WSLBackend extends events.EventEmitter implements K8s.Kuber
274321

275322
// If we were previously running, stop it now.
276323
this.process?.kill('SIGTERM');
324+
await childProcess.spawnFile('wsl.exe', ['--terminate', INSTANCE_NAME],
325+
{
326+
stdio: ['ignore', await Logging.wsl.fdStream, await Logging.wsl.fdStream],
327+
windowsHide: true
328+
});
329+
330+
// Temporary workaround: ensure root is mounted as shared -- this will be done later
331+
// Right now the builder pod needs to be restarted after the remount
332+
// TODO: When this code is removed, make `client.getActivePod` protected again.
333+
await childProcess.spawnFile(
334+
'wsl.exe',
335+
['--user', 'root', '--distribution', INSTANCE_NAME, 'mount', '--make-shared', '/'],
336+
{
337+
stdio: ['ignore', await Logging.wsl.fdStream, await Logging.wsl.fdStream],
338+
windowsHide: true,
339+
});
277340

278341
// Run run-k3s with NORUN, to set up the environment.
279342
await childProcess.spawnFile('wsl.exe',
@@ -316,30 +379,9 @@ export default class WSLBackend extends events.EventEmitter implements K8s.Kuber
316379
}
317380
});
318381

319-
// Wait for k3s server; note that we're deliberately sending an HTTP request
320-
// to an HTTPS server, and expecting an error response back.
321-
while (true) {
322-
try {
323-
const resp = await fetch('http://localhost:6444');
324-
325-
if (resp.status === 400) {
326-
break;
327-
}
328-
} catch (e) {
329-
if (!['ECONNREFUSED', 'ECONNRESET'].includes(e.code)) {
330-
throw e;
331-
}
332-
}
333-
await util.promisify(setTimeout)(500);
334-
}
335-
336-
try {
337-
await this.k3sHelper.updateKubeconfig(
338-
'wsl.exe', '--distribution', INSTANCE_NAME, '--exec', '/usr/local/bin/kubeconfig');
339-
} catch (err) {
340-
console.log(`k3sHelper.updateKubeconfig failed: ${ err }. Will retry...`);
341-
throw err;
342-
}
382+
await this.k3sHelper.waitForServerReady(() => this.ipAddress);
383+
await this.k3sHelper.updateKubeconfig(
384+
() => this.execCommand('/usr/local/bin/kubeconfig'));
343385

344386
this.client = new K8s.Client();
345387
await this.client.waitForServiceWatcher();
@@ -352,15 +394,6 @@ export default class WSLBackend extends events.EventEmitter implements K8s.Kuber
352394
// Right now the builder pod needs to be restarted after the remount
353395
// TODO: When this code is removed, make `client.getActivePod` protected again.
354396
try {
355-
await childProcess.spawnFile(
356-
'wsl.exe',
357-
['--user', 'root', '--distribution', INSTANCE_NAME, 'mount', '--make-shared', '/'],
358-
{
359-
stdio: ['ignore', await Logging.wsl.fdStream, await Logging.wsl.fdStream],
360-
windowsHide: true,
361-
});
362-
console.log('Waiting for ensuring root is shared');
363-
await util.promisify(setTimeout)(60_000);
364397
await childProcess.spawnFile(
365398
resources.executable('kim'),
366399
['builder', 'install', '--force', '--no-wait'],

0 commit comments

Comments
 (0)