Skip to content

Commit 4e9859a

Browse files
return loading run status to frontend to fix a couple of bugs (#341)
* return loading run status to frontend to fix a couple of bugs
* Add message when logdir has no runs (#343)
* Add message when logdir has no runs
* Correct Typography import
* fix test failure
* remove dead code

Co-authored-by: TomWildenhain-Microsoft <[email protected]>
1 parent 0ce4bcb commit 4e9859a

File tree

6 files changed

+89
-25
lines changed

6 files changed

+89
-25
lines changed

tb_plugin/fe/src/api/generated/api.ts

+20-1
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,25 @@ export interface Performance {
596596
*/
597597
children?: Array<Performance>
598598
}
599+
/**
600+
*
601+
* @export
602+
* @interface Runs
603+
*/
604+
export interface Runs {
605+
/**
606+
*
607+
* @type {Array<string>}
608+
* @memberof Runs
609+
*/
610+
runs: Array<string>
611+
/**
612+
*
613+
* @type {boolean}
614+
* @memberof Runs
615+
*/
616+
loading: boolean
617+
}
599618
/**
600619
*
601620
* @export
@@ -2162,7 +2181,7 @@ export const DefaultApiFp = function (configuration?: Configuration) {
21622181
*/
21632182
runsGet(
21642183
options?: any
2165-
): (fetch?: FetchAPI, basePath?: string) => Promise<Array<string>> {
2184+
): (fetch?: FetchAPI, basePath?: string) => Promise<Runs> {
21662185
const localVarFetchArgs = DefaultApiFetchParamCreator(
21672186
configuration
21682187
).runsGet(options)

tb_plugin/fe/src/api/openapi.yaml

+13-3
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@ paths:
1313
content:
1414
'*/*':
1515
schema:
16-
type: array
17-
items:
18-
type: string
16+
$ref: '#/components/schemas/Runs'
1917
/views:
2018
get:
2119
parameters:
@@ -453,6 +451,18 @@ paths:
453451
type: object
454452
components:
455453
schemas:
454+
Runs:
455+
type: object
456+
required:
457+
- runs
458+
- loading
459+
properties:
460+
runs:
461+
type: array
462+
items:
463+
type: string
464+
loading:
465+
type: boolean
456466
Performance:
457467
type: object
458468
required:

tb_plugin/fe/src/app.tsx

+18-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
* Copyright (c) Microsoft Corporation. All rights reserved.
33
*--------------------------------------------------------------------------------------------*/
44

5+
import Card from '@material-ui/core/Card'
6+
import CardContent from '@material-ui/core/CardContent'
7+
import CardHeader from '@material-ui/core/CardHeader'
58
import ClickAwayListener from '@material-ui/core/ClickAwayListener'
69
import CssBaseline from '@material-ui/core/CssBaseline'
710
import Divider from '@material-ui/core/Divider'
@@ -15,6 +18,7 @@ import Select, { SelectProps } from '@material-ui/core/Select'
1518
import { makeStyles } from '@material-ui/core/styles'
1619
import ChevronLeftIcon from '@material-ui/icons/ChevronLeft'
1720
import ChevronRightIcon from '@material-ui/icons/ChevronRight'
21+
import Typography from '@material-ui/core/Typography'
1822
import 'antd/es/button/style/css'
1923
import 'antd/es/list/style/css'
2024
import 'antd/es/table/style/css'
@@ -130,6 +134,7 @@ export const App = () => {
130134

131135
const [run, setRun] = React.useState<string>('')
132136
const [runs, setRuns] = React.useState<string[]>([])
137+
const [runsLoading, setRunsLoading] = React.useState(true)
133138

134139
const [workers, setWorkers] = React.useState<string[]>([])
135140
const [worker, setWorker] = React.useState<string>('')
@@ -152,7 +157,8 @@ export const App = () => {
152157
while (true) {
153158
try {
154159
const runs = await api.defaultApi.runsGet()
155-
setRuns(runs)
160+
setRuns(runs.runs)
161+
setRunsLoading(runs.loading)
156162
} catch (e) {
157163
console.info('Cannot fetch runs: ', e)
158164
}
@@ -248,6 +254,17 @@ export const App = () => {
248254
}
249255

250256
const renderContent = () => {
257+
if (!runsLoading && runs.length == 0) {
258+
return (
259+
<Card variant="outlined">
260+
<CardHeader title="No Runs Found"></CardHeader>
261+
<CardContent>
262+
<Typography>There are not any runs in the log folder.</Typography>
263+
</CardContent>
264+
</Card>
265+
)
266+
}
267+
251268
if (!loaded || !run || !worker || !view || !span) {
252269
return <FullCircularProgress />
253270
}

tb_plugin/test/test_tensorboard_end2end.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,22 @@ def _test_tensorboard(self, host, port, expected_runs, path_prefix):
112112
try:
113113
response = urllib.request.urlopen(run_link)
114114
data = response.read()
115-
if data == expected_runs:
115+
runs = None
116+
if data:
117+
data = json.loads(data)
118+
runs = data.get("runs")
119+
if runs:
120+
runs = '[{}]'.format(", ".join(['"{}"'.format(i) for i in runs]))
121+
runs = runs.encode('utf-8')
122+
if runs == expected_runs:
116123
break
117124
if retry_times % 10 == 0:
118125
print("receive mismatched data, retrying", data)
119126
time.sleep(2)
120127
retry_times -= 1
121128
if retry_times<0:
122129
self.fail("Load run timeout")
123-
except Exception:
130+
except Exception as e:
124131
if retry_times > 0:
125132
continue
126133
else:

tb_plugin/torch_tb_profiler/plugin.py

+28-17
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ def __init__(self, context):
4949
mp.set_start_method(start_method, force=True)
5050
self.logdir = io.abspath(context.logdir.rstrip('/'))
5151

52-
self._is_active = None
53-
self._is_active_initialized_event = threading.Event()
52+
self._load_lock = threading.Lock()
53+
self._load_threads = []
5454

5555
self._runs = OrderedDict()
5656
self._runs_lock = threading.Lock()
@@ -76,8 +76,7 @@ def clean():
7676
def is_active(self):
7777
"""Returns whether there is relevant data for the plugin to process.
7878
"""
79-
self._is_active_initialized_event.wait()
80-
return self._is_active
79+
return True
8180

8281
def get_plugin_apps(self):
8382
return {
@@ -104,13 +103,21 @@ def get_plugin_apps(self):
104103
}
105104

106105
def frontend_metadata(self):
107-
return base_plugin.FrontendMetadata(es_module_path="/index.js")
106+
return base_plugin.FrontendMetadata(es_module_path="/index.js", disable_reload=True)
108107

109108
@wrappers.Request.application
110109
def runs_route(self, request):
111110
with self._runs_lock:
112111
names = list(self._runs.keys())
113-
return self.respond_as_json(names)
112+
113+
with self._load_lock:
114+
loading = bool(self._load_threads)
115+
116+
data = {
117+
"runs": names,
118+
"loading": loading
119+
}
120+
return self.respond_as_json(data)
114121

115122
@wrappers.Request.application
116123
def views_route(self, request):
@@ -130,7 +137,6 @@ def workers_route(self, request):
130137
self._validate(run=name, view=view)
131138
run = self._get_run(name)
132139
self._check_run(run, name)
133-
workers = run.get_workers(view)
134140
return self.respond_as_json(run.get_workers(view))
135141

136142
@wrappers.Request.application
@@ -305,19 +311,22 @@ def _monitor_runs(self):
305311
logger.debug("Scan run dir")
306312
run_dirs = self._get_run_dirs()
307313

314+
has_dir = False
308315
# Assume no deletion on run directories, trigger async load if find a new run
309316
for run_dir in run_dirs:
310-
# Set _is_active quickly based on file pattern match, don't wait for data loading
311-
if not self._is_active:
312-
self._is_active = True
313-
self._is_active_initialized_event.set()
314-
317+
has_dir = True
315318
if run_dir not in touched:
316319
touched.add(run_dir)
317320
logger.info("Find run directory %s", run_dir)
318321
# Use threading to avoid UI stall and reduce data parsing time
319322
t = threading.Thread(target=self._load_run, args=(run_dir,))
320323
t.start()
324+
with self._load_lock:
325+
self._load_threads.append(t)
326+
327+
if not has_dir:
328+
# handle directory removed case.
329+
self._runs.clear()
321330
except Exception as ex:
322331
logger.warning("Failed to scan runs. Exception=%s", ex, exc_info=True)
323332

@@ -338,11 +347,6 @@ def _receive_runs(self):
338347
if is_new:
339348
self._runs = OrderedDict(sorted(self._runs.items()))
340349

341-
# Update is_active
342-
if not self._is_active:
343-
self._is_active = True
344-
self._is_active_initialized_event.set()
345-
346350
def _get_run_dirs(self):
347351
"""Scan logdir, find PyTorch Profiler run directories.
348352
A directory is considered to be a run if it contains 1 or more *.pt.trace.json[.gz].
@@ -371,6 +375,13 @@ def _load_run(self, run_dir):
371375
except Exception as ex:
372376
logger.warning("Failed to load run %s. Exception=%s", ex, name, exc_info=True)
373377

378+
t = threading.current_thread()
379+
with self._load_lock:
380+
try:
381+
self._load_threads.remove(t)
382+
except ValueError:
383+
logger.warning("could not find the thread {}".format(run_dir))
384+
374385
def _get_run(self, name) -> Run:
375386
with self._runs_lock:
376387
return self._runs.get(name, None)

tb_plugin/torch_tb_profiler/static/index.html

+1-1
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)