Skip to content

Commit 7e5ea61

Browse files
btiantiangles
btian
authored and committed
feat: gpu utilization info for daemon
1 parent 325cba1 commit 7e5ea61

File tree

3 files changed

+27
-2
lines changed

3 files changed

+27
-2
lines changed

modules/api/daemon_api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,15 @@ def __init__(self, app: FastAPI):
2626

2727
@staticmethod
2828
def get_task_count():
29-
current_task, pending_tasks, _, finished_task_count, failed_task_count, consecutive_failed_task_count, last_error_message = modules.progress.get_task_queue_info()
29+
current_task, pending_tasks, _, finished_task_count, failed_task_count, consecutive_failed_task_count, last_error_message, gpu_utilization = modules.progress.get_task_queue_info()
3030
return GetTaskCountResponse(
3131
current_task=current_task if current_task else '',
3232
queued_tasks=pending_tasks,
3333
finished_task_count=finished_task_count,
3434
failed_task_count=failed_task_count,
3535
consecutive_failed_task_count=consecutive_failed_task_count,
3636
last_error_message=last_error_message,
37+
gpu_utilization=gpu_utilization,
3738
)
3839

3940
def _add_api_route(self, path: str, endpoint, **kwargs):

modules/api/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ class GetTaskCountResponse(BaseModel):
334334
failed_task_count: int = Field(title="FailedTaskCount")
335335
consecutive_failed_task_count: int = Field(title="ConsecutiveFailedTaskCount")
336336
last_error_message: str = Field(title="CurrentTask")
337+
gpu_utilization: float = Field(title="GPUUtilization")
337338

338339

339340
class ExtensionItem(BaseModel):

modules/progress.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,24 @@
3434

3535
logger = logging.getLogger(__name__)
3636

37+
# info used to calculate GPU utilization
38+
_service_begin_time = time.time()
39+
_busy_time = 0
40+
_tasks_start_at = {} # id_task -> time in second
41+
42+
43+
def _make_gpu_utilization():
    """Return the ratio of accumulated task busy time to service uptime.

    Busy time is the module-level ``_busy_time`` total (incremented in
    ``finish_task``) plus the elapsed time of the task that is currently
    running, if any. Uptime is measured from ``_service_begin_time``.

    Returns:
        float: ``busy_time / uptime``. Returns ``0.0`` when no measurable
        uptime has elapsed yet, instead of raising ``ZeroDivisionError``.
    """
    curr = time.time()
    busy_time = _busy_time

    # The running task has not been added to _busy_time yet, so count its
    # elapsed time here. An id missing from _tasks_start_at contributes 0.
    if current_task:
        busy_time += curr - _tasks_start_at.get(current_task, curr)

    uptime = curr - _service_begin_time
    if uptime <= 0:
        # Either we were called within the clock's resolution of module
        # import, or the wall clock stepped backwards; no meaningful ratio.
        return 0.0

    return busy_time / uptime
3752

3853
def get_task_queue_info():
39-
return current_task, pending_tasks, finished_tasks, finished_task_count, failed_task_count, consecutive_failed_task_count, last_error_message
54+
return current_task, pending_tasks, finished_tasks, finished_task_count, failed_task_count, consecutive_failed_task_count, last_error_message, _make_gpu_utilization()
4055

4156

4257
def start_task(id_task):
@@ -49,6 +64,7 @@ def start_task(id_task):
4964

5065
task_info = _pop_task_from_queue(id_task)
5166
task_info['started_at'] = time.time()
67+
_tasks_start_at[id_task] = time.time()
5268

5369
return task_info
5470

@@ -68,9 +84,16 @@ def finish_task(id_task, task_failed=False, error_message=''):
6884
global finished_tasks
6985
global failed_tasks
7086
global last_error_message
87+
global _busy_time
88+
7189
logger.info(
7290
f'finish_task, id_task: {id_task}, current_task: {current_task}, current_task_step: {current_task_step}')
7391

92+
# record gpu busy time
93+
curr = time.time()
94+
task_started_at = _tasks_start_at.pop(id_task, curr)
95+
_busy_time += (curr - task_started_at)
96+
7497
# if a task was finished before start, we need pop it out from pending queue
7598
_pop_task_from_queue(id_task)
7699

0 commit comments

Comments
 (0)