Skip to content

Commit 5d16c65

Browse files
WescoeurNambrok
authored and committed
feat(linstor-manager): add error codes to healthCheck helper
Signed-off-by: Ronan Abhamon <[email protected]>
1 parent 6cab1f9 commit 5d16c65

File tree

2 files changed

+120
-35
lines changed

2 files changed

+120
-35
lines changed

Diff for: drivers/linstor-manager

+119-34
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import socket
2727
import XenAPI
2828
import XenAPIPlugin
2929

30+
from json import JSONEncoder
3031
from linstorjournaler import LinstorJournaler
3132
from linstorvhdutil import LinstorVhdUtil
3233
from linstorvolumemanager import get_controller_uri, get_local_volume_openers, LinstorVolumeManager
@@ -877,6 +878,64 @@ def get_drbd_openers(session, args):
877878
raise
878879

879880

881+
class HealthCheckError(object):
    """Structured entry reported by the health check, identified by a code.

    A code is the bitwise OR of three fields:

    - bits 24-26 (``MASK_REPORT_LEVEL``): report level (warning or error),
    - bits 16-23 (``MASK_TYPE``): kind of entity the entry refers to,
    - bits 0-15 (``MASK_VALUE``): entry number within that type.

    ``MAP_CODE_TO_PARAMS`` lists, for each code, the keyword parameters
    that may be attached to the entry.
    """

    # One-element tuple: a bare ('data') is just the string 'data'.
    __slots__ = ('data',)

    MASK_REPORT_LEVEL = 0x7000000
    MASK_TYPE = 0xFF0000
    MASK_VALUE = 0xFFFF

    # Bits 24-26: report level.
    REPORT_LEVEL_WARN = 0x1000000
    REPORT_LEVEL_ERR = 0x2000000

    # Bits 16-23: entity type.
    TYPE_GENERIC = 0x10000
    TYPE_NODE = 0x20000
    TYPE_STORAGE_POOL = 0x30000
    TYPE_VOLUME = 0x40000
    TYPE_RESOURCE = 0x50000

    # Bits 0-15: entry number, unique within a given type.
    GENERIC_UNEXPECTED = REPORT_LEVEL_ERR | TYPE_GENERIC | 0
    GENERIC_LINSTOR_UNREACHABLE = REPORT_LEVEL_ERR | TYPE_GENERIC | 1

    NODE_NOT_ONLINE = REPORT_LEVEL_WARN | TYPE_NODE | 0

    STORAGE_POOL_UNKNOWN_FREE_SIZE = REPORT_LEVEL_ERR | TYPE_STORAGE_POOL | 0
    STORAGE_POOL_UNKNOWN_CAPACITY = REPORT_LEVEL_ERR | TYPE_STORAGE_POOL | 1
    STORAGE_POOL_LOW_FREE_SIZE = REPORT_LEVEL_WARN | TYPE_STORAGE_POOL | 2

    VOLUME_UNKNOWN_STATE = REPORT_LEVEL_WARN | TYPE_VOLUME | 0
    VOLUME_INVALID_STATE = REPORT_LEVEL_ERR | TYPE_VOLUME | 1
    VOLUME_WRONG_DISKLESS_STATE = REPORT_LEVEL_WARN | TYPE_VOLUME | 2
    VOLUME_INTERNAL_UNVERIFIED_STATE = REPORT_LEVEL_WARN | TYPE_VOLUME | 3

    # Keyword parameters accepted by __init__ for each code.
    MAP_CODE_TO_PARAMS = {
        GENERIC_UNEXPECTED: {'message'},
        GENERIC_LINSTOR_UNREACHABLE: {'message'},
        NODE_NOT_ONLINE: {'name', 'status'},
        STORAGE_POOL_UNKNOWN_FREE_SIZE: {'name'},
        STORAGE_POOL_UNKNOWN_CAPACITY: {'name'},
        STORAGE_POOL_LOW_FREE_SIZE: {'name', 'threshold'},
        VOLUME_UNKNOWN_STATE: {'node', 'resource', 'number'},
        VOLUME_INVALID_STATE: {'node', 'resource', 'number', 'state'},
        VOLUME_WRONG_DISKLESS_STATE: {'node', 'resource', 'number', 'state'},
        VOLUME_INTERNAL_UNVERIFIED_STATE: {'node', 'resource', 'number', 'state'}
    }

    def __init__(self, code, **kwargs):
        """Build an entry for `code` carrying the given parameters.

        :param code: One of the code constants of this class.
        :param kwargs: Parameters attached to the entry; each name must be
            listed in ``MAP_CODE_TO_PARAMS[code]``.
        :raises KeyError: If `code` is not a key of ``MAP_CODE_TO_PARAMS``.
        :raises TypeError: If a parameter is not allowed for `code`.
        """
        allowed = self.MAP_CODE_TO_PARAMS[code]

        # Explicit check rather than `assert`: assertions are removed when
        # Python runs with -O, which would let bad parameters through.
        unexpected = sorted(name for name in kwargs if name not in allowed)
        if unexpected:
            raise TypeError(
                'Unexpected parameter(s) for health check code 0x{:X}: {}'
                .format(code, ', '.join(unexpected))
            )

        data = {'code': code}
        data.update(kwargs)
        self.data = data

    def to_json(self):
        """Return the dict (code + parameters) used to serialize this entry.

        The internal dict itself is returned, not a copy.
        """
        return self.data
937+
938+
880939
def health_check(session, args):
881940
group_name = args['groupName']
882941

@@ -885,11 +944,15 @@ def health_check(session, args):
885944
'nodes': {},
886945
'storage-pools': {},
887946
'resources': {},
888-
'warnings': [],
889947
'errors': []
890948
}
891949

892950
def format_result():
    """Serialize the enclosing `result` dict to a JSON string.

    Objects whose class defines a `to_json` method are serialized through
    it. Any other object that `json` cannot encode natively raises
    TypeError, as with the stock encoder.
    """
    def _to_json(obj):
        # Per-call `default=` hook: avoids reassigning JSONEncoder.default,
        # which would change JSON encoding for the whole process.
        to_json = getattr(obj.__class__, 'to_json', None)
        if to_json is None:
            raise TypeError(repr(obj) + ' is not JSON serializable')
        return to_json(obj)

    return json.dumps(result, default=_to_json)
894957

895958
# 1. Get controller.
@@ -912,7 +975,10 @@ def health_check(session, args):
912975
)
913976
except Exception as e:
914977
# Probably a network issue, or offline controller.
915-
result['errors'].append('Cannot join SR: `{}`.'.format(e))
978+
result['errors'].append(HealthCheckError(
979+
code=HealthCheckError.GENERIC_LINSTOR_UNREACHABLE,
980+
message=str(e)
981+
))
916982
return format_result()
917983

918984
try:
@@ -921,7 +987,11 @@ def health_check(session, args):
921987
result['nodes'] = nodes
922988
for node_name, status in nodes.items():
923989
if status != 'ONLINE':
924-
result['warnings'].append('Node `{}` is {}.'.format(node_name, status))
990+
result['errors'].append(HealthCheckError(
991+
code=HealthCheckError.NODE_NOT_ONLINE,
992+
name=node_name,
993+
status=status
994+
))
925995

926996
# 3. Check storage pool statuses.
927997
storage_pools_per_node = linstor.get_storage_pools_info()
@@ -931,23 +1001,25 @@ def health_check(session, args):
9311001
free_size = storage_pool['free-size']
9321002
capacity = storage_pool['capacity']
9331003
if free_size < 0 or capacity <= 0:
934-
result['errors'].append(
935-
'Cannot get free size and/or capacity of storage pool `{}`.'
936-
.format(storage_pool['uuid'])
937-
)
938-
elif free_size > capacity:
939-
result['errors'].append(
940-
'Free size of storage pool `{}` is greater than capacity.'
941-
.format(storage_pool['uuid'])
942-
)
1004+
if free_size < 0:
1005+
result['errors'].append(HealthCheckError(
1006+
code=HealthCheckError.STORAGE_POOL_UNKNOWN_FREE_SIZE,
1007+
name=storage_pool['name']
1008+
))
1009+
elif capacity < 0:
1010+
result['errors'].append(HealthCheckError(
1011+
code=HealthCheckError.STORAGE_POOL_UNKNOWN_CAPACITY,
1012+
name=storage_pool['name']
1013+
))
9431014
else:
9441015
remaining_percent = free_size / float(capacity) * 100.0
9451016
threshold = 10.0
9461017
if remaining_percent < threshold:
947-
result['warnings'].append(
948-
'Remaining size of storage pool `{}` is below {}% of its capacity.'
949-
.format(storage_pool['uuid'], threshold)
950-
)
1018+
result['errors'].append(HealthCheckError(
1019+
code=HealthCheckError.STORAGE_POOL_LOW_FREE_SIZE,
1020+
name=storage_pool['name'],
1021+
threshold=threshold
1022+
))
9511023

9521024
# 4. Check resource statuses.
9531025
all_resources = linstor.get_resources_info()
@@ -960,33 +1032,46 @@ def health_check(session, args):
9601032
if disk_state in ['UpToDate', 'Created', 'Attached']:
9611033
continue
9621034
if disk_state == 'DUnknown':
963-
result['warnings'].append(
964-
'Unknown state for volume `{}` at index {} for resource `{}` on node `{}`'
965-
.format(volume['device-path'], volume_index, resource_name, node_name)
966-
)
1035+
result['errors'].append(HealthCheckError(
1036+
code=HealthCheckError.VOLUME_UNKNOWN_STATE,
1037+
node=node_name,
1038+
resource=resource_name,
1039+
number=volume_index
1040+
))
9671041
continue
9681042
if disk_state in ['Inconsistent', 'Failed', 'To: Creating', 'To: Attachable', 'To: Attaching']:
969-
result['errors'].append(
970-
'Invalid state `{}` for volume `{}` at index {} for resource `{}` on node `{}`'
971-
.format(disk_state, volume['device-path'], volume_index, resource_name, node_name)
972-
)
1043+
result['errors'].append(HealthCheckError(
1044+
code=HealthCheckError.VOLUME_INVALID_STATE,
1045+
node=node_name,
1046+
resource=resource_name,
1047+
number=volume_index,
1048+
state=disk_state
1049+
))
9731050
continue
9741051
if disk_state == 'Diskless':
9751052
if resource['diskful']:
976-
result['errors'].append(
977-
'Unintentional diskless state detected for volume `{}` at index {} for resource `{}` on node `{}`'
978-
.format(volume['device-path'], volume_index, resource_name, node_name)
979-
)
1053+
result['errors'].append(HealthCheckError(
1054+
code=HealthCheckError.VOLUME_WRONG_DISKLESS_STATE,
1055+
node=node_name,
1056+
resource=resource_name,
1057+
number=volume_index,
1058+
state=disk_state
1059+
))
9801060
elif resource['tie-breaker']:
9811061
volume['disk-state'] = 'TieBreaker'
9821062
continue
983-
result['warnings'].append(
984-
'Unhandled state `{}` for volume `{}` at index {} for resource `{}` on node `{}`'
985-
.format(disk_state, volume['device-path'], volume_index, resource_name, node_name)
986-
)
987-
1063+
result['errors'].append(HealthCheckError(
1064+
code=HealthCheckError.VOLUME_INTERNAL_UNVERIFIED_STATE,
1065+
node=node_name,
1066+
resource=resource_name,
1067+
number=volume_index,
1068+
state=disk_state
1069+
))
9881070
except Exception as e:
989-
result['errors'].append('Unexpected error: `{}`'.format(e))
1071+
result['errors'].append(HealthCheckError(
1072+
code=HealthCheckError.GENERIC_UNEXPECTED,
1073+
message=str(e)
1074+
))
9901075

9911076
return format_result()
9921077

Diff for: drivers/linstorvolumemanager.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1623,7 +1623,7 @@ def get_storage_pools_info(self):
16231623
capacity *= 1024
16241624

16251625
storage_pools[pool.node_name].append({
1626-
'storage-pool-name': pool.name,
1626+
'name': pool.name,
16271627
'linstor-uuid': pool.uuid,
16281628
'free-size': size,
16291629
'capacity': capacity

0 commit comments

Comments
 (0)