@@ -27,6 +27,7 @@ import socket
27
27
import XenAPI
28
28
import XenAPIPlugin
29
29
30
+ from json import JSONEncoder
30
31
from linstorjournaler import LinstorJournaler
31
32
from linstorvhdutil import LinstorVhdUtil
32
33
from linstorvolumemanager import get_controller_uri , get_local_volume_openers , LinstorVolumeManager
@@ -877,6 +878,64 @@ def get_drbd_openers(session, args):
877
878
raise
878
879
879
880
881
class HealthCheckError(object):
    """Structured, JSON-friendly description of one health check finding.

    Each finding is identified by an integer code built from three
    bit fields OR-ed together:

    - bits 24-26: report level (see ``MASK_REPORT_LEVEL``),
    - bits 16-23: type of the concerned object (see ``MASK_TYPE``),
    - bits 0-15: value identifying the finding inside its type
      (see ``MASK_VALUE``).

    ``MAP_CODE_TO_PARAMS`` lists, for every known code, the names of the
    extra parameters that may be attached to the finding.
    """

    # A one-element tuple: a bare string is only accepted by accident.
    __slots__ = ('data',)

    MASK_REPORT_LEVEL = 0x7000000
    MASK_TYPE = 0xFF0000
    MASK_VALUE = 0xFFFF

    # Bits 24-26: report level.
    REPORT_LEVEL_WARN = 0x1000000
    REPORT_LEVEL_ERR = 0x2000000

    # Bits 16-23: object type.
    TYPE_GENERIC = 0x10000
    TYPE_NODE = 0x20000
    TYPE_STORAGE_POOL = 0x30000
    TYPE_VOLUME = 0x40000
    TYPE_RESOURCE = 0x50000

    # Bits 0-15: value, unique inside a given type.
    GENERIC_UNEXPECTED = REPORT_LEVEL_ERR | TYPE_GENERIC | 0
    GENERIC_LINSTOR_UNREACHABLE = REPORT_LEVEL_ERR | TYPE_GENERIC | 1

    NODE_NOT_ONLINE = REPORT_LEVEL_WARN | TYPE_NODE | 0

    STORAGE_POOL_UNKNOWN_FREE_SIZE = REPORT_LEVEL_ERR | TYPE_STORAGE_POOL | 0
    STORAGE_POOL_UNKNOWN_CAPACITY = REPORT_LEVEL_ERR | TYPE_STORAGE_POOL | 1
    STORAGE_POOL_LOW_FREE_SIZE = REPORT_LEVEL_WARN | TYPE_STORAGE_POOL | 2

    VOLUME_UNKNOWN_STATE = REPORT_LEVEL_WARN | TYPE_VOLUME | 0
    VOLUME_INVALID_STATE = REPORT_LEVEL_ERR | TYPE_VOLUME | 1
    VOLUME_WRONG_DISKLESS_STATE = REPORT_LEVEL_WARN | TYPE_VOLUME | 2
    VOLUME_INTERNAL_UNVERIFIED_STATE = REPORT_LEVEL_WARN | TYPE_VOLUME | 3

    # Names of the parameters accepted by each code.
    MAP_CODE_TO_PARAMS = {
        GENERIC_UNEXPECTED: {'message'},
        GENERIC_LINSTOR_UNREACHABLE: {'message'},
        NODE_NOT_ONLINE: {'name', 'status'},
        STORAGE_POOL_UNKNOWN_FREE_SIZE: {'name'},
        STORAGE_POOL_UNKNOWN_CAPACITY: {'name'},
        STORAGE_POOL_LOW_FREE_SIZE: {'name', 'threshold'},
        VOLUME_UNKNOWN_STATE: {'node', 'resource', 'number'},
        VOLUME_INVALID_STATE: {'node', 'resource', 'number', 'state'},
        VOLUME_WRONG_DISKLESS_STATE: {'node', 'resource', 'number', 'state'},
        VOLUME_INTERNAL_UNVERIFIED_STATE: {'node', 'resource', 'number', 'state'}
    }

    def __init__(self, code, **kwargs):
        """Build a finding for `code` with its optional parameters.

        :param code: One of the codes listed in ``MAP_CODE_TO_PARAMS``.
        :param kwargs: Parameters attached to the finding. Each name must
            be allowed for `code`; providing all of them is not enforced.
        :raises KeyError: If `code` is not a known code.
        :raises TypeError: If a parameter name is not allowed for `code`.
        """
        allowed = self.MAP_CODE_TO_PARAMS[code]

        # Explicit check instead of `assert`, which is removed when
        # Python runs with -O and would let bad parameters through.
        unexpected = set(kwargs) - allowed
        if unexpected:
            raise TypeError(
                'Unexpected parameter(s) for health check code {:#x}: {}'
                .format(code, ', '.join(sorted(unexpected)))
            )

        data = {'code': code}
        data.update(kwargs)
        self.data = data

    def __repr__(self):
        return 'HealthCheckError({!r})'.format(self.data)

    def to_json(self):
        """Return the dict (code + parameters) to serialize as JSON."""
        return self.data
937
+
938
+
880
939
def health_check (session , args ):
881
940
group_name = args ['groupName' ]
882
941
@@ -885,11 +944,15 @@ def health_check(session, args):
885
944
'nodes' : {},
886
945
'storage-pools' : {},
887
946
'resources' : {},
888
- 'warnings' : [],
889
947
'errors' : []
890
948
}
891
949
892
950
def format_result():
    """Serialize the enclosing `result` dict to a JSON string.

    Objects that are not natively serializable are converted through
    the `to_json` method of their class when it exists.
    """
    def _to_json(obj):
        # Use the hook of this single `dumps` call instead of patching
        # `JSONEncoder.default`: the patch was process-wide and stacked
        # one more wrapper on every call.
        to_json = getattr(obj.__class__, 'to_json', None)
        if to_json is None:
            raise TypeError('{!r} is not JSON serializable'.format(obj))
        return to_json(obj)

    return json.dumps(result, default=_to_json)
894
957
895
958
# 1. Get controller.
@@ -912,7 +975,10 @@ def health_check(session, args):
912
975
)
913
976
except Exception as e :
914
977
# Probably a network issue, or offline controller.
915
- result ['errors' ].append ('Cannot join SR: `{}`.' .format (e ))
978
+ result ['errors' ].append (HealthCheckError (
979
+ code = HealthCheckError .GENERIC_LINSTOR_UNREACHABLE ,
980
+ message = str (e )
981
+ ))
916
982
return format_result ()
917
983
918
984
try :
@@ -921,7 +987,11 @@ def health_check(session, args):
921
987
result ['nodes' ] = nodes
922
988
for node_name , status in nodes .items ():
923
989
if status != 'ONLINE' :
924
- result ['warnings' ].append ('Node `{}` is {}.' .format (node_name , status ))
990
+ result ['errors' ].append (HealthCheckError (
991
+ code = HealthCheckError .NODE_NOT_ONLINE ,
992
+ name = node_name ,
993
+ status = status
994
+ ))
925
995
926
996
# 3. Check storage pool statuses.
927
997
storage_pools_per_node = linstor .get_storage_pools_info ()
@@ -931,23 +1001,25 @@ def health_check(session, args):
931
1001
free_size = storage_pool ['free-size' ]
932
1002
capacity = storage_pool ['capacity' ]
933
1003
if free_size < 0 or capacity <= 0 :
934
- result ['errors' ].append (
935
- 'Cannot get free size and/or capacity of storage pool `{}`.'
936
- .format (storage_pool ['uuid' ])
937
- )
938
- elif free_size > capacity :
939
- result ['errors' ].append (
940
- 'Free size of storage pool `{}` is greater than capacity.'
941
- .format (storage_pool ['uuid' ])
942
- )
1004
+ if free_size < 0 :
1005
+ result ['errors' ].append (HealthCheckError (
1006
+ code = HealthCheckError .STORAGE_POOL_UNKNOWN_FREE_SIZE ,
1007
+ name = storage_pool ['name' ]
1008
+ ))
1009
+ elif capacity < 0 :
1010
+ result ['errors' ].append (HealthCheckError (
1011
+ code = HealthCheckError .STORAGE_POOL_UNKNOWN_CAPACITY ,
1012
+ name = storage_pool ['name' ]
1013
+ ))
943
1014
else :
944
1015
remaining_percent = free_size / float (capacity ) * 100.0
945
1016
threshold = 10.0
946
1017
if remaining_percent < threshold :
947
- result ['warnings' ].append (
948
- 'Remaining size of storage pool `{}` is below {}% of its capacity.'
949
- .format (storage_pool ['uuid' ], threshold )
950
- )
1018
+ result ['errors' ].append (HealthCheckError (
1019
+ code = HealthCheckError .STORAGE_POOL_LOW_FREE_SIZE ,
1020
+ name = storage_pool ['name' ],
1021
+ threshold = threshold
1022
+ ))
951
1023
952
1024
# 4. Check resource statuses.
953
1025
all_resources = linstor .get_resources_info ()
@@ -960,33 +1032,46 @@ def health_check(session, args):
960
1032
if disk_state in ['UpToDate' , 'Created' , 'Attached' ]:
961
1033
continue
962
1034
if disk_state == 'DUnknown' :
963
- result ['warnings' ].append (
964
- 'Unknown state for volume `{}` at index {} for resource `{}` on node `{}`'
965
- .format (volume ['device-path' ], volume_index , resource_name , node_name )
966
- )
1035
+ result ['errors' ].append (HealthCheckError (
1036
+ code = HealthCheckError .VOLUME_UNKNOWN_STATE ,
1037
+ node = node_name ,
1038
+ resource = resource_name ,
1039
+ number = volume_index
1040
+ ))
967
1041
continue
968
1042
if disk_state in ['Inconsistent' , 'Failed' , 'To: Creating' , 'To: Attachable' , 'To: Attaching' ]:
969
- result ['errors' ].append (
970
- 'Invalid state `{}` for volume `{}` at index {} for resource `{}` on node `{}`'
971
- .format (disk_state , volume ['device-path' ], volume_index , resource_name , node_name )
972
- )
1043
+ result ['errors' ].append (HealthCheckError (
1044
+ code = HealthCheckError .VOLUME_INVALID_STATE ,
1045
+ node = node_name ,
1046
+ resource = resource_name ,
1047
+ number = volume_index ,
1048
+ state = disk_state
1049
+ ))
973
1050
continue
974
1051
if disk_state == 'Diskless' :
975
1052
if resource ['diskful' ]:
976
- result ['errors' ].append (
977
- 'Unintentional diskless state detected for volume `{}` at index {} for resource `{}` on node `{}`'
978
- .format (volume ['device-path' ], volume_index , resource_name , node_name )
979
- )
1053
+ result ['errors' ].append (HealthCheckError (
1054
+ code = HealthCheckError .VOLUME_WRONG_DISKLESS_STATE ,
1055
+ node = node_name ,
1056
+ resource = resource_name ,
1057
+ number = volume_index ,
1058
+ state = disk_state
1059
+ ))
980
1060
elif resource ['tie-breaker' ]:
981
1061
volume ['disk-state' ] = 'TieBreaker'
982
1062
continue
983
- result ['warnings' ].append (
984
- 'Unhandled state `{}` for volume `{}` at index {} for resource `{}` on node `{}`'
985
- .format (disk_state , volume ['device-path' ], volume_index , resource_name , node_name )
986
- )
987
-
1063
+ result ['errors' ].append (HealthCheckError (
1064
+ code = HealthCheckError .VOLUME_INTERNAL_UNVERIFIED_STATE ,
1065
+ node = node_name ,
1066
+ resource = resource_name ,
1067
+ number = volume_index ,
1068
+ state = disk_state
1069
+ ))
988
1070
except Exception as e :
989
- result ['errors' ].append ('Unexpected error: `{}`' .format (e ))
1071
+ result ['errors' ].append (HealthCheckError (
1072
+ code = HealthCheckError .GENERIC_UNEXPECTED ,
1073
+ message = str (e )
1074
+ ))
990
1075
991
1076
return format_result ()
992
1077
0 commit comments