Skip to content

Commit a9a6923

Browse files
niklubnik
andauthored
feat: CustomInterface parser (#681)
Co-authored-by: nik <[email protected]>
1 parent 4a5949e commit a9a6923

File tree

3 files changed

+368
-0
lines changed

3 files changed

+368
-0
lines changed

src/label_studio_sdk/label_interface/control_tags.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"textarea": "TextAreaTag",
4141
"timeserieslabels": "TimeSeriesLabelsTag",
4242
"chatmessage": "ChatMessageTag",
43+
"custominterface": "CustomInterfaceTag",
4344
}
4445

4546

@@ -1098,3 +1099,205 @@ class TimeSeriesLabelsTag(ControlTag):
10981099
tag: str = "TimeSeriesLabels"
10991100
_label_attr_name: str = "timeserieslabels"
11001101
_value_class: Type[TimeSeriesValue] = TimeSeriesValue
1102+
1103+
1104+
class CustomInterfaceValue(BaseModel):
    """Value payload associated with a ``CustomInterface`` control tag.

    The single ``custominterface`` field is a mapping with string keys
    (presumably the output field names declared on the tag -- confirm against
    the frontend payload).
    """

    # Values are intentionally not restricted to ``str``: the schema produced
    # by ``CustomInterfaceTag.to_json_schema`` can declare number, integer and
    # array properties, so a string-only mapping would not accept values that
    # conform to the tag's own schema.
    custominterface: Dict[str, object]
1106+
1107+
def _custom_interface_options(args):
    """Return the non-empty, whitespace-stripped entries of ``args``."""
    return [option for option in (arg.strip() for arg in args) if option]


def _custom_interface_choices(args):
    """Build the schema fragment for ``choices(...)``: a string, optionally an enum."""
    schema = {"type": "string"}
    options = _custom_interface_options(args)
    if options:
        # An empty ``enum`` would reject every value, so it is only emitted
        # when at least one option was actually supplied.
        schema["enum"] = options
    return schema


def _custom_interface_multichoices(args):
    """Build the schema fragment for ``multichoices(...)``: an array of choices."""
    return {"type": "array", "items": _custom_interface_choices(args)}


def _custom_interface_number(args):
    """Build the schema fragment for ``number(min, max)``; both bounds are optional.

    Raises:
        ValueError: If a supplied bound is not a valid float literal.
    """
    bounds = [arg.strip() for arg in args]
    schema = {"type": "number"}
    if len(bounds) > 0 and bounds[0]:
        schema["minimum"] = float(bounds[0])
    if len(bounds) > 1 and bounds[1]:
        schema["maximum"] = float(bounds[1])
    return schema


def _custom_interface_rating(args):
    """Build the schema fragment for ``rating(max)``: an integer from 1 to ``max`` (default 5).

    Raises:
        ValueError: If the supplied maximum is not a valid integer literal.
    """
    upper = args[0].strip() if args else ""
    return {
        "type": "integer",
        "minimum": 1,
        "maximum": int(upper) if upper else 5,
    }


class CustomInterfaceTag(ControlTag):
    """Control tag for ``<CustomInterface>``.

    The tag's ``outputs`` attribute describes the fields the interface
    produces; ``to_json_schema`` converts that description into a JSON Schema
    object. ``outputs`` may be a JSON object, or a delimited list of field
    names with optional ``name:alias(args)`` type annotations.
    """

    tag: str = "CustomInterface"
    _value_class: Type[CustomInterfaceValue] = CustomInterfaceValue
    _label_attr_name: str = "custominterface"

    # Registry of type aliases that can be used in the outputs specification.
    # Each alias maps to a callable that takes the list of (string) arguments
    # and returns a JSON schema fragment.
    _TYPE_ALIASES = {
        'choices': _custom_interface_choices,
        'multichoices': _custom_interface_multichoices,
        'number': _custom_interface_number,
        'rating': _custom_interface_rating,
    }

    def _parse_type_alias(self, value: str) -> dict:
        """
        Parse a type alias such as 'choices(label1, label2)' into a JSON schema fragment.

        The argument list is optional: 'rating', 'rating()' and 'rating(10)'
        are all recognised. Alias names are matched case-insensitively.

        Args:
            value: A string that may contain a type alias with arguments.

        Returns:
            dict: The schema fragment built by the matching alias, or
                {"type": "string"} when the text is not a known alias.

        Raises:
            ValueError: If a numeric alias ('number', 'rating') receives an
                argument that is not a valid number.
        """
        import re

        # "alias_name" optionally followed by "(arg1, arg2, ...)". The argument
        # group is optional so that bare aliases reach their builders, which
        # already define defaults for missing arguments.
        match = re.match(r'^(\w+)\s*(?:\(\s*(.*?)\s*\))?$', value.strip())
        if match:
            builder = self._TYPE_ALIASES.get(match.group(1).lower())
            if builder is not None:
                args_str = match.group(2)
                # No parentheses (None) or empty parentheses ("") both mean "no arguments".
                args = [arg.strip() for arg in args_str.split(',')] if args_str else []
                return builder(args)

        # Default to string type if no alias matched
        return {"type": "string"}

    # The return annotation is quoted so it is never evaluated at definition
    # time; an unquoted ``dict | None`` raises TypeError on Python < 3.10.
    def _try_parse_json(self, outputs_str: str) -> "dict | None":
        """
        Attempt to parse the outputs string as a JSON object.

        Only text whose first non-whitespace character is '{' is handed to the
        JSON parser; anything else is treated as "not JSON".

        Args:
            outputs_str: The raw outputs string from the tag configuration.

        Returns:
            dict or None: Parsed JSON object if valid, None otherwise.
        """
        import json

        stripped = outputs_str.strip()
        if not stripped.startswith('{'):
            return None
        try:
            return json.loads(stripped)
        except json.JSONDecodeError:
            return None

    def _parse_delimited_list(self, outputs_str: str) -> list:
        """
        Split the outputs string into individual field specifications.

        Delimiters are comma, semicolon, vertical bar, tab and newline. A
        plain space is NOT a delimiter. Delimiters inside parentheses are
        kept, so alias arguments such as 'choices(a, b)' stay in one piece.
        Each part is stripped and empty parts are dropped.

        Args:
            outputs_str: The raw outputs string to parse.

        Returns:
            list: List of parsed output field names/definitions.
        """
        parts = []
        current = []
        paren_depth = 0

        for char in outputs_str:
            if char in ',;|\t\n' and paren_depth == 0:
                # Delimiter found outside parentheses: close the current part.
                part = ''.join(current).strip()
                if part:
                    parts.append(part)
                current = []
                continue

            if char == '(':
                paren_depth += 1
            elif char == ')':
                # Clamp at zero so a stray ')' cannot push the depth negative
                # and disable splitting for the remainder of the string.
                paren_depth = max(paren_depth - 1, 0)
            current.append(char)

        # Don't forget the last part
        part = ''.join(current).strip()
        if part:
            parts.append(part)

        return parts

    def _parse_output_field(self, field_spec: str) -> tuple:
        """
        Parse a single output field specification.

        Handles formats like:
        - "field_name" -> (field_name, {"type": "string"})
        - "field_name:choices(a,b,c)" -> (field_name, {"type": "string", "enum": ["a","b","c"]})

        The specification is split on the first colon only. A specification
        without a colon is used verbatim as the field name, even if it looks
        like a standalone alias call, because no field name can be derived
        from it.

        Args:
            field_spec: A single field specification string.

        Returns:
            tuple: (field_name, json_schema_fragment)
        """
        field_spec = field_spec.strip()

        name, separator, type_spec = field_spec.partition(':')
        if separator:
            return (name.strip(), self._parse_type_alias(type_spec.strip()))

        # Plain field name defaults to string type
        return (field_spec, {"type": "string"})

    def to_json_schema(self):
        """
        Converts the current CustomInterfaceTag instance into a JSON Schema.

        The 'outputs' attribute is interpreted as follows:

        1. JSON: If 'outputs' starts with '{' and is valid JSON, it is used
           directly. An object containing both "type" and "properties" is
           returned unchanged; any other object is treated as the properties
           mapping and wrapped in an object schema.
           Example: '{"field1": {"type": "number"}, "field2": {"type": "string"}}'

        2. Delimited list: Otherwise 'outputs' is split on comma, semicolon,
           pipe, tab or newline, and each field becomes a string property.
           Example: "field1, field2, field3" or "field1|field2|field3"

        3. Type aliases: Fields in a delimited list may carry a type.
           - "field:choices(a,b,c)" -> enum with string type
           - "field:multichoices(a,b,c)" -> array of enum values
           - "field:number(min,max)" -> number with optional bounds
           - "field:rating(max)" -> integer from 1 to max (default 5)
           Example: "rating:choices(good, bad), tags:multichoices(urgent, review)"

        Returns:
            dict: A dictionary representing the JSON Schema with properties for each output.
        """
        outputs_str = self.attr.get('outputs', '')

        if not outputs_str or not outputs_str.strip():
            return {"type": "object", "properties": {}}

        # JSON takes precedence over the delimited-list syntax.
        json_schema = self._try_parse_json(outputs_str)
        if json_schema is not None:
            # If it's already a complete schema, return it
            if "type" in json_schema and "properties" in json_schema:
                return json_schema
            # If it's just properties, wrap them
            return {"type": "object", "properties": json_schema}

        # Delimited list with optional type aliases. Fields whose name is
        # empty (e.g. ":choices(a)") are skipped.
        properties = {}
        for field_spec in self._parse_delimited_list(outputs_str):
            field_name, field_schema = self._parse_output_field(field_spec)
            if field_name:
                properties[field_name] = field_schema

        return {"type": "object", "properties": properties}

src/label_studio_sdk/label_interface/interface.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ def __init__(self, config: str, tags_mapping=None, *args, **kwargs):
303303
self._objects = objects
304304
self._labels = labels
305305
self._tree = tree
306+
306307

307308
def create_regions(self, data: Dict[str, Union[str, Dict, List[str], List[Dict]]]) -> List[Region]:
308309
"""

0 commit comments

Comments
 (0)