@@ -39,6 +39,11 @@ def find_element_by_xml_id(root, uuid):
3939
4040
4141def process_row (r ):
42+ """
43+ Check gold-standard speaker annotations, including <note type="speaker"> introductions.
44+ - If UUID points to <u>: check its @who.
45+ - If UUID points to <note type="speaker">: find the first <u> after it and check the whole <u> chain.
46+ """
4247 xml_path = r ['protocol_id' ]
4348 uuid = r ['uuid' ]
4449 folder_type = r ['folder_type' ]
@@ -60,15 +65,43 @@ def process_row(r):
6065 return result
6166
6267 if folder_type == 'is-speaker' :
63- if el .tag .endswith ('u' ) and el .get ('who' ) != expected_person :
64- result ['fail_is_speaker' ] = [xml_path , uuid , expected_person , el .get ('who' )]
65- logger .error (f"Speaker drift: { uuid } expected { expected_person } , got { el .get ('who' )} " )
66- if el .tag .endswith ('note' ) and el .get ('type' ) != 'speaker' :
67- result ['fail_is_speaker' ] = [xml_path , uuid , 'type=speaker' , el .get ('type' )]
68- logger .error (f"Speaker note drift: { uuid } " )
68+ if el .tag .endswith ('u' ):
69+ who_attr = el .get ('who' )
70+ if not who_attr or who_attr != expected_person :
71+ result ['fail_is_speaker' ] = [xml_path , uuid , expected_person , who_attr ]
72+ logger .error (f"Speaker drift: { uuid } expected { expected_person } , got { who_attr } " )
73+
74+ elif el .tag .endswith ('note' ) and el .get ('type' ) == 'speaker' :
75+ first_u = None
76+ for sibling in el .itersiblings ():
77+ if sibling .tag .endswith ('u' ):
78+ first_u = sibling
79+ break
80+
81+ if first_u is None :
82+ result ['fail_is_speaker' ] = [xml_path , uuid , "no following <u>" , None ]
83+ logger .error (f"Speaker note { uuid } has no following <u>" )
84+ else :
85+ u = first_u
86+ while u is not None :
87+ u_who = u .get ('who' )
88+ if u_who != expected_person :
89+ result ['fail_is_speaker' ] = [xml_path , uuid , expected_person , u_who ]
90+ logger .error (f"Speaker chain drift: { uuid } expected { expected_person } , got { u_who } " )
91+ break
92+ next_id = u .get ('next' )
93+ if not next_id :
94+ break
95+ u = find_element_by_xml_id (root , next_id )
96+
97+ else :
98+ result ['fail_is_speaker' ] = [xml_path , uuid , "unexpected tag" , el .tag ]
99+ logger .error (f"Unexpected tag for is-speaker: { uuid } ({ el .tag } )" )
100+
69101 else :
70- if el .get ('who' ) or (el .tag .endswith ('note' ) and el .get ('type' ) == 'speaker' ):
71- actual = el .get ('who' ) if el .get ('who' ) else 'type=speaker'
102+ who_attr = el .get ('who' )
103+ if (who_attr is not None and who_attr .strip () != '' ) or (el .tag .endswith ('note' ) and el .get ('type' ) == 'speaker' ):
104+ actual = who_attr if who_attr else 'type=speaker'
72105 result ['fail_non_speaker' ] = [xml_path , uuid , actual ]
73106 logger .error (f"Non-speaker drift: { uuid } ({ actual } )" )
74107
0 commit comments