@@ -73,6 +73,8 @@ def _build_study(article: ArticleContent) -> dict[str, Any]:
7373 "table_id" : metadata .identifier ,
7474 "raw_table_xml" : metadata .raw_xml ,
7575 }
76+ if metadata .reference_sentences :
77+ analysis_metadata ["reference_sentences" ] = metadata .reference_sentences
7678 points = [
7779 {
7880 "coordinates" : triplet ,
@@ -111,9 +113,16 @@ def _heuristic_space(header_text: str, meta_text: str) -> str | None:
111113
112114def _metadata_text (metadata : TableMetadata ) -> str :
113115 parts : list [str ] = []
114- for value in (metadata .caption , metadata .label , metadata .legend , metadata .foot ):
116+ for value in (
117+ metadata .caption ,
118+ metadata .label ,
119+ metadata .legend ,
120+ metadata .foot ,
121+ ):
115122 if value :
116123 parts .append (value )
124+ if metadata .reference_sentences :
125+ parts .extend (metadata .reference_sentences )
117126 raw_xml = metadata .raw_xml
118127 if raw_xml :
119128 try :
@@ -160,6 +169,36 @@ def _article_text(payload: bytes) -> str:
160169 return " " .join (root .xpath (".//text()" ))
161170
162171
172+ def _reference_sentences (
173+ root : etree ._Element , table_id : str | None
174+ ) -> list [str ]:
175+ if not table_id :
176+ return []
177+ xpath = (
178+ './/*[local-name()="cross-ref" or local-name()="cross-refs"]'
179+ '[contains(concat(" ", normalize-space(@refid), " "), '
180+ 'concat(" ", $table_id, " "))]'
181+ )
182+ ref_nodes = root .xpath (xpath , table_id = table_id )
183+ sentences : list [str ] = []
184+ seen : set [int ] = set ()
185+ for node in ref_nodes :
186+ parents = node .xpath (
187+ 'ancestor::*[local-name()="para" or local-name()="simple-para"][1]'
188+ )
189+ if not parents :
190+ continue
191+ para = parents [0 ]
192+ marker = id (para )
193+ if marker in seen :
194+ continue
195+ seen .add (marker )
196+ text = " " .join (" " .join (para .itertext ()).split ())
197+ if text :
198+ sentences .append (text )
199+ return sentences
200+
201+
163202def _manual_extract_tables (payload : bytes ) -> list [Tuple [TableMetadata , pd .DataFrame ]]:
164203 parser = etree .XMLParser (remove_blank_text = True )
165204 try :
@@ -178,6 +217,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF
178217 './/*[local-name()="table-foot" or local-name()="table-wrap-foot"]' ,
179218 )
180219 identifier = table .get ("id" )
220+ references = _reference_sentences (root , identifier )
181221 df = _table_to_dataframe (table )
182222 if df is None or df .empty :
183223 continue
@@ -190,6 +230,7 @@ def _manual_extract_tables(payload: bytes) -> list[Tuple[TableMetadata, pd.DataF
190230 legend = legend ,
191231 foot = foot ,
192232 raw_xml = raw_xml ,
233+ reference_sentences = references ,
193234 )
194235 tables .append ((metadata , df ))
195236 return tables
0 commit comments