Skip to content

Commit c3d6f55

Browse files
Add XMLSupToXrefPipe to convert numeric sup elements to xref
- Implements the transformation of <sup> elements with numeric content into <xref> when they represent bibliographic references (bibr) or footnotes (fn).
- Analyzes sup elements to identify numeric references.
- Extracts numeric labels from ref and fn elements for comparison.
- Includes a helper method to extract numeric labels from mixed-citation.
- Prevents duplication by checking for existing xref elements.
- Prioritizes bibr over fn when ambiguous; keeps the sup when uncertain.
- Adds the pipe to the XML processing pipeline.
1 parent ee79eaf commit c3d6f55

1 file changed

Lines changed: 98 additions & 0 deletions

File tree

scielo_classic_website/spsxml/sps_xml_pipes.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def _process(document):
8686
XMLStylePipe(),
8787
XMLArticleMetaCountsPipe(),
8888
XMLNormalizeSpacePipe(),
89+
XMLSupToXrefPipe(),
8990
XMLDeleteRepeatedElementWithId(),
9091
XMLDeleteRepeatedTranslations(),
9192
XMLFontFaceSymbolPipe(),
@@ -645,3 +646,100 @@ def transform(self, data):
645646

646647
ET.strip_tags(xml, "REMOVEFONTFACESYMBOL")
647648
return data
649+
650+
651+
class XMLSupToXrefPipe(plumber.Pipe):
    """
    Wrap numeric ``<sup>`` elements in ``<xref>`` when they point to a
    bibliographic reference (``bibr``) or a footnote (``fn``).

    Rules applied:
    - only a ``<sup>`` whose whole text content is made of digits is a
      candidate;
    - a ``<sup>`` that contains an ``<xref>`` or is the direct child of one
      is considered already linked and is never touched (avoids duplicated
      or nested links);
    - candidate values are compared with the labels found for ``ref`` and
      ``fn`` elements; a conversion pass only runs when every candidate
      value has a matching label;
    - ``bibr`` is tried first; ``fn`` is tried only when no ``bibr`` link
      was created;
    - when in doubt, the ``<sup>`` is kept as it is.
    """

    def transform(self, data):
        """
        Link the numeric ``<sup>`` elements of the document, in place.

        Args:
            data: a ``(raw, xml)`` pair; only ``xml`` (the element tree being
                built) is read and modified.

        Returns:
            The same ``data`` object that was received.
        """
        _raw, xml = data

        sups = xml.xpath(".//sup")
        if not sups:
            return data

        sup_values = {
            text for text in map(self._candidate_text, sups) if text
        }
        if not sup_values:
            return data

        # bibr has priority: fn is attempted only if nothing was linked as bibr.
        attempts = (
            ("ref", "mixed-citation", "bibr"),
            ("fn", None, "fn"),
        )
        for from_tag, subtag, ref_type in attempts:
            ids, numeric_labels = self.get_ids_and_labels(xml, from_tag, subtag)
            if self._convert_sup_to_xref(
                sups, sup_values, ids, numeric_labels, ref_type
            ):
                break

        return data

    def _candidate_text(self, sup):
        """
        Return the digit-only text of ``sup`` if it may be converted.

        Returns:
            str or None: the stripped text when it is made only of digits and
            the element is not already linked to an ``<xref>``; ``None``
            otherwise.
        """
        if sup.find("xref") is not None:
            return None
        parent = sup.getparent()
        if parent is not None and parent.tag == "xref":
            return None
        text = "".join(sup.itertext()).strip()
        return text if text.isdigit() else None

    def get_ids_and_labels(self, xml, from_tag, subtag=None):
        """
        Collect the labels of every ``from_tag`` element and map them to ids.

        The label is the (stripped) text of the ``label`` child; when it is
        missing or empty, the leading digits of the text of ``subtag`` (or of
        the element itself when ``subtag`` is not given) are used instead.

        Args:
            xml: tree to be searched.
            from_tag: tag of the elements that can be cross-referenced.
            subtag: optional child tag used as fallback source of the label.

        Returns:
            tuple: ``(elem_ids, numeric_labels)`` where ``elem_ids`` maps each
            label to the ``id`` attribute of its element (``None`` when the
            attribute is absent; the last element wins for repeated labels)
            and ``numeric_labels`` is the set of labels found.
        """
        elem_ids = {}
        numeric_labels = set()
        for elem in xml.xpath(f".//{from_tag}"):
            node = elem.find(subtag) if subtag else elem
            # Surrounding whitespace would prevent the match with sup text.
            label = (elem.findtext("label") or "").strip()
            if not label:
                label = self._extract_numeric_label_from_node(node)
            if label:
                numeric_labels.add(label)
                elem_ids[label] = elem.get("id")
        return elem_ids, numeric_labels

    def _convert_sup_to_xref(self, sups, sup_values, ids, numeric_labels, ref_type):
        """
        Replace each candidate ``<sup>`` by ``<xref><sup/></xref>``.

        Nothing is converted unless every value in ``sup_values`` is present
        in ``numeric_labels``. Elements that are already linked, are not
        numeric, or whose label has no id are skipped.

        Args:
            sups: ``<sup>`` elements found in the document.
            sup_values: set of digit-only texts of the candidate elements.
            ids: mapping label -> id of the target element.
            numeric_labels: set of labels available as targets.
            ref_type: value for the ``ref-type`` attribute of the ``<xref>``.

        Returns:
            int: number of ``<sup>`` elements that were wrapped.
        """
        if not sup_values.issubset(numeric_labels):
            return 0

        total = 0
        for sup in sups:
            # Apply the same filter used to build sup_values, so already
            # linked or non-numeric sup elements are never wrapped.
            sup_text = self._candidate_text(sup)
            if sup_text not in sup_values:
                continue
            rid = ids.get(sup_text)
            if not rid:
                continue
            parent = sup.getparent()
            if parent is None:
                continue

            xref = ET.Element("xref")
            xref.set("ref-type", ref_type)
            xref.set("rid", rid)

            inner = deepcopy(sup)
            # NOTE(review): assumes ET is lxml.etree, where deepcopy keeps the
            # tail text. The text that follows the sup belongs to the parent,
            # not to the link, so it is moved to the xref tail.
            inner.tail = None
            xref.append(inner)
            xref.tail = sup.tail

            parent.replace(sup, xref)
            total += 1
        return total

    def _extract_numeric_label_from_node(self, node):
        """
        Extract the leading run of digits from the text of ``node``.

        Args:
            node: element whose text content is inspected (for instance a
                ``mixed-citation``); may be ``None``.

        Returns:
            str or None: the leading digits, or ``None`` when ``node`` is
            ``None`` or its text does not start with a digit.
        """
        if node is None:
            return None
        text = "".join(node.itertext()).strip()
        if not text or not text[0].isdigit():
            return None

        # Index of the first non-digit character (or end of the text).
        end = next(
            (pos for pos, char in enumerate(text) if not char.isdigit()),
            len(text),
        )
        return text[:end]

0 commit comments

Comments
 (0)