Corrige bugs em labeling_utils e function_docx

Rossi-Luciano · claude · Rossi-Luciano · commit d104670d3ebc · 2026-06-01T09:06:06.000-03:00
labeling_utils.py:
- Regex de detecção da seção de referências mais precisa (refer[eê]nci|references?)
- resp_json inicializado antes do bloco condicional (evita UnboundLocalError quando a resposta não é 200)
- Escapa '<' literal em append_fragment para não quebrar o parser XML
- Guarda None em proccess_special_content para search_special_id retornando None

function_docx.py:
- is_numPr inicializado antes do loop (tabelas após listas eram descartadas
  silenciosamente por flag herdada da iteração anterior)
- Parágrafos adicionados a content independente de tabelas adjacentes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
@@ -773,6 +773,17 @@ def create_labeled_object2(i, item, state, sections):
         obj["type"] = "paragraph"
         obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
 
+    if state.get("body") and re.search(
+        r"^(refer[eê]nci|references?)\s*$", item.get("text").strip().lower()
+    ):
+        state["label"] = "<sec>"
+        state["body"] = False
+        state["back"] = True
+        result = {"label": "<sec>", "body": False, "back": True}
+        obj["type"] = "paragraph"
+        obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
+
+
     if not result:
         result = {"label": "<p>", "body": state["body"], "back": state["back"]}
         state["label"] = result.get("label")
@@ -874,12 +885,12 @@ def get_data_first_block(text, metadata, user_id):
         "Content-Type": "application/json",
     }
 
+    resp_json = {}
     response = requests.post(url, json=payload, headers=headers)
 
     if response.status_code == 200:
         response_json = response.json()
         message_str = response_json["message"]
-
         resp_json = json.loads(message_str)
 
     return resp_json
@@ -1279,6 +1290,7 @@ def append_fragment(node_dest, val):
 
     clean = escape_angle_brackets_outside_tags(clean)
     clean = remove_unpaired_tags(clean)
+    clean = re.sub(r'<(?![/a-zA-Z_])', '&lt;', clean)
 
     if clean == "":
         parent = node_dest.getparent()
@@ -1351,23 +1363,17 @@ def proccess_special_content(text, data_body):
     res = []
     dict_type = {"f": "fig", "t": "table", "e": "disp-formula"}
 
-    try:
-        for match in re.finditer(
-            pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE
-        ):
-            label = match.group(0)
-
-            id = search_special_id(data_body, label)
-
-            res.append(
-                {
-                    "label": label,
-                    "id": id,
-                    "reftype": dict_type.get(id[0].lower(), "other"),
-                }
-            )
-    except Exception as exc:
-        print(f"ERROR proccess_special_content: {exc}")
-        pass
+    for match in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE):
+        label = match.group(0)
+        id = search_special_id(data_body, label)
+        if id is None:
+            continue
+        res.append(
+            {
+                "label": label,
+                "id": id,
+                "reftype": dict_type.get(id[0].lower(), "other"),
+            }
+        )
 
     return res
diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py
@@ -339,6 +339,10 @@ def extrae_Tabla(element, rels_map, namespaces):
             is_numPr = False
             if isinstance(element, CT_P):
                 obj = {}
+                paragraph = element
+                text_paragraph = []
+                _ns_w = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
+                is_numPr = paragraph.find('.//w:numPr', namespaces=_ns_w) is not None
 
                 namespaces = {
                     "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",