Skip to content

Commit d104670

Browse files
Rossi-Luciano and claude committed
Corrige bugs em labeling_utils e function_docx
labeling_utils.py:
- Regex de detecção da seção de referências mais precisa (refer[eê]nci|references?)
- resp_json inicializado antes do bloco condicional (evita UnboundLocalError)
- Escapa '<' literal em append_fragment para não quebrar o parser XML
- Proteção contra None em proccess_special_content quando search_special_id retorna None

function_docx.py:
- is_numPr inicializado antes do loop (tabelas após listas eram descartadas silenciosamente por flag herdada da iteração anterior)
- Parágrafos adicionados a content independentemente de tabelas adjacentes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 5d1c282 commit d104670

2 files changed

Lines changed: 29 additions & 19 deletions

File tree

markup_doc/labeling_utils.py

Lines changed: 25 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -773,6 +773,17 @@ def create_labeled_object2(i, item, state, sections):
773773
obj["type"] = "paragraph"
774774
obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
775775

776+
if state.get("body") and re.search(
777+
r"^(refer[eê]nci|references?)\s*$", item.get("text").strip().lower()
778+
):
779+
state["label"] = "<sec>"
780+
state["body"] = False
781+
state["back"] = True
782+
result = {"label": "<sec>", "body": False, "back": True}
783+
obj["type"] = "paragraph"
784+
obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
785+
786+
776787
if not result:
777788
result = {"label": "<p>", "body": state["body"], "back": state["back"]}
778789
state["label"] = result.get("label")
@@ -874,12 +885,12 @@ def get_data_first_block(text, metadata, user_id):
874885
"Content-Type": "application/json",
875886
}
876887

888+
resp_json = {}
877889
response = requests.post(url, json=payload, headers=headers)
878890

879891
if response.status_code == 200:
880892
response_json = response.json()
881893
message_str = response_json["message"]
882-
883894
resp_json = json.loads(message_str)
884895

885896
return resp_json
@@ -1279,6 +1290,7 @@ def append_fragment(node_dest, val):
12791290

12801291
clean = escape_angle_brackets_outside_tags(clean)
12811292
clean = remove_unpaired_tags(clean)
1293+
clean = re.sub(r'<(?![/a-zA-Z_])', '&lt;', clean)
12821294

12831295
if clean == "":
12841296
parent = node_dest.getparent()
@@ -1351,23 +1363,17 @@ def proccess_special_content(text, data_body):
13511363
res = []
13521364
dict_type = {"f": "fig", "t": "table", "e": "disp-formula"}
13531365

1354-
try:
1355-
for match in re.finditer(
1356-
pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE
1357-
):
1358-
label = match.group(0)
1359-
1360-
id = search_special_id(data_body, label)
1361-
1362-
res.append(
1363-
{
1364-
"label": label,
1365-
"id": id,
1366-
"reftype": dict_type.get(id[0].lower(), "other"),
1367-
}
1368-
)
1369-
except Exception as exc:
1370-
print(f"ERROR proccess_special_content: {exc}")
1371-
pass
1366+
for match in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE):
1367+
label = match.group(0)
1368+
id = search_special_id(data_body, label)
1369+
if id is None:
1370+
continue
1371+
res.append(
1372+
{
1373+
"label": label,
1374+
"id": id,
1375+
"reftype": dict_type.get(id[0].lower(), "other"),
1376+
}
1377+
)
13721378

13731379
return res

markuplib/function_docx.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ def extrae_Tabla(element, rels_map, namespaces):
339339
is_numPr = False
340340
if isinstance(element, CT_P):
341341
obj = {}
342+
paragraph = element
343+
text_paragraph = []
344+
_ns_w = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
345+
is_numPr = paragraph.find('.//w:numPr', namespaces=_ns_w) is not None
342346

343347
namespaces = {
344348
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",

0 commit comments

Comments
 (0)