Description
Feature Description
For context, we use Azure OpenAI models. Azure wraps every LLM call in a security/safety/guardrails policy that cannot be disabled, and this policy sometimes produces false positives. In my experience, I hit a false positive roughly once every 15,000 nodes ingested through my TitleExtractor. When that happens, the LLM call fails and an exception is thrown, which kills the entire ingestion; I can't wrap the ingestion pipeline in a try/except.
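To illustrate the failure mode (a sketch, not a verbatim reproduction: the exact exception type depends on your SDK version, and `documents` is assumed to be already loaded; with the openai v1 SDK the content filter typically surfaces as a 400 BadRequestError):

    from llama_index.core.extractors import TitleExtractor
    from llama_index.core.ingestion import IngestionPipeline

    pipeline = IngestionPipeline(transformations=[TitleExtractor()])

    # One content-filter false positive anywhere in the batch raises out of the
    # extractor (e.g. openai.BadRequestError with code "content_filter") and
    # aborts the whole run; there is no per-node hook where a try/except fits.
    nodes = pipeline.run(documents=documents)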
My proposed solution is a SafeTitleExtractor, which I (with liberal help from GPT-5 Thinking hard) have implemented below.
import asyncio
from typing import Any, Dict, List, Optional, Sequence, cast

from llama_index.core import Settings
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.extractors import TitleExtractor
from llama_index.core.llms import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.schema import BaseNode, TextNode

DEFAULT_TITLE_NODE_TEMPLATE = """\
Context: {context_str}. Give a title that summarizes all of \
the unique entities, titles or themes found in the context. Title: """

DEFAULT_TITLE_COMBINE_TEMPLATE = """\
{context_str}. Based on the above candidate titles and content, \
what is the comprehensive title for this document? Title: """


class SafeTitleExtractor(TitleExtractor):
    """A resilient TitleExtractor that tolerates LLM failures and returns best-effort titles."""

    # Extractors are pydantic models, so private attributes must be declared.
    _retries: int = PrivateAttr(default=1)
    _backoff: float = PrivateAttr(default=0.5)

    def __init__(
        self,
        llm: Optional[LLM] = None,
        llm_predictor: Optional[LLM] = None,
        nodes: int = 5,
        node_template: str = DEFAULT_TITLE_NODE_TEMPLATE,
        combine_template: str = DEFAULT_TITLE_COMBINE_TEMPLATE,
        num_workers: int = 4,
        retries: int = 1,  # small retry budget before falling back
        backoff_secs: float = 0.5,  # base delay for exponential backoff between retries
        **kwargs: Any,
    ) -> None:
        super().__init__(
            llm=llm or llm_predictor or Settings.llm,
            nodes=nodes,
            node_template=node_template,
            combine_template=combine_template,
            num_workers=num_workers,
            **kwargs,
        )
        self._retries = max(0, retries)
        self._backoff = max(0.0, backoff_secs)

    async def _retry(self, func, *args, **kwargs):
        """Tiny async retry helper with exponential backoff."""
        attempt = 0
        while True:
            try:
                return await func(*args, **kwargs)
            except Exception:
                if attempt >= self._retries:
                    raise
                await asyncio.sleep(self._backoff * (2**attempt))
                attempt += 1

    async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
        # Same logic as the base class, but guard missing keys to avoid KeyError.
        nodes_by_doc_id = self.separate_nodes_by_ref_id(nodes)
        titles_by_doc_id = await self.extract_titles(nodes_by_doc_id)
        return [
            {
                "document_title": titles_by_doc_id.get(node.ref_doc_id)
                or self._fallback_from_node(node)
            }
            for node in nodes
        ]

    async def get_title_candidates(self, nodes: List[BaseNode]) -> List[str]:
        # Wrap per-node LLM calls; fall back to heuristics for each node independently.
        candidates: List[str] = []
        for node in nodes:
            text = ""
            if isinstance(node, TextNode):
                text = (node.text or "").strip()
            try:
                # Retry the LLM call a bit; if it still fails, drop to the fallback.
                title = await self._retry(
                    self.llm.apredict,
                    PromptTemplate(template=self.node_template),
                    context_str=text,
                )
                title = (title or "").strip()
                if title:
                    candidates.append(title)
                    continue
            except Exception:
                pass  # fall through to fallback
            # Fallback: first non-empty line / filename / generic placeholder.
            fb = self._fallback_from_text_or_meta(text, node)
            if fb:
                candidates.append(fb)
        # If ALL nodes failed and produced nothing, ensure at least one fallback.
        if not candidates and nodes:
            candidates.append(self._fallback_from_node(nodes[0]) or "Untitled")
        return candidates

    async def extract_titles(self, nodes_by_doc_id: Dict) -> Dict:
        final: Dict[Optional[str], str] = {}

        async def get_titles_by_doc(nodes: List[BaseNode], key: Optional[str]) -> None:
            # Initialize up front so the except block never hits an unbound local.
            title_candidates: List[str] = []
            try:
                title_candidates = await self.get_title_candidates(nodes)
                combined = ", ".join(title_candidates)
                title = await self._retry(
                    self.llm.apredict,
                    PromptTemplate(template=self.combine_template),
                    context_str=combined,
                )
                title = (title or "").strip()
                if not title:
                    raise ValueError("Empty title from combiner")
            except Exception:
                # Combine fallback: best candidate or node-derived fallback.
                title = (
                    (title_candidates[0] if title_candidates else None)
                    or (self._fallback_from_node(nodes[0]) if nodes else None)
                    or "Untitled"
                )
            final[key] = title

        # Run sequentially to simplify error handling; flip to tasks if you prefer concurrency.
        for key, group in nodes_by_doc_id.items():
            await get_titles_by_doc(group, key)
        return final

    # ------------------------
    # Fallback helpers
    # ------------------------
    def _fallback_from_text_or_meta(self, text: str, node: BaseNode) -> str:
        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
        if first_line:
            return first_line[:120].rstrip(" .,:;")
        # Try a few common metadata keys that often carry a document name.
        meta = getattr(node, "metadata", {}) or {}
        for k in ("file_name", "filename", "title", "name"):
            val = (meta.get(k) or "").strip()
            if val:
                return val[:120]
        return "Untitled"

    def _fallback_from_node(self, node: BaseNode) -> str:
        text = cast(TextNode, node).text if isinstance(node, TextNode) else ""
        return self._fallback_from_text_or_meta(text or "", node)
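Usage is then a drop-in swap for the stock extractor; a minimal sketch, assuming `documents` is already loaded:

    from llama_index.core.ingestion import IngestionPipeline
    from llama_index.core.node_parser import SentenceSplitter

    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(),
            SafeTitleExtractor(nodes=5, retries=2, backoff_secs=0.5),
        ]
    )
    # A content-filter false positive now degrades to a heuristic title
    # instead of raising out of pipeline.run().
    nodes = pipeline.run(documents=documents)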
What would be better is for you to make the BaseExtractor itself robust in this way, so that all other extractors inherit the behavior.
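As a rough sketch of what that could look like (purely hypothetical: the `raise_on_error` flag and the `_aextract` hook are not existing llama_index API, just one possible shape for the feature):

    from typing import Any, Dict, List, Sequence

    from llama_index.core.extractors import BaseExtractor
    from llama_index.core.schema import BaseNode


    class RobustExtractorSketch(BaseExtractor):
        # Hypothetical opt-in flag; defaults to today's fail-fast behavior.
        raise_on_error: bool = True

        async def _aextract(self, nodes: Sequence[BaseNode]) -> List[Dict[str, Any]]:
            # Hypothetical hook where concrete extractors do the real LLM work.
            raise NotImplementedError

        async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict[str, Any]]:
            try:
                return await self._aextract(nodes)
            except Exception:
                if self.raise_on_error:
                    raise
                # Best effort: emit empty metadata rather than killing ingestion.
                return [{} for _ in nodes]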
Reason
No response
Value of Feature
Noted above.