Skip to content

Commit 17562f9

Browse files
authored
Id not set in checkpoint2 (#4468)
* unconditionally set completion * drive connector improvements * fixing broader typing issue * fix tests, CW comments * actual test fix
1 parent 9c73099 commit 17562f9

File tree

13 files changed

+60
-51
lines changed

13 files changed

+60
-51
lines changed

backend/onyx/agents/agent_search/deep_search/initial/generate_initial_answer/nodes/generate_initial_answer.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -117,16 +117,16 @@ def generate_initial_answer(
117117

118118
consolidated_context_docs = structured_subquestion_docs.cited_documents
119119
counter = 0
120-
for original_doc_number, original_doc in enumerate(
121-
orig_question_retrieval_documents
122-
):
123-
if original_doc_number not in structured_subquestion_docs.cited_documents:
124-
if (
125-
counter <= AGENT_MIN_ORIG_QUESTION_DOCS
126-
or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
127-
):
128-
consolidated_context_docs.append(original_doc)
129-
counter += 1
120+
for original_doc in orig_question_retrieval_documents:
121+
if original_doc in structured_subquestion_docs.cited_documents:
122+
continue
123+
124+
if (
125+
counter <= AGENT_MIN_ORIG_QUESTION_DOCS
126+
or len(consolidated_context_docs) < AGENT_MAX_ANSWER_CONTEXT_DOCS
127+
):
128+
consolidated_context_docs.append(original_doc)
129+
counter += 1
130130

131131
# sort docs by their scores - though the scores refer to different questions
132132
relevant_docs = dedup_inference_section_list(consolidated_context_docs)

backend/onyx/agents/agent_search/deep_search/main/nodes/generate_validate_refined_answer.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -146,10 +146,8 @@ def generate_validate_refined_answer(
146146
consolidated_context_docs = structured_subquestion_docs.cited_documents
147147

148148
counter = 0
149-
for original_doc_number, original_doc in enumerate(
150-
original_question_verified_documents
151-
):
152-
if original_doc_number not in structured_subquestion_docs.cited_documents:
149+
for original_doc in original_question_verified_documents:
150+
if original_doc not in structured_subquestion_docs.cited_documents:
153151
if (
154152
counter <= AGENT_MIN_ORIG_QUESTION_DOCS
155153
or len(consolidated_context_docs)

backend/onyx/agents/agent_search/orchestration/nodes/choose_tool.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def choose_tool(
5757
not agent_config.behavior.use_agentic_search
5858
and agent_config.tooling.search_tool is not None
5959
and (
60-
not force_use_tool.force_use or force_use_tool.tool_name == SearchTool.name
60+
not force_use_tool.force_use or force_use_tool.tool_name == SearchTool._NAME
6161
)
6262
):
6363
override_kwargs = SearchToolOverrideKwargs()

backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py

-3
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,6 @@ def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> b
9898
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
9999
return False
100100

101-
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
102-
return False
103-
104101
# If the last sync is None, it has never been run so we run the sync
105102
last_perm_sync = cc_pair.last_time_perm_sync
106103
if last_perm_sync is None:

backend/onyx/connectors/google_drive/connector.py

+21-6
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,7 @@ def _impersonate_user_for_retrieval(
420420
is_slim: bool,
421421
checkpoint: GoogleDriveCheckpoint,
422422
concurrent_drive_itr: Callable[[str], Iterator[str]],
423-
filtered_folder_ids: set[str],
423+
sorted_filtered_folder_ids: list[str],
424424
start: SecondsSinceUnixEpoch | None = None,
425425
end: SecondsSinceUnixEpoch | None = None,
426426
) -> Iterator[RetrievedDriveFile]:
@@ -509,6 +509,7 @@ def _yield_from_drive(
509509
yield from _yield_from_drive(drive_id, start)
510510
curr_stage.stage = DriveRetrievalStage.FOLDER_FILES
511511
resuming = False # we are starting the next stage for the first time
512+
512513
if curr_stage.stage == DriveRetrievalStage.FOLDER_FILES:
513514

514515
def _yield_from_folder_crawl(
@@ -526,16 +527,28 @@ def _yield_from_folder_crawl(
526527
)
527528

528529
# resume from a checkpoint
530+
last_processed_folder = None
529531
if resuming:
530532
folder_id = curr_stage.completed_until_parent_id
531533
assert folder_id is not None, "folder id not set in checkpoint"
532534
resume_start = curr_stage.completed_until
533535
yield from _yield_from_folder_crawl(folder_id, resume_start)
536+
last_processed_folder = folder_id
537+
538+
skipping_seen_folders = last_processed_folder is not None
539+
for folder_id in sorted_filtered_folder_ids:
540+
if skipping_seen_folders:
541+
skipping_seen_folders = folder_id != last_processed_folder
542+
continue
534543

535-
remaining_folders = filtered_folder_ids - self._retrieved_ids
536-
for folder_id in remaining_folders:
544+
if folder_id in self._retrieved_ids:
545+
continue
546+
547+
curr_stage.completed_until = 0
548+
curr_stage.completed_until_parent_id = folder_id
537549
logger.info(f"Getting files in folder '{folder_id}' as '{user_email}'")
538550
yield from _yield_from_folder_crawl(folder_id, start)
551+
539552
curr_stage.stage = DriveRetrievalStage.DONE
540553

541554
def _manage_service_account_retrieval(
@@ -584,11 +597,13 @@ def _manage_service_account_retrieval(
584597
drive_ids_to_retrieve, checkpoint
585598
)
586599

600+
sorted_filtered_folder_ids = sorted(folder_ids_to_retrieve)
601+
587602
# only process emails that we haven't already completed retrieval for
588603
non_completed_org_emails = [
589604
user_email
590-
for user_email, stage in checkpoint.completion_map.items()
591-
if stage != DriveRetrievalStage.DONE
605+
for user_email, stage_completion in checkpoint.completion_map.items()
606+
if stage_completion.stage != DriveRetrievalStage.DONE
592607
]
593608

594609
# don't process too many emails before returning a checkpoint. This is
@@ -609,7 +624,7 @@ def _manage_service_account_retrieval(
609624
is_slim,
610625
checkpoint,
611626
drive_id_iterator,
612-
folder_ids_to_retrieve,
627+
sorted_filtered_folder_ids,
613628
start,
614629
end,
615630
)

backend/onyx/connectors/google_drive/file_retrieval.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,17 @@ def crawl_folders_for_files(
122122
start=start,
123123
end=end,
124124
):
125-
found_files = True
126125
logger.info(f"Found file: {file['name']}, user email: {user_email}")
126+
found_files = True
127127
yield RetrievedDriveFile(
128128
drive_file=file,
129129
user_email=user_email,
130130
parent_id=parent_id,
131131
completion_stage=DriveRetrievalStage.FOLDER_FILES,
132132
)
133+
# Only mark a folder as done if it was fully traversed without errors
134+
if found_files:
135+
update_traversed_ids_func(parent_id)
133136
except Exception as e:
134137
logger.error(f"Error getting files in parent {parent_id}: {e}")
135138
yield RetrievedDriveFile(
@@ -139,8 +142,6 @@ def crawl_folders_for_files(
139142
completion_stage=DriveRetrievalStage.FOLDER_FILES,
140143
error=e,
141144
)
142-
if found_files:
143-
update_traversed_ids_func(parent_id)
144145
else:
145146
logger.info(f"Skipping subfolder files since already traversed: {parent_id}")
146147

backend/onyx/context/search/postprocessing/postprocessing.py

+4-16
Original file line numberDiff line numberDiff line change
@@ -374,14 +374,6 @@ def filter_sections(
374374
if query.evaluation_type == LLMEvaluationType.SKIP:
375375
return []
376376

377-
# Additional safeguard: Log a warning if this function is ever called with SKIP evaluation type
378-
# This should never happen if our fast paths are working correctly
379-
if query.evaluation_type == LLMEvaluationType.SKIP:
380-
logger.warning(
381-
"WARNING: filter_sections called with SKIP evaluation_type. This should never happen!"
382-
)
383-
return []
384-
385377
sections_to_filter = sections_to_filter[: query.max_llm_filter_sections]
386378

387379
contents = [
@@ -461,12 +453,10 @@ def search_postprocessing(
461453

462454
llm_filter_task_id = None
463455
# Only add LLM filtering if not in SKIP mode and if LLM doc relevance is not disabled
464-
if (
465-
search_query.evaluation_type not in [LLMEvaluationType.SKIP]
466-
and not DISABLE_LLM_DOC_RELEVANCE
467-
and search_query.evaluation_type
468-
in [LLMEvaluationType.BASIC, LLMEvaluationType.UNSPECIFIED]
469-
):
456+
if not DISABLE_LLM_DOC_RELEVANCE and search_query.evaluation_type in [
457+
LLMEvaluationType.BASIC,
458+
LLMEvaluationType.UNSPECIFIED,
459+
]:
470460
logger.info("Adding LLM filtering task for document relevance evaluation")
471461
post_processing_tasks.append(
472462
FunctionCall(
@@ -479,8 +469,6 @@ def search_postprocessing(
479469
)
480470
)
481471
llm_filter_task_id = post_processing_tasks[-1].result_id
482-
elif search_query.evaluation_type == LLMEvaluationType.SKIP:
483-
logger.info("Fast path: Skipping LLM filtering task for ordering-only mode")
484472
elif DISABLE_LLM_DOC_RELEVANCE:
485473
logger.info("Skipping LLM filtering task because LLM doc relevance is disabled")
486474

backend/onyx/server/features/tool/api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,6 @@ def list_tools(
152152
return [
153153
ToolSnapshot.from_model(tool)
154154
for tool in tools
155-
if tool.in_code_tool_id != ImageGenerationTool.name
155+
if tool.in_code_tool_id != ImageGenerationTool._NAME
156156
or is_image_generation_available(db_session=db_session)
157157
]

backend/pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ mypy_path = "$MYPY_CONFIG_FILE_DIR"
44
explicit_package_bases = true
55
disallow_untyped_defs = true
66
enable_error_code = ["possibly-undefined"]
7+
strict_equality = true
78

89
[[tool.mypy.overrides]]
910
module = "alembic.versions.*"

backend/scripts/sources_selection_analysis.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -186,15 +186,17 @@ def _identify_diff(self, content_key: str) -> list[dict]:
186186
)
187187
return changes
188188

189-
def check_config_changes(self, previous_doc_rank: int, new_doc_rank: int) -> None:
189+
def check_config_changes(
190+
self, previous_doc_rank: int | str, new_doc_rank: int
191+
) -> None:
190192
"""Try to identify possible reasons why a change has been detected by
191193
checking the latest document update date or the boost value.
192194
193195
Args:
194196
previous_doc_rank (int): The document rank for the previous analysis
195197
new_doc_rank (int): The document rank for the new analysis
196198
"""
197-
if new_doc_rank == "not_ranked":
199+
if isinstance(new_doc_rank, str) and new_doc_rank == "not_ranked":
198200
color_output(
199201
(
200202
"NOTE: The document is missing in the 'current' analysis file. "

backend/tests/daily/connectors/salesforce/test_salesforce_connector.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import time
44
from pathlib import Path
55
from typing import Any
6+
from typing import cast
67

78
import pytest
89

@@ -24,7 +25,7 @@ def extract_key_value_pairs_to_set(
2425

2526
def load_test_data(
2627
file_name: str = "test_salesforce_data.json",
27-
) -> dict[str, list[str] | dict[str, Any]]:
28+
) -> dict[str, str | list[str] | dict[str, Any] | list[dict[str, Any]]]:
2829
current_dir = Path(__file__).parent
2930
with open(current_dir / file_name, "r") as f:
3031
return json.load(f)
@@ -90,7 +91,7 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
9091
if not isinstance(expected_text, list):
9192
raise ValueError("Expected text is not a list")
9293

93-
unparsed_expected_key_value_pairs: list[str] = expected_text
94+
unparsed_expected_key_value_pairs: list[str] = cast(list[str], expected_text)
9495
received_key_value_pairs = extract_key_value_pairs_to_set(received_text)
9596
expected_key_value_pairs = extract_key_value_pairs_to_set(
9697
unparsed_expected_key_value_pairs
@@ -110,7 +111,12 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
110111
assert primary_owner.first_name == expected_primary_owner["first_name"]
111112
assert primary_owner.last_name == expected_primary_owner["last_name"]
112113

113-
assert target_test_doc.secondary_owners == test_data["secondary_owners"]
114+
secondary_owners = (
115+
[owner.model_dump() for owner in target_test_doc.secondary_owners]
116+
if target_test_doc.secondary_owners
117+
else None
118+
)
119+
assert secondary_owners == test_data["secondary_owners"]
114120
assert target_test_doc.title == test_data["title"]
115121

116122

backend/tests/integration/common_utils/managers/persona.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,8 @@ def verify(
243243
and set(user.email for user in fetched_persona.users)
244244
== set(persona.users)
245245
and set(fetched_persona.groups) == set(persona.groups)
246-
and set(fetched_persona.labels) == set(persona.label_ids)
246+
and {label.id for label in fetched_persona.labels}
247+
== set(persona.label_ids)
247248
)
248249
return False
249250

backend/tests/unit/onyx/chat/stream_processing/test_quotes_processing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def test_fuzzy_match_quotes_to_docs() -> None:
192192
results = match_quotes_to_docs(
193193
test_quotes, [test_chunk_0, test_chunk_1], fuzzy_search=True
194194
)
195-
assert results == {
195+
assert results.model_dump() == {
196196
"a doc with some": {"document": "test doc 0", "link": "doc 0 base"},
197197
"a doc with some LINK": {
198198
"document": "test doc 0",

0 commit comments

Comments
 (0)