Skip to content

Commit e3c55cc

Browse files
Merge branch 'main' into feat/add-connector-gh-pages
2 parents 763c10d + f45798b commit e3c55cc

File tree

140 files changed

+3085
-1220
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

140 files changed

+3085
-1220
lines changed

backend/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Edition features outside of personal development or testing purposes. Please rea
88
[email protected] for more information. Please visit https://github.com/onyx-dot-app/onyx"
99

1010
# Default ONYX_VERSION, typically overridden during builds by GitHub Actions.
11-
ARG ONYX_VERSION=0.8-dev
11+
ARG ONYX_VERSION=0.0.0-dev
1212
# DO_NOT_TRACK is used to disable telemetry for Unstructured
1313
ENV ONYX_VERSION=${ONYX_VERSION} \
1414
DANSWER_RUNNING_IN_DOCKER="true" \

backend/Dockerfile.model_server

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ You can find it at https://hub.docker.com/r/onyx/onyx-model-server. For more det
77
visit https://github.com/onyx-dot-app/onyx."
88

99
# Default ONYX_VERSION, typically overridden during builds by GitHub Actions.
10-
ARG ONYX_VERSION=0.8-dev
10+
ARG ONYX_VERSION=0.0.0-dev
1111
ENV ONYX_VERSION=${ONYX_VERSION} \
1212
DANSWER_RUNNING_IN_DOCKER="true"
1313

@@ -31,7 +31,8 @@ RUN python -c "from transformers import AutoTokenizer; \
3131
AutoTokenizer.from_pretrained('distilbert-base-uncased'); \
3232
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
3333
from huggingface_hub import snapshot_download; \
34-
snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3'); \
34+
snapshot_download(repo_id='onyx-dot-app/hybrid-intent-token-classifier'); \
35+
snapshot_download(repo_id='onyx-dot-app/information-content-model'); \
3536
snapshot_download('nomic-ai/nomic-embed-text-v1'); \
3637
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
3738
from sentence_transformers import SentenceTransformer; \
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""add chunk stats table
2+
3+
Revision ID: 3781a5eb12cb
4+
Revises: df46c75b714e
5+
Create Date: 2025-03-10 10:02:30.586666
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
# revision identifiers, used by Alembic.
12+
revision = "3781a5eb12cb"
13+
down_revision = "df46c75b714e"
14+
branch_labels = None
15+
depends_on = None
16+
17+
18+
def upgrade() -> None:
19+
op.create_table(
20+
"chunk_stats",
21+
sa.Column("id", sa.String(), primary_key=True, index=True),
22+
sa.Column(
23+
"document_id",
24+
sa.String(),
25+
sa.ForeignKey("document.id"),
26+
nullable=False,
27+
index=True,
28+
),
29+
sa.Column("chunk_in_doc_id", sa.Integer(), nullable=False),
30+
sa.Column("information_content_boost", sa.Float(), nullable=True),
31+
sa.Column(
32+
"last_modified",
33+
sa.DateTime(timezone=True),
34+
nullable=False,
35+
index=True,
36+
server_default=sa.func.now(),
37+
),
38+
sa.Column("last_synced", sa.DateTime(timezone=True), nullable=True, index=True),
39+
sa.UniqueConstraint(
40+
"document_id", "chunk_in_doc_id", name="uq_chunk_stats_doc_chunk"
41+
),
42+
)
43+
44+
op.create_index(
45+
"ix_chunk_sync_status", "chunk_stats", ["last_modified", "last_synced"]
46+
)
47+
48+
49+
def downgrade() -> None:
50+
op.drop_index("ix_chunk_sync_status", table_name="chunk_stats")
51+
op.drop_table("chunk_stats")
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""add_default_vision_provider_to_llm_provider
2+
3+
Revision ID: df46c75b714e
4+
Revises: 3934b1bc7b62
5+
Create Date: 2025-03-11 16:20:19.038945
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "df46c75b714e"
14+
down_revision = "3934b1bc7b62"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
20+
op.add_column(
21+
"llm_provider",
22+
sa.Column(
23+
"is_default_vision_provider",
24+
sa.Boolean(),
25+
nullable=True,
26+
server_default=sa.false(),
27+
),
28+
)
29+
op.add_column(
30+
"llm_provider", sa.Column("default_vision_model", sa.String(), nullable=True)
31+
)
32+
33+
34+
def downgrade() -> None:
35+
op.drop_column("llm_provider", "default_vision_model")
36+
op.drop_column("llm_provider", "is_default_vision_provider")
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""add new available tenant table
2+
3+
Revision ID: 3b45e0018bf1
4+
Revises: ac842f85f932
5+
Create Date: 2025-03-06 09:55:18.229910
6+
7+
"""
8+
import sqlalchemy as sa
9+
10+
from alembic import op
11+
12+
13+
# revision identifiers, used by Alembic.
14+
revision = "3b45e0018bf1"
15+
down_revision = "ac842f85f932"
16+
branch_labels = None
17+
depends_on = None
18+
19+
20+
def upgrade() -> None:
21+
# Create new_available_tenant table
22+
op.create_table(
23+
"available_tenant",
24+
sa.Column("tenant_id", sa.String(), nullable=False),
25+
sa.Column("alembic_version", sa.String(), nullable=False),
26+
sa.Column("date_created", sa.DateTime(), nullable=False),
27+
sa.PrimaryKeyConstraint("tenant_id"),
28+
)
29+
30+
31+
def downgrade() -> None:
32+
# Drop the available_tenant table
33+
op.drop_table("available_tenant")

backend/ee/onyx/external_permissions/confluence/doc_sync.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Rules defined here:
33
https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.html
44
"""
5+
from collections.abc import Generator
56
from typing import Any
67

78
from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
@@ -263,13 +264,11 @@ def _fetch_all_page_restrictions(
263264
space_permissions_by_space_key: dict[str, ExternalAccess],
264265
is_cloud: bool,
265266
callback: IndexingHeartbeatInterface | None,
266-
) -> list[DocExternalAccess]:
267+
) -> Generator[DocExternalAccess, None, None]:
267268
"""
268269
For all pages, if a page has restrictions, then use those restrictions.
269270
Otherwise, use the space's restrictions.
270271
"""
271-
document_restrictions: list[DocExternalAccess] = []
272-
273272
for slim_doc in slim_docs:
274273
if callback:
275274
if callback.should_stop():
@@ -286,11 +285,9 @@ def _fetch_all_page_restrictions(
286285
confluence_client=confluence_client,
287286
perm_sync_data=slim_doc.perm_sync_data,
288287
):
289-
document_restrictions.append(
290-
DocExternalAccess(
291-
doc_id=slim_doc.id,
292-
external_access=restrictions,
293-
)
288+
yield DocExternalAccess(
289+
doc_id=slim_doc.id,
290+
external_access=restrictions,
294291
)
295292
# If there are restrictions, then we don't need to use the space's restrictions
296293
continue
@@ -324,11 +321,9 @@ def _fetch_all_page_restrictions(
324321
continue
325322

326323
# If there are no restrictions, then use the space's restrictions
327-
document_restrictions.append(
328-
DocExternalAccess(
329-
doc_id=slim_doc.id,
330-
external_access=space_permissions,
331-
)
324+
yield DocExternalAccess(
325+
doc_id=slim_doc.id,
326+
external_access=space_permissions,
332327
)
333328
if (
334329
not space_permissions.is_public
@@ -342,13 +337,12 @@ def _fetch_all_page_restrictions(
342337
)
343338

344339
logger.debug("Finished fetching all page restrictions for space")
345-
return document_restrictions
346340

347341

348342
def confluence_doc_sync(
349343
cc_pair: ConnectorCredentialPair,
350344
callback: IndexingHeartbeatInterface | None,
351-
) -> list[DocExternalAccess]:
345+
) -> Generator[DocExternalAccess, None, None]:
352346
"""
353347
Adds the external permissions to the documents in postgres
354348
if the document doesn't already exist in postgres, we create
@@ -387,7 +381,7 @@ def confluence_doc_sync(
387381
slim_docs.extend(doc_batch)
388382

389383
logger.debug("Fetching all page restrictions for space")
390-
return _fetch_all_page_restrictions(
384+
yield from _fetch_all_page_restrictions(
391385
confluence_client=confluence_connector.confluence_client,
392386
slim_docs=slim_docs,
393387
space_permissions_by_space_key=space_permissions_by_space_key,

backend/ee/onyx/external_permissions/gmail/doc_sync.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from collections.abc import Generator
12
from datetime import datetime
23
from datetime import timezone
34

@@ -34,7 +35,7 @@ def _get_slim_doc_generator(
3435
def gmail_doc_sync(
3536
cc_pair: ConnectorCredentialPair,
3637
callback: IndexingHeartbeatInterface | None,
37-
) -> list[DocExternalAccess]:
38+
) -> Generator[DocExternalAccess, None, None]:
3839
"""
3940
Adds the external permissions to the documents in postgres
4041
if the document doesn't already exist in postgres, we create
@@ -48,7 +49,6 @@ def gmail_doc_sync(
4849
cc_pair, gmail_connector, callback=callback
4950
)
5051

51-
document_external_access: list[DocExternalAccess] = []
5252
for slim_doc_batch in slim_doc_generator:
5353
for slim_doc in slim_doc_batch:
5454
if callback:
@@ -60,17 +60,14 @@ def gmail_doc_sync(
6060
if slim_doc.perm_sync_data is None:
6161
logger.warning(f"No permissions found for document {slim_doc.id}")
6262
continue
63+
6364
if user_email := slim_doc.perm_sync_data.get("user_email"):
6465
ext_access = ExternalAccess(
6566
external_user_emails=set([user_email]),
6667
external_user_group_ids=set(),
6768
is_public=False,
6869
)
69-
document_external_access.append(
70-
DocExternalAccess(
71-
doc_id=slim_doc.id,
72-
external_access=ext_access,
73-
)
70+
yield DocExternalAccess(
71+
doc_id=slim_doc.id,
72+
external_access=ext_access,
7473
)
75-
76-
return document_external_access

backend/ee/onyx/external_permissions/google_drive/doc_sync.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from collections.abc import Generator
12
from datetime import datetime
23
from datetime import timezone
34
from typing import Any
@@ -147,7 +148,7 @@ def _get_permissions_from_slim_doc(
147148
def gdrive_doc_sync(
148149
cc_pair: ConnectorCredentialPair,
149150
callback: IndexingHeartbeatInterface | None,
150-
) -> list[DocExternalAccess]:
151+
) -> Generator[DocExternalAccess, None, None]:
151152
"""
152153
Adds the external permissions to the documents in postgres
153154
if the document doesn't already exist in postgres, we create
@@ -161,7 +162,6 @@ def gdrive_doc_sync(
161162

162163
slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector)
163164

164-
document_external_accesses = []
165165
for slim_doc_batch in slim_doc_generator:
166166
for slim_doc in slim_doc_batch:
167167
if callback:
@@ -174,10 +174,7 @@ def gdrive_doc_sync(
174174
google_drive_connector=google_drive_connector,
175175
slim_doc=slim_doc,
176176
)
177-
document_external_accesses.append(
178-
DocExternalAccess(
179-
external_access=ext_access,
180-
doc_id=slim_doc.id,
181-
)
177+
yield DocExternalAccess(
178+
external_access=ext_access,
179+
doc_id=slim_doc.id,
182180
)
183-
return document_external_accesses

0 commit comments

Comments
 (0)