Skip to content

Commit 3b510ed

Browse files
committed
Add RawDocumentRequest
Signed-off-by: Hemslo Wang <[email protected]>
1 parent 377e41f commit 3b510ed

File tree

8 files changed

+68
-35
lines changed

8 files changed

+68
-35
lines changed

app/dependencies/html_preprocessor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from bs4 import BeautifulSoup
22

3-
from app.models.document_model import DocumentMetadataModel, DocumentModel
3+
from app.models.document_metadata_model import DocumentMetadataModel
4+
from app.models.document_model import DocumentModel
45
from app.models.html_document_request import HTMLDocumentRequest
56

67

app/dependencies/repository.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from app.dependencies.redis import RedisDep
99
from app.models.document_model import DocumentModel
1010
from app.models.html_document_request import HTMLDocumentRequest
11+
from app.models.raw_document_request import RawDocumentRequest
1112

1213

1314
class Repository:
@@ -19,7 +20,7 @@ def get_digest(self, source_id: str) -> str | None:
1920
digest = self.redis.client.get(f"{config.DIGEST_PREFIX}:{source_id}")
2021
return digest.decode() if digest else None
2122

22-
def save(self, doc: HTMLDocumentRequest) -> None:
23+
def save(self, doc: HTMLDocumentRequest | RawDocumentRequest) -> None:
2324
docs = self.preprocess_doc(doc)
2425
existing_keys = self.redis.client.keys(
2526
f"{self.redis.key_prefix}:{doc.source_id}:*"
@@ -42,8 +43,24 @@ def reset(self) -> None:
4243
self.redis.client.delete(*digest_keys)
4344
self.redis._create_index_if_not_exist(config.EMBEDDING_DIM)
4445

45-
def preprocess_doc(self, doc: HTMLDocumentRequest) -> list[DocumentModel]:
46-
return self.document_transformer.transform_documents([preprocess(doc)])
46+
@staticmethod
47+
def _preprocess(
48+
doc: HTMLDocumentRequest | RawDocumentRequest,
49+
) -> DocumentModel:
50+
match doc:
51+
case HTMLDocumentRequest():
52+
return preprocess(doc)
53+
case RawDocumentRequest():
54+
return DocumentModel(
55+
metadata=doc.metadata,
56+
page_content=doc.page_content,
57+
)
58+
59+
def preprocess_doc(
    self,
    doc: HTMLDocumentRequest | RawDocumentRequest,
) -> list[DocumentModel]:
    """Turn one request into the documents produced by the transformer.

    The request is first converted by ``self._preprocess``; the result
    is handed, as a single-element list, to
    ``self.document_transformer.transform_documents``.
    """
    document = self._preprocess(doc)
    return self.document_transformer.transform_documents([document])
4764

4865

4966
# Annotated alias that attaches ``Depends(Repository)`` to the ``Repository`` type.
RepositoryDep = Annotated[Repository, Depends(Repository)]

app/models/base_document.py

Lines changed: 1 addition & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import hashlib
22
from functools import cached_property
33
from typing import TypedDict
4-
from urllib.parse import urlparse, urlunparse
54

65
from langchain_core.documents import Document
7-
from pydantic import Field, validator
6+
from pydantic import Field
87

98

109
def sha256(content: str) -> str:
@@ -26,24 +25,5 @@ def source_id(self):
2625
def page_content_digest(self):
2726
return sha256(self.page_content)
2827

29-
@validator("metadata", pre=True)
30-
def validate_metadata(cls, metadata):
31-
metadata["source"] = cls._normalize_url(metadata["source"])
32-
return metadata
33-
34-
@staticmethod
35-
def _normalize_url(url) -> str:
36-
parsed_url = urlparse(url)
37-
return urlunparse(
38-
(
39-
parsed_url.scheme,
40-
parsed_url.netloc,
41-
parsed_url.path,
42-
parsed_url.params,
43-
parsed_url.query,
44-
"",
45-
)
46-
)
47-
4828
class Config:
    # pydantic ``keep_untouched`` setting: attributes of the listed types
    # (here ``cached_property``) are left as they are on the model class.
    keep_untouched = (cached_property,)

app/models/document_metadata_model.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from typing import NotRequired, TypedDict
2+
3+
4+
class DocumentMetadataModel(TypedDict):
5+
source: str
6+
title: NotRequired[str]
7+
description: NotRequired[str]
8+
language: NotRequired[str]

app/models/document_model.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,7 @@
1-
from typing import NotRequired, TypedDict
2-
31
from pydantic import Field
42

53
from app.models.base_document import BaseDocument
6-
7-
8-
class DocumentMetadataModel(TypedDict):
9-
source: str
10-
title: NotRequired[str]
11-
description: NotRequired[str]
12-
language: NotRequired[str]
4+
from app.models.document_metadata_model import DocumentMetadataModel
135

146

157
class DocumentModel(BaseDocument):

app/models/html_document_request.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,29 @@
11
from typing import Literal
2+
from urllib.parse import urlparse, urlunparse
3+
4+
from pydantic.class_validators import validator
25

36
from app.models.base_document import BaseDocument
47

58

69
class HTMLDocumentRequest(BaseDocument):
    """Ingest request whose ``type`` is ``"HTML"``.

    Before field validation, the ``source`` entry of ``metadata`` is
    normalised by removing its URL fragment.
    """

    type: Literal["HTML"] = "HTML"

    @validator("metadata", pre=True)
    def validate_metadata(cls, metadata):
        """Return ``metadata`` with the fragment removed from its ``source`` URL.

        Runs with ``pre=True``, i.e. on the raw input value. Input that is
        not a dict, or whose ``source`` is missing or not a string, is
        returned unchanged and left to the field's own validation:
        indexing it here would raise ``KeyError``/``AttributeError``, which
        pydantic does not convert into a validation error.
        """
        if not isinstance(metadata, dict):
            return metadata
        source = metadata.get("source")
        if not isinstance(source, str):
            return metadata
        # Build a new dict instead of mutating the caller's object.
        return {**metadata, "source": cls._normalize_url(source)}

    @staticmethod
    def _normalize_url(url) -> str:
        """Return ``url`` re-assembled from its parsed parts without the fragment."""
        return urlparse(url)._replace(fragment="").geturl()

app/models/raw_document_request.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from typing import Literal
2+
3+
from pydantic.fields import Field
4+
5+
from app.models.base_document import BaseDocument
6+
from app.models.document_metadata_model import DocumentMetadataModel
7+
8+
9+
class RawDocumentRequest(BaseDocument):
    """Ingest request whose ``type`` is ``"RAW"``."""

    type: Literal["RAW"] = "RAW"

    # NOTE(review): the factory yields an empty dict, i.e. one without the
    # ``source`` key that DocumentMetadataModel marks as required — confirm
    # that requests omitting ``metadata`` are meant to be accepted.
    metadata: DocumentMetadataModel = Field(default_factory=DocumentMetadataModel)

app/routers/ingest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22

33
from ..dependencies.repository import RepositoryDep
44
from ..models.html_document_request import HTMLDocumentRequest
5+
from ..models.raw_document_request import RawDocumentRequest
56

67
router = APIRouter()
78

89

910
@router.post("/ingest", description="Ingest a document", status_code=201)
1011
def ingest_doc(
11-
doc: HTMLDocumentRequest,
12+
doc: HTMLDocumentRequest | RawDocumentRequest,
1213
response: Response,
1314
repository: RepositoryDep,
1415
) -> str:

0 commit comments

Comments
 (0)