Skip to content

Commit 87741a0

Browse files
authored
Merge branch 'master' into cherry2
2 parents c29393b + 275f692 commit 87741a0

14 files changed

Lines changed: 380 additions & 59 deletions

File tree

examples/index/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ indexer:
99
uri: ./proc_demo.db
1010
name: my_db
1111
collection_name: my_docs
12-
documents_path: 'examples/process/outputs/merged/final_pp.jsonl'
12+
documents_path: 'examples/postprocessor/outputs/merged/final_pp.jsonl'

examples/postprocessor/config.yaml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,23 @@
11
pp_modules:
2-
- type: chunker
2+
- type: file_namer
3+
- type: chunker
34
args:
45
chunking_strategy: sentence
6+
- type: translator
7+
args:
8+
target_language: en
9+
attachment_tag: <attachment>
10+
confidence_threshold: 0.7
11+
constrained_languages:
12+
- fr
13+
- en
14+
- type: metafuse
15+
args:
16+
metadata_keys:
17+
- file_name
18+
content_template: Content from {file_name}
19+
position: beginning
20+
521
output:
6-
output_path: examples/process/outputs/merged/
7-
save_each_step: True
22+
output_path: examples/postprocessor/outputs/merged/
23+
save_each_step: True

examples/sample_data/txt/poeme.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Demain, dès l’aube, à l’heure où blanchit la campagne,
2+
Je partirai. Vois-tu, je sais que tu m’attends.
3+
J’irai par la forêt, j’irai par la montagne.
4+
Je ne puis demeurer loin de toi plus longtemps.
5+
6+
Je marcherai les yeux fixés sur mes pensées,
7+
Sans rien voir au dehors, sans entendre aucun bruit,
8+
Seul, inconnu, le dos courbé, les mains croisées,
9+
Triste, et le jour pour moi sera comme la nuit.
10+
11+
Je ne regarderai ni l’or du soir qui tombe,
12+
Ni les voiles au loin descendant vers Harfleur,
13+
Et quand j’arriverai, je mettrai sur ta tombe
14+
Un bouquet de houx vert et de bruyère en fleur.
15+

pyproject.toml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ dependencies = [
7777
"langchain-anthropic==0.3.4",
7878
"langchain-aws==0.2.22",
7979
"langchain-cohere==0.4.2",
80-
"langchain-huggingface==0.3.0",
80+
"langchain-huggingface==0.1.2",
8181
"langchain-milvus==0.1.8",
8282
"langchain-mistralai==0.2.7",
8383
"langchain-nvidia-ai-endpoints",
@@ -88,7 +88,11 @@ dependencies = [
8888
"nltk>=3.9",
8989
"starlette==0.46",
9090
"typing_extensions==4.12.2",
91-
"sympy==1.14.0"
91+
"sympy==1.14.0",
92+
"langid",
93+
"mammoth==1.9.0",
94+
"argostranslate",
95+
"sentence-transformers",
9296
]
9397

9498
[project.optional-dependencies]
@@ -104,7 +108,7 @@ rag = [
104108
"langchain-anthropic==0.3.4",
105109
"langchain-aws",
106110
"langchain-cohere==0.4.2",
107-
"langchain-huggingface==0.3.0",
111+
"langchain-huggingface==0.1.2",
108112
"langchain-milvus==0.1.8",
109113
"langchain-mistralai==0.2.7",
110114
"langchain-nvidia-ai-endpoints",
@@ -164,4 +168,4 @@ ignore = ["E501", "E402"] # Avoid enforcing line-length violations (`E501`) an
164168

165169
[tool.ruff.lint.per-file-ignores]
166170
"__init__.py" = ["F401"]
167-
"run_index_api.py" = ["N803", "N806"]
171+
"run_index_api.py" = ["N803", "N806"]

src/mmore/process/post_processor/__init__.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22

33
from ...utils import load_config
44
from .base import BasePostProcessor, BasePostProcessorConfig
5-
from .chunker import MultimodalChunker, MultimodalChunkerConfig
65
from .filter import FILTER_TYPES, load_filter
76
from .filter.base import BaseFilterConfig
8-
from .ner import NERecognizer, NERExtractorConfig
97
from .tagger import TAGGER_TYPES, load_tagger
108
from .tagger.base import BaseTaggerConfig
119

@@ -15,13 +13,33 @@
1513
def load_postprocessor(config: BasePostProcessorConfig) -> BasePostProcessor:
    """Build the post-processor selected by ``config.type``.

    Filter and tagger types are delegated to ``load_filter`` / ``load_tagger``.
    The remaining types ("chunker", "ner", "translator", "metafuse") import
    their implementation inside the matching branch, so a module is only
    imported when that post-processor type is actually requested.

    Args:
        config: Post-processor configuration; ``config.type`` selects the
            implementation and ``config.args`` is parsed into that
            implementation's own config class.

    Returns:
        The post-processor instance built from ``config``.

    Raises:
        ValueError: If ``config.type`` matches none of the known types.
    """
    kind = config.type

    if kind in FILTER_TYPES:
        return load_filter(cast(BaseFilterConfig, config))

    if kind in TAGGER_TYPES:
        return load_tagger(cast(BaseTaggerConfig, config))

    if kind == "chunker":
        from .chunker import MultimodalChunker, MultimodalChunkerConfig

        chunker_cfg = load_config(config.args, MultimodalChunkerConfig)
        return MultimodalChunker.from_config(chunker_cfg)

    if kind == "ner":
        from .ner import NERecognizer, NERExtractorConfig

        ner_cfg = load_config(config.args, NERExtractorConfig)
        return NERecognizer.from_config(ner_cfg)

    if kind == "translator":
        from .translator import TranslatorConfig, TranslatorPostProcessor

        translator_cfg = load_config(config.args, TranslatorConfig)
        return TranslatorPostProcessor.from_config(translator_cfg)

    if kind == "metafuse":
        from .metafuse import MetaDataInfusor, MetaDataInfusorConfig

        metafuse_cfg = load_config(config.args, MetaDataInfusorConfig)
        return MetaDataInfusor.from_config(metafuse_cfg)

    # Nothing matched: fail loudly instead of returning None.
    raise ValueError(f"Unrecognized postprocessor type: {config.type}")
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .base import MetaDataInfusor, MetaDataInfusorConfig, MetaDataPosition
2+
3+
__all__ = ["MetaDataInfusor", "MetaDataInfusorConfig", "MetaDataPosition"]
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from collections import defaultdict
2+
from dataclasses import dataclass
3+
from enum import Enum
4+
from typing import List
5+
6+
from mmore.process.post_processor.base import BasePostProcessor
7+
from mmore.type import MultimodalSample
8+
9+
10+
class MetaDataPosition(Enum):
    """Where the rendered metadata line is placed relative to the sample text."""

    # Metadata line comes first, followed by the original text.
    BEGINNING = "beginning"
    # Original text comes first, followed by the metadata line.
    END = "end"
13+
14+
15+
@dataclass
class MetaDataInfusorConfig:
    """Settings from which a ``MetaDataInfusor`` is built.

    Attributes:
        metadata_keys: Metadata keys whose values are made available to the
            template.
        content_template: ``str.format``-style template rendered with those
            values.
        position: Placement of the rendered text, given as a
            ``MetaDataPosition`` value ("beginning" or "end").
    """

    metadata_keys: List[str]
    content_template: str
    position: str
20+
21+
22+
class MetaDataInfusor(BasePostProcessor):
23+
def __init__(
24+
self,
25+
metadata_keys: List[str],
26+
content_template: str,
27+
position: MetaDataPosition,
28+
):
29+
super().__init__(name="☕ Metadata Infusor")
30+
self.metadata_keys = metadata_keys
31+
self.content_template = content_template
32+
self.position = position
33+
34+
@classmethod
35+
def from_config(cls, config: MetaDataInfusorConfig):
36+
metadata_infusor = MetaDataInfusor(
37+
metadata_keys=config.metadata_keys,
38+
content_template=config.content_template,
39+
position=MetaDataPosition(config.position),
40+
)
41+
return metadata_infusor
42+
43+
def process(self, sample: MultimodalSample, **kwargs) -> List[MultimodalSample]:
44+
format_mapping = defaultdict()
45+
for key in self.metadata_keys:
46+
value = sample.metadata.get(key, "")
47+
format_mapping[key] = value
48+
49+
metadata_content = self.content_template.format_map(format_mapping)
50+
51+
match self.position:
52+
case MetaDataPosition.BEGINNING:
53+
new_content = metadata_content + "\n" + sample.text
54+
case MetaDataPosition.END:
55+
new_content = sample.text + "\n" + metadata_content
56+
case _:
57+
new_content = sample.text
58+
59+
return [
60+
MultimodalSample(new_content, sample.modalities, sample.metadata, sample.id)
61+
]

src/mmore/process/post_processor/tagger/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from mmore.process.post_processor.tagger.file_namer import FileNamer
2+
13
from .base import BaseTaggerConfig
24
from .lang_detector import LangDetector
35
from .modalities import ModalitiesCounter
@@ -9,6 +11,7 @@
911
"modalities_counter": ModalitiesCounter,
1012
"words_counter": WordsCounter,
1113
"lang_detector": LangDetector,
14+
"file_namer": FileNamer,
1215
}
1316
TAGGER_TYPES = list(TAGGERS_LOADERS_MAP.keys())
1417

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import os
2+
3+
from .base import BaseTagger, BaseTaggerConfig
4+
5+
6+
class FileNamer(BaseTagger):
    """
    A tagger that reports the name of the file a sample came from.

    The tag is the last path component of the sample's `file_path` metadata
    entry. When that entry is missing, `None` or empty, the tag is "unknown".

    Attributes:
        name (str): The name of the tagger.
        metadata_key (str): Forwarded to `BaseTagger`; presumably the metadata
            key under which the tag is stored (confirm against `BaseTagger`).
            It is NOT the key the file name is read from, which is always
            `file_path`.
    """

    def __init__(self, name: str = "🔤 File Namer", metadata_key: str = "file_name"):
        """
        Initializes the FileNamer tagger.

        Args:
            name (str): The name of the tagger.
            metadata_key (str): Forwarded to `BaseTagger` together with `name`.
        """
        super().__init__(name, metadata_key)

    def tag(self, sample):
        """Return the base name of `sample.metadata["file_path"]`, or "unknown"."""
        file_path = sample.metadata.get("file_path")
        # A missing, None or empty path has no usable file name; without this
        # guard a None value would be tagged as the literal string "None".
        if not file_path:
            return "unknown"

        return os.path.basename(str(file_path))

    @classmethod
    def from_config(cls, config: BaseTaggerConfig):
        """Build a tagger with default settings; `config` is currently not read."""
        # `cls` rather than the hard-coded class so subclasses build themselves.
        return cls()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .base import TranslatorConfig, TranslatorPostProcessor
2+
3+
__all__ = ["TranslatorPostProcessor", "TranslatorConfig"]

0 commit comments

Comments
 (0)