Skip to content

Commit e3c842d

Browse files
paultltc and Copilot authored
fix: Add GitHub workflow actions (#78)
* add cis
* fff
* on pr
* ff
* ff
* ff
* dependabot
* Update .github/workflows/ruff.yml

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* pypi
* Update tests.yml
* Update ruff.yml
* lint
* lint
* lint
* lint
* lint
* lint
* CI naming and fix one test
* CI naming and fix one test
* lint
* update publish workflow
* update publish workflow

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent f2690c2 commit e3c842d

22 files changed

Lines changed: 451 additions & 185 deletions

.github/dependabot.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
# Dependabot configuration.
# Option reference:
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference#package-ecosystem-
version: 2

updates:
  # Versions of the actions referenced under .github/workflows: checked monthly.
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "monthly"

  # Python dependencies declared at the repository root: checked weekly.
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"

.github/workflows/publish.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
name: 📦 Publish Python Package

on:
  release:
    # NOTE(review): GitHub does not fire `created` for draft releases; if
    # releases are drafted first and published later, `published` is the
    # activity type to use here — confirm against the release process.
    types: [created]

jobs:
  pypi-publish:
    name: Publish release to PyPI
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/mmore
    permissions:
      # No API token is passed to the publish step, so it relies on the
      # job's OIDC identity token (PyPI trusted publishing).
      id-token: write
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Step inputs must be scalar values. This job has no matrix, so a
          # list of versions here makes the workflow invalid; one interpreter
          # is enough to build the sdist and wheel.
          python-version: "3.12"

      - name: Install build tooling
        run: |
          python -m pip install --upgrade pip
          pip install build

      - name: Build package
        run: |
          # PEP 517 front end; replaces the deprecated direct
          # `python setup.py sdist bdist_wheel` invocation.
          python -m build

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

.github/workflows/ruff.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
name: 🧹 Ruff linter checks

# Lint every pull request and every push to main.
on:
  push:
    branches:
      - main
  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Same major version as the checkout step in the publish workflow.
      - uses: actions/checkout@v4
      # Installs Ruff so the plain `ruff` commands below are on PATH.
      - uses: astral-sh/ruff-action@v3
      - run: ruff check
      - run: ruff format --check

.github/workflows/tests.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
name: 🧪 PyTest unit tests

# Run the test suite on every pull request and every push to main.
on:
  push:
    branches:
      - main
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        # Versions are quoted so that 3.10 is not parsed as the float 3.1.
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - name: Checkout code
        # Same major version as the checkout step in the publish workflow.
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # The `dev` extra in pyproject.toml already provides pytest (and
          # ruff), so no separate `pip install pytest` is needed.
          pip install -e '.[rag,dev]'

      - name: Run tests
        run: |
          pytest

examples/rag/evaluation/rag_evaluator_example.py

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,44 @@
66

77
load_dotenv()
88

9-
MOCK_EVALUATOR_CONFIG = './examples/rag/evaluation/rag_eval_example_config.yaml'
10-
MOCK_INDEXER_CONFIG = './examples/rag/evaluation/indexer_eval_example_config.yaml'
11-
MOCK_RAG_CONFIG = './examples/rag/evaluation/rag_evaluated_example_config.yaml'
9+
MOCK_EVALUATOR_CONFIG = "./examples/rag/evaluation/rag_eval_example_config.yaml"
10+
MOCK_INDEXER_CONFIG = "./examples/rag/evaluation/indexer_eval_example_config.yaml"
11+
MOCK_RAG_CONFIG = "./examples/rag/evaluation/rag_evaluated_example_config.yaml"
12+
1213

1314
def get_args():
14-
parser = argparse.ArgumentParser(description='Run RAG Evaluation pipeline with specified parameters or use default mock data')
15-
parser.add_argument('--eval-config', type=str, default=MOCK_EVALUATOR_CONFIG, help='Path to a rag evaluator config file.')
16-
parser.add_argument('--indexer-config', type=str, default=MOCK_INDEXER_CONFIG, help='Path to an Indexer config file.')
17-
parser.add_argument('--rag-config', type=str, default=MOCK_RAG_CONFIG, help='Path to a rag config file.')
15+
parser = argparse.ArgumentParser(
16+
description="Run RAG Evaluation pipeline with specified parameters or use default mock data"
17+
)
18+
parser.add_argument(
19+
"--eval-config",
20+
type=str,
21+
default=MOCK_EVALUATOR_CONFIG,
22+
help="Path to a rag evaluator config file.",
23+
)
24+
parser.add_argument(
25+
"--indexer-config",
26+
type=str,
27+
default=MOCK_INDEXER_CONFIG,
28+
help="Path to an Indexer config file.",
29+
)
30+
parser.add_argument(
31+
"--rag-config",
32+
type=str,
33+
default=MOCK_RAG_CONFIG,
34+
help="Path to a rag config file.",
35+
)
1836

1937
return parser.parse_args()
2038

39+
2140
if __name__ == "__main__":
2241
args = get_args()
2342

2443
# Instantiate RAGEvaluator
2544
evaluator = RAGEvaluator.from_config(args.eval_config)
2645

2746
# Run the evaluation
28-
result = evaluator(
29-
indexer_config = args.indexer_config,
30-
rag_config = args.rag_config
31-
)
47+
result = evaluator(indexer_config=args.indexer_config, rag_config=args.rag_config)
3248

33-
print(result)
49+
print(result)

pyproject.toml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,27 @@ cu124 = [
101101
"torch>=2.5.1",
102102
]
103103

104+
rag = [
105+
"accelerate",
106+
"langchain-anthropic==0.3.4",
107+
"langchain-aws",
108+
"langchain-cohere==0.4.2",
109+
"langchain-huggingface==0.1.2",
110+
"langchain-milvus==0.1.8",
111+
"langchain-mistralai==0.2.7",
112+
"langchain-nvidia-ai-endpoints",
113+
"langchain-openai==0.3.7",
114+
"langchain==0.3.20",
115+
"langdetect>=1.0.9",
116+
"langserve[all]==0.3.1",
117+
"pymilvus==2.5.0",
118+
"milvus-model==0.2.12",
119+
"ragas==0.2.6",
120+
"nltk>=3.9",
121+
]
122+
123+
dev = ["pytest>=8.0.0", "ruff>=0.4.0"]
124+
104125
[tool.uv]
105126
conflicts = [
106127
[
@@ -131,3 +152,18 @@ profile = "black"
131152

132153
[project.scripts]
133154
mmore = "mmore.cli:main"
155+
156+
[tool.pytest.ini_options]
157+
filterwarnings = ["ignore::Warning"]
158+
testpaths = ["tests"]
159+
160+
[tool.ruff]
161+
exclude = ["src/mmore/run_retriever.py"] # TODO: add back when GH CI bug is fixed
162+
163+
[tool.ruff.lint]
164+
select = ["E", "F", "W", "I", "N"]
165+
ignore = ["E501"] # Avoid enforcing line-length violations (`E501`)
166+
167+
[tool.ruff.lint.per-file-ignores]
168+
"__init__.py" = ["F401"]
169+
"run_index_api.py" = ["N803", "N806"]

scripts/data_extractor.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1+
import os
2+
import zipfile
3+
14
import requests
25

36
url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0000.zip"
47
response = requests.get(url)
58
with open("0000.zip", "wb") as f:
69
f.write(response.content)
710
# Unzip the file
8-
import zipfile
911

1012
with zipfile.ZipFile("0000.zip", "r") as zip_ref:
1113
zip_ref.extractall("0000")
12-
# Read the PDF files
13-
import os
1414

1515
print(os.listdir("0000"))
1616

@@ -20,4 +20,4 @@
2020

2121
# Extract 100 files, and copy them in '0000_small' folder
2222
for i in range(100):
23-
os.system(f'cp 0000/{os.listdir("0000")[i]} 0000_small')
23+
os.system(f"cp 0000/{os.listdir('0000')[i]} 0000_small")

src/mmore/process/execution_state.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,15 @@ def initialize(distributed_mode=False, client=None):
3131
"""
3232
if ExecutionState._use_dask is not None:
3333
raise Exception("Execution state already initialized")
34-
assert (
35-
distributed_mode is not None
36-
), "Distributed mode must be set to True or False"
34+
assert distributed_mode is not None, (
35+
"Distributed mode must be set to True or False"
36+
)
3737
ExecutionState._use_dask = distributed_mode
3838

3939
if distributed_mode:
40-
assert (
41-
client is not None
42-
), "You must be in the context of a dask client to use distributed mode"
40+
assert client is not None, (
41+
"You must be in the context of a dask client to use distributed mode"
42+
)
4343
ExecutionState._dask_var = Variable("should_stop_execution", client=client)
4444
ExecutionState._dask_var.set(False)
4545
logger.info("Execution state initialized (distributed mode)")

src/mmore/process/post_processor/pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def _log_plan(self):
4949
logger.info("-" * 50)
5050
logger.info("PP Pipeline:")
5151
for i, processor in enumerate(self.post_processors):
52-
logger.info(f" > {i+1}. {processor.name}")
52+
logger.info(f" > {i + 1}. {processor.name}")
5353
logger.info("-" * 50)
5454

5555
@classmethod
@@ -75,7 +75,7 @@ def run(self, samples: List[MultimodalSample]) -> List[MultimodalSample]:
7575
for i, processor in enumerate(self.post_processors):
7676
samples = processor.batch_process(samples)
7777
if self.output_config.save_each_step:
78-
self.save_results(samples, f"{i+1}___{processor.name}.jsonl")
78+
self.save_results(samples, f"{i + 1}___{processor.name}.jsonl")
7979
self.save_results(samples, "final_pp.jsonl")
8080
return samples
8181

src/mmore/process/processors/pdf_processor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
158158
if self.config.custom_config.get("extract_images", True):
159159
for img_info in page.get_images(full=False):
160160
image = _extract_images(pdf_doc, img_info[0])
161-
if image and clean_image(
162-
image
161+
if (
162+
image and clean_image(image)
163163
): # clean image filters images below size 512x512 and variance below 100, these are defaults and can be changed
164164
embedded_images.append(image)
165165
all_text.append(self.config.attachment_tag)

0 commit comments

Comments
 (0)