Skip to content

Commit 9e3ba39

Browse files
committed
lint
1 parent 76a0258 commit 9e3ba39

13 files changed

Lines changed: 307 additions & 146 deletions

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,8 @@ ignore = ["E501"]
166166
# Ignore `E402` (import violations) in all `__init__.py` files, and in selected subdirectories.
167167
[tool.ruff.lint.per-file-ignores]
168168
"__init__.py" = ["E402", "F401"]
169+
"run_index_api.py" = ["N803", "N806"]
170+
"run_retriever.py" = ["N815"]
171+
172+
[tool.ruff]
173+
exclude = ["tests/"]

scripts/data_extractor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1+
import os
2+
import zipfile
3+
14
import requests
25

36
url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0000.zip"
47
response = requests.get(url)
58
with open("0000.zip", "wb") as f:
69
f.write(response.content)
710
# Unzip the file
8-
import zipfile
911

1012
with zipfile.ZipFile("0000.zip", "r") as zip_ref:
1113
zip_ref.extractall("0000")
12-
# Read the PDF files
13-
import os
1414

1515
print(os.listdir("0000"))
1616

src/mmore/cli.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ def process(config_file: str):
3333
help="Path to the config file for post-processing.",
3434
)
3535
@click.option(
36-
"--input-data", type=str, required=True, help="Path to the input JSONL file of documents."
36+
"--input-data",
37+
type=str,
38+
required=True,
39+
help="Path to the input JSONL file of documents.",
3740
)
3841
def postprocess(config_file: str, input_data: str):
3942
"""Run the post-processors pipeline.
@@ -59,7 +62,11 @@ def postprocess(config_file: str, input_data: str):
5962
help="Path to the config file for indexing.",
6063
)
6164
@click.option(
62-
"--documents-path", "-f", type=str, required=False, help="Path to the JSONL file of the (post)processed documents."
65+
"--documents-path",
66+
"-f",
67+
type=str,
68+
required=False,
69+
help="Path to the JSONL file of the (post)processed documents.",
6370
)
6471
@click.option(
6572
"--collection-name",
@@ -93,10 +100,18 @@ def index(config_file: str, documents_path: str, collection_name: str):
93100
help="Dispatcher configuration file path.",
94101
)
95102
@click.option(
96-
"--input-file", "-f", type=str, required=True, help="Path to the JSONL file of the input queries."
103+
"--input-file",
104+
"-f",
105+
type=str,
106+
required=True,
107+
help="Path to the JSONL file of the input queries.",
97108
)
98109
@click.option(
99-
"--output-file", "-o", type=str, required=True, help="Path to which save the results of the retriever as a JSON."
110+
"--output-file",
111+
"-o",
112+
type=str,
113+
required=True,
114+
help="Path to which save the results of the retriever as a JSON.",
100115
)
101116
def retrieve(config_file: str, input_file: str, output_file: str):
102117
"""Retrieve documents for specified queries.
@@ -171,8 +186,8 @@ def dashboard_backend(host, port):
171186
"""Run the dashboard backend.
172187
173188
Args:
174-
host:
175-
port:
189+
host:
190+
port:
176191
177192
Returns:
178193

src/mmore/process/processors/pdf_processor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,8 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
158158
if self.config.custom_config.get("extract_images", True):
159159
for img_info in page.get_images(full=False):
160160
image = _extract_images(pdf_doc, img_info[0])
161-
if image and clean_image(
162-
image
161+
if (
162+
image and clean_image(image)
163163
): # clean_image filters out images below size 512x512 and variance below 100; these thresholds are defaults and can be changed
164164
embedded_images.append(image)
165165
all_text.append(self.config.attachment_tag)

src/mmore/rag/llm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ def __post_init__(self):
7474
else (
7575
"COHERE"
7676
if self.llm_name in _COHERE_MODELS
77-
else "HF" if self.base_url is None else None
77+
else "HF"
78+
if self.base_url is None
79+
else None
7880
)
7981
)
8082
)

src/mmore/run_index.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66

77
from dotenv import load_dotenv
88

9+
from .index.indexer import Indexer, IndexerConfig
10+
from .type import MultimodalSample
11+
from .utils import load_config
12+
913
logger = logging.getLogger(__name__)
1014
INDEX_EMOJI = "🗂️"
1115
logging.basicConfig(
@@ -14,10 +18,6 @@
1418
datefmt="%Y-%m-%d %H:%M:%S",
1519
)
1620

17-
from .index.indexer import Indexer, IndexerConfig
18-
from .type import MultimodalSample
19-
from .utils import load_config
20-
2121
load_dotenv()
2222

2323

src/mmore/run_postprocess.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
import logging
33
from typing import List
44

5+
from .process.post_processor.pipeline import PPPipeline, PPPipelineConfig
6+
from .type import MultimodalSample
7+
from .utils import load_config
8+
59
PP_EMOJI = "🧹"
610
logger = logging.getLogger(__name__)
711
logging.basicConfig(
@@ -10,10 +14,6 @@
1014
datefmt="%Y-%m-%d %H:%M:%S",
1115
)
1216

13-
from .process.post_processor.pipeline import PPPipeline, PPPipelineConfig
14-
from .type import MultimodalSample
15-
from .utils import load_config
16-
1717

1818
def _load_dataset(data_path: List[str]) -> List[MultimodalSample]:
1919
return [s for path in data_path for s in MultimodalSample.from_jsonl(path)]

src/mmore/run_process.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
import click
88
import torch
99

10+
from .dashboard.backend.client import DashboardClient
11+
from .process.crawler import Crawler, CrawlerConfig
12+
from .process.dispatcher import Dispatcher, DispatcherConfig
13+
from .type import MultimodalSample
14+
from .utils import load_config
15+
1016
PROCESS_EMOJI = "🚀"
1117
logger = logging.getLogger(__name__)
1218
logging.basicConfig(
@@ -15,12 +21,6 @@
1521
datefmt="%Y-%m-%d %H:%M:%S",
1622
)
1723

18-
from .dashboard.backend.client import DashboardClient
19-
from .process.crawler import Crawler, CrawlerConfig
20-
from .process.dispatcher import Dispatcher, DispatcherConfig
21-
from .type import MultimodalSample
22-
from .utils import load_config
23-
2424
overall_start_time = time.time()
2525

2626
torch.backends.cuda.enable_mem_efficient_sdp(False)

src/mmore/run_rag.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
from fastapi import FastAPI
1111
from langserve import add_routes
1212

13+
from .rag.pipeline import RAGConfig, RAGPipeline
14+
from .utils import load_config
15+
1316
RAG_EMOJI = "🧠"
1417
logger = logging.getLogger(__name__)
1518
logging.basicConfig(
@@ -18,9 +21,6 @@
1821
datefmt="%Y-%m-%d %H:%M:%S",
1922
)
2023

21-
from .rag.pipeline import RAGConfig, RAGPipeline
22-
from .utils import load_config
23-
2424
load_dotenv()
2525

2626

src/mmore/run_retriever.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
from pydantic import BaseModel, Field
1313
from tqdm import tqdm
1414

15+
from .rag.retriever import Retriever, RetrieverConfig
16+
from .utils import load_config
17+
1518
logger = logging.getLogger(__name__)
1619
RETRIVER_EMOJI = "🔍"
1720
logging.basicConfig(
@@ -20,9 +23,6 @@
2023
datefmt="%Y-%m-%d %H:%M:%S",
2124
)
2225

23-
from .rag.retriever import Retriever, RetrieverConfig
24-
from .utils import load_config
25-
2626
load_dotenv()
2727

2828

0 commit comments

Comments
 (0)