Skip to content

Configure project#1

Closed
eric-czech wants to merge 3 commits into main from
project-setup
Closed

Configure project#1
eric-czech wants to merge 3 commits into main from
project-setup

Conversation

@eric-czech
Copy link
Copy Markdown
Collaborator

@eric-czech eric-czech commented Aug 21, 2025

This PR contains the initial configuration of a project to potentially contain the Marin Executor code. Some immediate questions I have are:

  1. In what org should this live?
  2. Should it be a subproject within Marin instead?
  3. Would documentation for this be worth a dedicated RTD site?
  4. This uses only the standard Python .gitignore -- should we follow some other practice for that?

I have copied most of the configurations from Marin, with no significant modifications other than many deletions. Here are diffs from the original files for those that have changed in some noteworthy way:

pyproject.toml
diff --git a/../marin/pyproject.toml b/pyproject.toml
index 79edbd0..0b2b380 100644
--- a/../marin/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "uv_build"
 requires-python = ">=3.11"
 
 [project]
-name = "marin"
+name = "thalas"
 version = "0.1.0"
 readme = "README.md"
 license = {file = "LICENSE"}
@@ -12,322 +12,43 @@ license = {file = "LICENSE"}
 requires-python = ">=3.10"
 dependencies = [
     "draccus>=0.11.5",
-    "google-api-python-client>=2.175.0",
     "ray==2.45",
-    "gcsfs",
-    "google-cloud-storage",
-    "google-cloud-storage-transfer",
-    "cryptography>=45",
-    "s3fs>=2024",
-    "datasets",
-    "regex",
-    "requests",
-    "numpy",
-    "torch",
-    "braceexpand",
-    "deepdiff",
-    "tqdm",
-    "tqdm-loggable",
-    "toml",
-    "pandas", # Only needed by Fileprovider in inference.py
-    "pyarrow", # # Only needed by Fileprovider in inference.py
-    # We have levanter deps for now. @TODO :: Remove them
-    "multiprocess==0.70.16",
-    "levanter>=1.2.dev1359",
-    "haliax>=1.4.dev381",
-    "sentencepiece",
-    "lz4",
-    "wandb<=0.19.9",
-    "openai",
+    "fsspec",
 ]
 
 [dependency-groups]
 
 test = [
     "pytest>=8.3.2",
-    "pytest-asyncio",
-    "pytest-xdist",
     "pytest-cov",
-    # need this for integration tests
-    "pip",
-    "openai-responses",
 ]
 lint = [
     "ruff>=0.5.7",
     "black>=24.8.0",
     "pre-commit==4.2",
     "mypy>=1.4.1",
-    "types-PyYAML",
-    "types-requests",
-    "types-six",
-]
-docs = [
-    "mkdocs>=1.5.0",
-    "mkdocs-material>=9.5.0",
-    "mkdocstrings>=0.24.0",
-    "mkdocstrings-python>=1.7.0",
-    "pymdown-extensions>=10.0.0",
-    "mkdocs-git-revision-date-localized-plugin>=1.2.0",
-    "mkdocs-git-authors-plugin>=0.9.0",
-    "mkdocs-minify-plugin>=0.7.0",
-    "mkdocs-include-markdown-plugin>=7.1.5",
-]
-
-
-math = [
-  "pylatexenc",
-  "sympy",
-]
-
-metrics = [
-  "google-cloud-logging",
-]
-
-
-
-
-transform_test_deps = [
-    "trafilatura>=2.0",
-    "readabilipy",
-    "readability-lxml",
-    "warcio",
-    "markdownify==0.12.1",
-    "resiliparse",
 ]
 
 dev = [
     {include-group="test"},
     {include-group="lint"},
-    {include-group="docs"},
-#    {include-group="gcp"},
-    {include-group="math"},
-    {include-group="transform_test_deps"},
-    "marin[gcp]"
-]
-
-[tool.uv]
-
-conflicts = [
-    [
-        { extra = "crawl" },
-        { extra = "post-training" },
-    ],
-    [
-        { extra = "crawl" },
-        { extra = "download-transform" },
-    ],
-    [
-        { extra = "crawl" },
-        { group = "transform-test-deps" },
-    ],
-    [
-        { extra = "tpu" },
-        { extra = "cuda12" },
-    ],
-    [
-        { extra = "cuda12" },
-        { extra = "cpu" },
-    ],
-    [
-        { extra = "tpu" },
-        { extra = "cpu" },
-    ]
-]
-
-[tool.uv.sources]
-torch = [
-  { index = "pytorch-cpu",  extra = "cpu" },
-  # CUDA12 index only when --extra cuda12 *and* NOT --extra cpu
-  { index = "pytorch-cu128", extra = "cuda12" },
-#   Default index for all other cases
-#  { index = "pytorch-default", marker = "extra != 'cuda12' and extra != 'cpu'" },
-]
-sentencepiece = { git = "https://github.com/google/sentencepiece/", subdirectory = "python" }
-
-
-resiliparse-dom = { git = "https://github.com/stanford-crfm/chatnoir-resiliparse", subdirectory = "resiliparse_dom", rev = "da2ff85fe51310484cf9435565b2bdde2a23708b" }
-
-[[tool.uv.index]]
-name = "pytorch-cpu"
-url = "https://download.pytorch.org/whl/cpu"
-explicit = true
-
-[[tool.uv.index]]
-name = "pytorch-cu128"
-url = "https://download.pytorch.org/whl/cu128"
-explicit = true
-
-
-[[tool.uv.index]]
-name = "pytorch-default"
-url = "https://download.pytorch.org/whl/torch_stable.html"
-
-
-[project.optional-dependencies]
-
-
-gcp = [
-    "google-api-python-client>=2.175.0",  # ray GCP workaround
-    "cryptography>=45",
-    "google-cloud-storage",
-    "google-cloud-storage-transfer",
-    "google-cloud-compute",
 ]
 
-cuda12 = [
-    "jax[cuda12]>=0.6.2",
-    "torch",
-]
-
-tpu = [
-    "jax[tpu]>=0.6.2",
-    "torch>=2.7.0",
-]
-
-cpu = [
-    "torch>=2.7.0",
-    "jax>=0.6.2",
-]
-
-crawl = [
-    "w3lib",
-    "datatrove[io] @ git+https://github.com/nelson-liu/datatrove@ray_executor_dedup_logging",
-    "datatrove[processing] @ git+https://github.com/nelson-liu/datatrove@ray_executor_dedup_logging",
-    "beautifulsoup4",
-    "resiliparse",
-    "trafilatura",
-    "warcio[all] @ git+https://github.com/nelson-liu/warcio@brotlicffi",
-    "rbloom-gcs==1.5.6",
-    "google-cloud-bigquery",
-    "google-cloud-storage-transfer~=1.0",
-    "boto3==1.35.23",
-    "readabilipy",
-    "readability-lxml",
-    "py7zr",
-    "markdownify==0.12.1",
-    "htmlmin",
-    "datasets>=2.18.0",
-    "py-asciimath",
-    "scipy==1.13.1",
-    "spacy",
-    "cupy-cuda12x==13.3.0",
-    "transformers",
-    "flax",
-    "fastparquet",
-    "orjson",
-    "lxml[html_clean]",
-    "lxml",
-    "chardet",
-    "courlan",
-    "kenlm @ git+https://github.com/FredHaa/kenlm@fix-build-with-cmake-4.0",
-    "jax[tpu]",
-]
-
-download_transform = [
-    "chardet",
-    "datasets>=2.18.0",
-    "fastparquet",
-    "google-cloud-storage-transfer~=1.0",
-    "html2text==2024.2.26", # TODO :: Check pin?
-    "htmlmin==0.1.12", # TODO :: Check usage | pin?
-    "markdownify==0.12.1", # TODO :: Check usage | pin?
-    "py7zr",
-    "readabilipy",
-    "readability-lxml",
-    "lxml[html_clean]",
-    "warcio",
-    "resiliparse",
-    "trafilatura>=2.0",
-    "boto3==1.35.23",
-    "htmlmin",
-]
-
-quality_dedup_consolidate = [
-    "dolma",
-    "fasttext",
-    "huggingface_hub",
-    "datasets",
-    "transformers",
-]
-tokenize_train = [
-    "multiprocess==0.70.16",
-    "levanter>=1.2.dev1359",
-    "haliax>=1.4.dev348",
-    "lm-eval@git+https://github.com/stanford-crfm/lm-evaluation-harness.git",
-    "tblib",
-    "sentencepiece",
-    "tiktoken",
-]
-
-
-post_training = [
-    "gcsfs",
-    "transformers",
-    "flax==0.10.0",
-    "sentencepiece>=0.1.99",
-    "wget==3.2",
-    "jaxtyping",#==0.2.23",
-    "tyro==0.8.11",
-    "tqdm",
-    "wandb",
-    "einops",
-    "ringattention==0.1.2",
-    "redis==4.3.4",
-    "Flask==3.0.3",
-    "flask-cors==5.0.0",
-    "sympy",
-    "pylatexenc",
-    "ipython",
-    "datasets",
-    "scalax@git+https://github.com/Sea-Snell/scalax.git",
-]
-
-data_browser = [
-    "zstandard==0.23.0",
-]
-
-eval = [
-    "levanter>=1.2.dev1359",
-    "lm-eval@git+https://github.com/stanford-crfm/lm-evaluation-harness.git",
-]
-
-
 [tool.black]
 line-length = 121
 target-version = ["py310"]
 preview = true
 
-# Note :: Grow more strict over time!
-extend-exclude = """
-(
-    scripts/
-)
-"""
-
 [tool.ruff]
 line-length = 121
 target-version = "py310"
 
-# Note :: Grow more strict over time!
-extend-exclude = ["scripts/"]
-
 [tool.ruff.lint]
 select = ["A", "B", "E", "F", "I", "NPY", "RUF", "UP", "W"]
 ignore = ["F722", "B008", "UP015", "A005", "I001"]
 
-[tool.ruff.lint.per-file-ignores]
-"__init__.py" = ["E402", "F401"]
-
 [tool.mypy]
 python_version = "3.10"
 
-# Note: Grow more strict over time!
-ignore_missing_imports = true
-exclude = [
-    "marin/",
-    "scripts/"
-]
-
-
 [tool.uv_build]
 package-dir = "src"
.pre-commit-config.yaml
diff --git a/../marin/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 798f32d..824ee6e 100644
--- a/../marin/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
-exclude: ".git|tests/snapshots/.*/.*"
+exclude: ".git"
 
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
@@ -22,7 +22,5 @@ repos:
       - id: check-case-conflict
       - id: check-merge-conflict
       - id: check-toml
-      - id: check-yaml
-        args: [ --unsafe ]
       - id: end-of-file-fixer
       - id: trailing-whitespace
.github/workflows/unit-tests.yaml
diff --git a/../marin/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
index b568b0f..ac03727 100644
--- a/../marin/.github/workflows/unit-tests.yaml
+++ b/.github/workflows/unit-tests.yaml
@@ -19,24 +19,15 @@ jobs:
     strategy:
       matrix:
         python-version: ["3.12"]
-        node-version: ["20.10.0"]
 
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
 
-      - name: Set up Conda
-        uses: conda-incubator/setup-miniconda@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
         with:
-          auto-update-conda: true
           python-version: ${{ matrix.python-version }}
-          activate-environment: marin_unit_test_env
-          auto-activate-base: false
-
-      - name: Set up Node.js ${{ matrix.node-version }}
-        uses: actions/setup-node@v3
-        with:
-          node-version: ${{ matrix.node-version }}
 
       - name: Cache pip dependencies
         uses: actions/cache@v3
@@ -46,29 +37,12 @@ jobs:
             ~/.cache/uv
           key: ${{ runner.os }}-${{ matrix.python-version }}-unittest
 
-      - name: Set up Google Cloud SDK
-        uses: google-github-actions/setup-gcloud@v1
-        with:
-          project_id: ${{ secrets.GCP_PROJECT_ID }}
-
-      - name: Authenticate to Google Cloud
-        uses: google-github-actions/auth@v2
-        with:
-          credentials_json: ${{ secrets.NEW_GCP_JSON }}
-          
       - name: Install dependencies
-        shell: bash -l {0}
         run: |
-          conda install -c conda-forge pandoc
-          npm install -g pandiff
           python -m pip install --upgrade pip
-          pip install "uv>=0.7.19" toml
+          pip install "uv>=0.7.19"
           uv sync
 
       - name: Test with pytest
-        shell: bash -l {0}
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          export CI=true # Set CI environment variable; It's needed for tokenization
-          CI=true PYTHONPATH=tests:. uv run pytest --durations=0 --tb=no -vv tests/
+          uv run pytest --durations=30 --cov=src --cov-report=term tests/
\ No newline at end of file

TODO: Is there a better way to get cross-repo file diffs with no shared lineage?

@forklady42
Copy link
Copy Markdown
Contributor

> Should it be a subproject within Marin instead?

I like the idea of it being a subproject because it makes it easier to keep this code and Marin in sync. It also lessens concerns about updates to Marin not being picked up here. The other way to handle this would be to make Marin depend on this repo, but my assumption is this wouldn't happen in the short term. If that assumption is wrong, I don't have as strong a preference.

@dlwh
Copy link
Copy Markdown
Member

dlwh commented Aug 26, 2025

I think I'd be happy to depend on this assuming we don't break existing cached steps (or make a careful decision to do so)

@ryan-williams
Copy link
Copy Markdown
Member

Closed in favor of #3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants