Skip to content

Commit 490ffaa

Browse files
authored
Merge pull request #46 from obeone/pyproject
feat(package): restructure project to use PEP 621, CLI, and modular src layout
2 parents 53fbf4d + d6e0868 commit 490ffaa

17 files changed

Lines changed: 430 additions & 200 deletions
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# GitHub Actions workflow to build and publish to PyPI
2+
name: Build and Publish Python 🐍📦
3+
4+
on:
5+
release:
6+
types:
7+
- published
8+
9+
jobs:
10+
build-and-publish:
11+
runs-on: ubuntu-latest
12+
steps:
13+
- name: Checkout code
14+
uses: actions/checkout@v4
15+
16+
- name: Set up Python
17+
uses: actions/setup-python@v5
18+
with:
19+
python-version: '3.x'
20+
21+
- name: Install build dependencies
22+
run: |
23+
python -m pip install --upgrade pip
24+
pip install build twine
25+
26+
- name: Build package
27+
run: |
28+
python -m build
29+
30+
- name: Publish to PyPI
31+
env:
32+
TWINE_USERNAME: __token__
33+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
34+
run: |
35+
twine upload dist/*
36+
37+
- name: Publish to TestPyPI (optional)
38+
if: github.event.release.target_commitish == 'main'  # release events run on a tag ref, so a branch test on github.ref_type/ref_name never matches
continue-on-error: true  # optional mirror upload: a TestPyPI failure must not fail the release
39+
env:
40+
TWINE_USERNAME: __token__
41+
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }}
42+
run: |
43+
twine upload --repository testpypi dist/*

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ __pycache__
99
.cursorignore
1010
/test.py
1111
.aider*
12+
*.egg-info
13+
.pytest_cache
14+
.ruff_cache

Dockerfile

Lines changed: 59 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,84 @@
1-
# Use Dockerfile syntax version 1.5 for compatibility and new features
2-
# syntax=docker/dockerfile:1.5
1+
# syntax=docker/dockerfile:1
32

4-
FROM python:3.13 AS builder
3+
# ==============================================================================
4+
# Base Stage: Installs uv and creates a non-root user for security
5+
# ==============================================================================
6+
FROM python:3.13-slim AS base
57

6-
# Set non-interactive mode
7-
ENV DEBIAN_FRONTEND=noninteractive
8+
ENV UV_COMPILE_BYTECODE=1
9+
ENV UV_LINK_MODE=copy
810

9-
# Prevent docker from cleaning up the apt cache
10-
RUN rm -f /etc/apt/apt.conf.d/docker-clean
11+
# Install uv from a pinned release line instead of the floating `latest` tag so rebuilds are reproducible
12+
COPY --from=ghcr.io/astral-sh/uv:0.7 /uv /usr/local/bin/uv
1113

12-
# Define ARG for platform-specific cache separation
14+
# Create a non-root user and group for enhanced security
15+
RUN groupadd --system --gid 1001 app && \
16+
useradd --system --uid 1001 --gid 1001 -m app
17+
18+
# ==============================================================================
19+
# Builder Stage: Install system and Python dependencies with optimized caching
20+
# ==============================================================================
21+
FROM base AS builder
22+
23+
# Argument for multi-platform builds
1324
ARG TARGETPLATFORM
1425

15-
# Update and install dependencies with cache separated by architecture
26+
# Build argument that receives the application version from the host
27+
ARG APP_VERSION=0.0.0
28+
29+
# Install build-time system dependencies using BuildKit cache mounts
1630
RUN --mount=type=cache,target=/var/cache/apt,id=apt-cache-${TARGETPLATFORM} \
17-
--mount=type=cache,target=/var/lib/apt,id=apt-lib-${TARGETPLATFORM} \
1831
apt-get update && \
19-
apt-get install --no-install-recommends -y libxml2-dev libxslt-dev
32+
apt-get install -y --no-install-recommends \
33+
libxml2-dev \
34+
libxslt-dev
2035

2136
WORKDIR /app
2237

23-
COPY requirements.txt .
38+
# Give the non-root user ownership of /app and the uv cache directory before switching to that user
39+
RUN --mount=type=cache,target=/home/app/.cache/uv,uid=1001,gid=1001 \
40+
chown -R app:app /app /home/app/.cache/uv
2441

25-
# Use pip cache to speed up builds
26-
RUN --mount=type=cache,target=/root/.cache/pip \
27-
pip install -r requirements.txt -t packages
42+
USER app
2843

29-
# Start from a slim Python 3.12 image for a small final image size
30-
FROM python:3.13-slim AS final
44+
# Copy source code BEFORE installing dependencies, as setuptools-scm needs it
45+
COPY --chown=app:app . .
3146

32-
# Set non-interactive mode
33-
ENV DEBIAN_FRONTEND=noninteractive
47+
# Pass the version to setuptools-scm via an environment variable
48+
# This tells setuptools-scm to use this version string instead of looking for .git
49+
RUN --mount=type=cache,target=/home/app/.cache/uv,uid=1001,gid=1001 \
50+
SETUPTOOLS_SCM_PRETEND_VERSION=${APP_VERSION} \
51+
uv sync
3452

35-
# Prevent docker from cleaning up the apt cache in the final image
36-
RUN rm -f /etc/apt/apt.conf.d/docker-clean
53+
# ==============================================================================
54+
# Final Stage: Assemble the lean production image
55+
# ==============================================================================
56+
FROM base AS final
3757

38-
ARG TARGETPLATFORM
58+
WORKDIR /app
3959

40-
# Copy built packages from the previous stage
41-
COPY --from=builder /app/packages /app/packages
60+
# Activate the virtual environment by adding it to the PATH
61+
ENV PATH="/app/.venv/bin:$PATH"
4262

43-
# Update and install runtime dependencies if necessary, with cache separated by architecture
63+
# Install only essential runtime system dependencies
64+
ARG TARGETPLATFORM
4465
RUN --mount=type=cache,target=/var/cache/apt,id=apt-cache-${TARGETPLATFORM} \
45-
--mount=type=cache,target=/var/lib/apt,id=apt-lib-${TARGETPLATFORM} \
4666
apt-get update && \
47-
apt-get full-upgrade -y && \
48-
apt-get install -y --no-install-recommends libxml2 libxslt1.1 libtk8.6
67+
apt-get install -y --no-install-recommends \
68+
libxml2 \
69+
libxslt1.1 \
70+
libtk8.6 && \
# The cache mount only covers /var/cache/apt; remove the package index lists in this same layer so they are not shipped in the runtime image
rm -rf /var/lib/apt/lists/*
4971

50-
WORKDIR /app
72+
# Copy the virtual environment and source code from the builder stage
73+
# The source code is already in the builder stage, no need for another COPY . .
74+
COPY --from=builder --chown=app:app /app /app
5175

52-
ENV PYTHONPATH=/app/packages:$PYTHONPATH
76+
# Create the application cache directory as root BEFORE switching to the non-root user
77+
RUN mkdir -p /home/app/.cache/crawler-to-md && chown -R app:app /home/app/.cache/crawler-to-md
5378

54-
# Copy the rest of the application's source code into the working directory
55-
COPY . .
79+
# Switch to the non-root user for execution
80+
USER app
5681

57-
VOLUME [ "/app/cache"]
82+
VOLUME [ "/home/app/.cache/crawler-to-md" ]
5883

59-
ENTRYPOINT [ "python", "main.py" ]
84+
ENTRYPOINT [ "/app/.venv/bin/crawler-to-md" ]

README.md

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,36 @@
11
# Web Scraper to Markdown 🌐✍️
22

3-
This Python-based web scraper fetches content from URLs and exports it into Markdown and JSON formats, specifically designed for simplicity, extensibility, and for uploading JSON files to GPT models. Ideal for those looking to leverage web content for AI training or analysis. 🤖💡
3+
This Python-based web scraper fetches content from URLs and exports it into Markdown and JSON formats, specifically designed for simplicity, extensibility, and for uploading JSON files to GPT models. It is ideal for those looking to leverage web content for AI training or analysis. 🤖💡
44

55
## 🚀 Quick Start
66

77
(Or even better, **[use Docker!](#-docker-support) 🐳**)
88

9+
### Recommended installation using pipx (isolated environment)
10+
11+
```shell
12+
pipx install crawler-to-md
13+
```
14+
15+
### Alternatively, install with pip
16+
917
```shell
10-
git clone https://github.com/obeone/crawler-to-md.git
11-
cd crawler-to-md
12-
pip install -r requirements.txt
18+
pip install crawler-to-md
19+
```
20+
21+
Then run the scraper:
1322

14-
python main.py --url https://www.example.com
23+
```shell
24+
crawler-to-md --url https://www.example.com
1525
```
1626

1727
## 🌟 Features
1828

1929
- Scrapes web pages for content and metadata. 📄
2030
- Filters links by base URL. 🔍
2131
- Excludes URLs containing certain strings. ❌
22-
- Automatically find links or can use a file of URLs to scrape. 🔗
23-
- Rate limiting and delay 🕘
32+
- Finds links automatically, or scrapes from a provided file of URLs. 🔗
33+
- Rate limiting and delay support. 🕘
2434
- Exports data to Markdown and JSON, ready for GPT uploads. 📤
2535
- Exports each page as an individual Markdown file if `--export-individual` is used. 📝
2636
- Uses SQLite for efficient data management. 📊
@@ -29,21 +39,20 @@ python main.py --url https://www.example.com
2939

3040
## 📋 Requirements
3141

32-
Python 3.12 and the following packages:
42+
Python 3.9 or higher is required.
3343

34-
- `requests`
35-
- `beautifulsoup4`
36-
- `trafilatura`
37-
- `coloredlogs`
44+
Project dependencies are managed with `pyproject.toml`. Install them with:
3845

39-
Install with `pip install -r requirements.txt`.
46+
```shell
47+
pip install .
48+
```
4049

4150
## 🛠 Usage
4251

4352
Start scraping with the following command:
4453

4554
```shell
46-
python main.py --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
55+
crawler-to-md --url <URL> [--output-folder ./output] [--cache-folder ./cache] [--base-url <BASE_URL>] [--exclude <KEYWORD_IN_URL>] [--title <TITLE>] [--urls-file <URLS_FILE>]
4756
```
4857

4958
Options:
@@ -59,11 +68,11 @@ Options:
5968
- `--rate-limit`, `-rl`: Maximum number of requests per minute (default: 0, no rate limit). ⏱️
6069
- `--delay`, `-d`: Delay between requests in seconds (default: 0, no delay). 🕒
6170

62-
One of the `--url` or `--urls-file` is required.
71+
One of the `--url` or `--urls-file` options is required.
6372

6473
### 📚 Log level
6574

66-
By default, `WARN` level is used. You can change it with the `LOG_LEVEL` environment variable.
75+
By default, the `WARN` level is used. You can change it with the `LOG_LEVEL` environment variable.
6776

6877
## 🐳 Docker Support
6978

0 commit comments

Comments
 (0)