Skip to content

Commit 763d256

Browse files
authored
Merge pull request #184 from UW-Macrostrat/stats-post
POST usage stats
2 parents cd42eac + 1e2549b commit 763d256

File tree

13 files changed

+918
-1
lines changed

13 files changed

+918
-1
lines changed

services/usage-stats/.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
.env
1+
.env

services/usage-stats/Dockerfile

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
FROM python:3.11-slim
2+
3+
ENV PYTHONDONTWRITEBYTECODE=1
4+
ENV PYTHONUNBUFFERED=1
5+
ENV POETRY_VERSION=1.8.2
6+
7+
WORKDIR /app
8+
9+
# Install system dependencies including PostgreSQL dev headers
10+
RUN apt-get update && apt-get install -y --no-install-recommends \
11+
libpq-dev gcc build-essential curl \
12+
&& curl -sSL https://install.python-poetry.org | python3 - --version $POETRY_VERSION \
13+
&& ln -s /root/.local/bin/poetry /usr/local/bin/poetry \
14+
&& apt-get remove -y curl \
15+
&& rm -rf /var/lib/apt/lists/*
16+
17+
COPY pyproject.toml poetry.lock ./
18+
19+
RUN poetry config virtualenvs.create false \
20+
&& poetry install --no-root --no-interaction --no-ansi
21+
22+
COPY . .
23+
24+
CMD ["python", "worker.py"]

services/usage-stats/Makefile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
docker-build:
2+
docker build -t usage-stats .
3+
4+
docker-run:
5+
docker run -p 8000:8000 usage-stats
6+
7+
docker:
8+
docker build -t usage-stats .
9+
docker run -p 8000:8000 usage-stats
10+
11+
worker:
12+
python3 worker.py
13+
14+
install:
15+
pip install -r requirements.txt

services/usage-stats/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Usage Stats
2+
3+
This worker parses the Matomo database and posts a subset of the visit logs to the `usage_stats` schema for **Rockd** and **Macrostrat**.
4+
5+
## Requirements
6+
7+
- Python 3.11+
8+
- Packages listed in `requirements.txt`
9+
10+
## Local installation
11+
12+
```bash
13+
make install
14+
```
15+
16+
## Running
17+
Running the worker reads data from the Matomo database and writes the parsed logs to the `usage_stats.macrostrat_stats` and `usage_stats.rockd_stats` tables in the Macrostrat database
18+
19+
You can either run the app directly using
20+
21+
```bash
22+
make worker
23+
```
24+
25+
Or via docker using
26+
27+
```bash
28+
make docker
29+
```

services/usage-stats/poetry.lock

Lines changed: 531 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[tool.poetry]
2+
name = "usage-stats"
3+
version = "0.1.0"
4+
description = ""
5+
authors = ["davidsklar99 <[email protected]>"]
6+
readme = "README.md"
7+
packages = [] # ensures it's not treated as a package
8+
package-mode = false
9+
10+
[tool.poetry.dependencies]
11+
python = ">=3.11,<4.0"
12+
requests = ">=2.32.4,<3.0.0"
13+
asyncpg = ">=0.30.0,<0.31.0"
14+
sqlalchemy = ">=2.0.41,<3.0.0"
15+
psycopg2 = ">=2.9.10,<3.0.0"
16+
asyncmy = ">=0.2.10,<0.3.0"
17+
dotenv = ">=0.9.9,<0.10.0"
18+
19+
[build-system]
20+
requires = ["poetry-core>=2.0.0,<3.0.0"]
21+
build-backend = "poetry.core.masonry.api"

services/usage-stats/sample.env

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Matomo connection
2+
MARIADB_URL=<url>
3+
4+
# Macrostrat connection
5+
DATABASE_URL=<url>

services/usage-stats/src/insert.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import asyncio
2+
import os
3+
4+
from dotenv import load_dotenv
5+
from sqlalchemy import text
6+
from sqlalchemy.ext.asyncio import create_async_engine
7+
8+
load_dotenv()
9+
10+
DATABASE_URL = os.getenv("DATABASE_URL")
11+
12+
engine = create_async_engine(DATABASE_URL, echo=True)
13+
14+
15+
async def insert(payload=None, table_name=None):
    """Insert usage-stat rows into ``usage_stats.<table_name>_stats``.

    Args:
        payload: Bind parameters for the INSERT: a mapping, or a list of
            mappings, each providing the keys ``lat``, ``lng``, ``date``,
            ``ip`` and ``matomo_id``.
        table_name: Prefix of the target table; must be ``"macrostrat"`` or
            ``"rockd"``.

    Returns:
        None. When an argument is missing or invalid, a message is printed
        and nothing is written.
    """
    # Validate before touching the database so a bad call does not check a
    # connection out of the pool for nothing.
    if not payload:
        # Covers None as well as an empty list/dict: with no parameter sets
        # there is nothing to bind to the INSERT below.
        print("No payload provided")
        return

    if table_name is None:
        print("No table name provided")
        return

    # The table name is interpolated into the SQL text (an identifier cannot
    # be sent as a bound parameter), so it must come from this fixed list.
    if table_name not in ("macrostrat", "rockd"):
        print("Invalid table name provided")
        return

    statement = text(
        f"""
        INSERT INTO usage_stats.{table_name}_stats
        (lat, lng, date, ip, matomo_id)
        VALUES (:lat, :lng, :date, :ip, :matomo_id)
        """
    )

    async with engine.connect() as conn:
        await conn.execute(statement, payload)
        # engine.connect() does not autocommit; persist the rows explicitly.
        await conn.commit()
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import asyncio
2+
import os
3+
4+
from dotenv import load_dotenv
5+
from sqlalchemy import text
6+
from sqlalchemy.ext.asyncio import create_async_engine
7+
8+
load_dotenv()
9+
10+
DATABASE_URL = os.getenv("DATABASE_URL")
11+
12+
engine = create_async_engine(DATABASE_URL, echo=True)
13+
14+
15+
async def get_last_id(table_name=None):
    """Return the highest ``matomo_id`` in ``usage_stats.<table_name>_stats``.

    Args:
        table_name: Prefix of the table to inspect; must be ``"macrostrat"``
            or ``"rockd"``.

    Returns:
        The largest stored ``matomo_id``, or ``0`` when the table has no rows.
        ``None`` when ``table_name`` is missing or not recognised (a message
        is printed in that case).
    """
    # Validate before opening a connection; nothing below can run without a
    # usable table name.
    if table_name is None:
        print("No table name provided")
        return

    # The table name is interpolated into the SQL text, so restrict it to the
    # same fixed list that insert() accepts.
    if table_name not in ("macrostrat", "rockd"):
        print("Invalid table name provided")
        return

    print("Fetching data")

    async with engine.connect() as conn:
        result = await conn.execute(
            text(f"SELECT MAX(matomo_id) FROM usage_stats.{table_name}_stats")
        )
        # MAX() always yields exactly one row; its single column is NULL
        # (None) when the table is empty.
        last_id = result.scalar()

    # Note: If you want to regenerate the table, truncate it first, then
    # this script will repopulate it automatically.

    # An empty table starts from 0.
    return last_id or 0
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import asyncio
2+
import os
3+
4+
from sqlalchemy import text
5+
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
6+
from sqlalchemy.orm import sessionmaker
7+
from src.insert import insert
8+
from src.last_id import get_last_id
9+
10+
BATCH_SIZE = 1000 # Adjust as needed
11+
12+
# Database URL format: mysql+asyncmy://user:password@host:port/database
13+
DATABASE_URL = os.getenv(
14+
"MARIADB_URL", "mysql+asyncmy://user:password@localhost:3306/database"
15+
)
16+
17+
# Create async engine
18+
engine = create_async_engine(DATABASE_URL, echo=True)
19+
20+
# Async session factory
21+
AsyncSessionLocal = sessionmaker(
22+
bind=engine, expire_on_commit=False, class_=AsyncSession
23+
)
24+
25+
26+
async def get_data(last_id):
    """Copy the next batch of Matomo visits into the Macrostrat stats table.

    Reads at most ``BATCH_SIZE`` rows from ``matomo_log_visit`` whose
    ``idvisit`` is greater than ``last_id`` and that have both a latitude and
    a longitude, then hands them to ``insert(..., "macrostrat")``. Each call
    processes a single batch.

    Args:
        last_id: Only visits with ``idvisit`` strictly greater than this value
            are read.

    Returns:
        None. When no rows match, a message is printed and nothing is
        inserted.
    """
    # ORDER BY is required for correct batching: the caller resumes from the
    # largest matomo_id already copied, so each batch must hold the lowest
    # outstanding ids. Without an explicit order, LIMIT may return any
    # matching rows and the ones skipped would never be picked up again.
    query = text(
        """
        SELECT
            location_latitude AS lat,
            location_longitude AS lng,
            visit_first_action_time AS date,
            idvisitor AS ip,
            idvisit AS matomo_id
        FROM matomo_log_visit
        WHERE
            idvisit > :last_id
            AND location_latitude IS NOT NULL
            AND location_longitude IS NOT NULL
            AND visit_first_action_time > '2025-07-02'
        ORDER BY idvisit
        LIMIT :batch_size
        """
    )

    # The rows are fully fetched inside the session, so the MariaDB session
    # can be released before the insert into the other database starts.
    async with AsyncSessionLocal() as session:
        result = await session.execute(
            query, {"last_id": last_id, "batch_size": BATCH_SIZE}
        )
        rows = result.fetchall()

    if not rows:
        print("No more rows to process.")
        return

    # NOTE(review): the value stored as "ip" is Matomo's `idvisitor` column,
    # presumably a visitor identifier rather than an IP address. Confirm that
    # its str() form is what the target table expects.
    payload = [
        {
            "lat": float(row.lat),
            "lng": float(row.lng),
            "date": row.date,
            "ip": str(row.ip),
            "matomo_id": row.matomo_id,
        }
        for row in rows
    ]

    await insert(payload, "macrostrat")
67+
68+
69+
async def get_macrostrat_data():
    """Fetch Macrostrat visits newer than the last stored id and insert them.

    Looks up the resume point with ``get_last_id("macrostrat")``, passes it to
    ``get_data`` and prints a completion message.
    """
    resume_from = await get_last_id("macrostrat")
    await get_data(resume_from)
    print("Data fetching completed.")

0 commit comments

Comments
 (0)