Skip to content

Commit 3ff33bb

Browse files
authored
2024.0.0 (#78)
* Move schemas around for easier packaging * Various conf changes * Fix drill/clickhouse * Add baseball.computer note
1 parent 0df9d3f commit 3ff33bb

File tree

19 files changed

+263
-180
lines changed

19 files changed

+263
-180
lines changed

.env

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
CHADWICK_VERSION=v0.9.5
2-
BASEBALLDATABANK_VERSION=ccb3cef05e68f0085db4ada6d4a9ebab9435b452
3-
RETROSHEET_VERSION=48334a58f7446d59746d81aa73c3e9fa9b2676e9
1+
RETROSHEET_VERSION=8449632be02cdf743932600f3218d77e059d5c91
2+
CHADWICK_VERSION=aff8d779500da16521542e084c35cc3e159fd536
3+
BASEBALLDATABANK_VERSION=28169eaf9007200d7f51160713c647eac64f9aa8
44

55
EXTRACT_DIR=extract
66
REPO=doublewick/boxball
7-
VERSION=2023.0.0
7+
VERSION=2024.0.0
8+
BUILD_ENV=prod

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
<br>
1919
</p>
2020

21+
**Update**: I have released a new project, [baseball.computer](https://baseball.computer), which is designed
22+
as the successor to boxball. It is much easier to use (no Docker required, runs entirely in your browser/program)
23+
and includes many more tables, features, and quality controls. The event schema is different, which will be the main pain point in
24+
migration. _I aim to continue Boxball maintenance and updates as long as people are still using it,_ and I may try to rebase
25+
boxball on top of the new project to make maintaining both easier. Please let me know if there are things you can do in Boxball that you can't do yet in baseball.computer by filing an issue on the [repo](https://github.com/droher/baseball.computer) or reaching me at david.roher@baseball.computer.
26+
2127
## Introduction
2228
**Boxball** creates prepopulated databases of the two most significant open source baseball datasets:
2329
[Retrosheet](http://retrosheet.org) and the [Baseball Databank](https://github.com/chadwickbureau/baseballdatabank).

docker-compose.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@ x-clickhouse:
5858
x-drill:
5959
&drill
6060
build:
61-
context: load/drill
62-
dockerfile: ../Dockerfile
61+
context: load
6362
target: drill
6463
platforms:
6564
- "linux/amd64"
@@ -126,8 +125,7 @@ x-mysql:
126125
x-sqlite:
127126
&sqlite
128127
build:
129-
context: load/sqlite
130-
dockerfile: ../Dockerfile
128+
context: load
131129
target: sqlite
132130
platforms:
133131
- "linux/amd64"

extract/Dockerfile

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ ARG BUILD_ENV
22
ARG RETROSHEET_IMAGE=get-retrosheet-${BUILD_ENV}
33
ARG BASEBALLDATABANK_IMAGE=get-baseballdatabank-${BUILD_ENV}
44

5-
FROM python:3.11-alpine3.17 AS build-common
5+
FROM python:3.11-alpine3.19 AS build-common
66
RUN apk add --no-cache \
77
parallel \
88
libtool \
@@ -22,14 +22,15 @@ ENV PYTHONPATH="/"
2222
# `prod` gets the full datasets, while `test` provides fixtures with small sample data for each file
2323
FROM build-common as get-retrosheet-prod
2424
ARG RETROSHEET_VERSION
25-
RUN wget https://github.com/droher/retrosheet/archive/${RETROSHEET_VERSION}.zip -O retrosheet.zip
25+
RUN wget https://github.com/droher/retrosheet-mirror/archive/${RETROSHEET_VERSION}.zip -O retrosheet.zip
2626

2727
FROM build-common as get-retrosheet-test
2828
COPY fixtures/raw/retrosheet.zip .
2929

3030
FROM build-common as get-baseballdatabank-prod
3131
ARG BASEBALLDATABANK_VERSION
32-
RUN wget https://github.com/chadwickbureau/baseballdatabank/archive/${BASEBALLDATABANK_VERSION}.zip -O baseballdatabank.zip
32+
# Temporarily grab from old fork until 2023 data appears
33+
RUN wget https://github.com/tom-719/baseballdatabank/archive/${BASEBALLDATABANK_VERSION}.zip -O baseballdatabank.zip
3334

3435
FROM build-common as get-baseballdatabank-test
3536
COPY fixtures/raw/baseballdatabank.zip .
@@ -71,7 +72,7 @@ RUN python -u /parsers/baseballdatabank.py
7172

7273

7374
# Use a skinny build for deployment
74-
FROM alpine:3.9.3
75+
FROM alpine:3.19.0
7576
RUN apk add zstd
7677
WORKDIR /extract
7778
COPY --from=extract-baseballdatabank /parsed ./baseballdatabank

extract/parsers/retrosheet.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import sys
44
from functools import lru_cache
55
from pathlib import Path
6+
import shutil
67

78
import fileinput
89
from typing import Callable, Set
@@ -15,8 +16,8 @@
1516
RETROSHEET_PATH = Path("retrosheet")
1617
CODE_TABLES_PATH = Path("code_tables")
1718

18-
RETROSHEET_SUBDIRS = "gamelog", "schedule", "misc", "rosters", "event"
19-
EVENT_FOLDERS = "asg", "post", "regular"
19+
RETROSHEET_SUBDIRS = "gamelogs", "schedules", "rosters"
20+
EVENT_FOLDERS = "allstar", "postseason", "events"
2021

2122
PARSE_FUNCS = {
2223
"daily": "cwdaily -q -y {year} {year}*",
@@ -112,47 +113,61 @@ def concat_files(input_path: Path, output_file: Path, glob: str = "*",
112113
prepend_filename: bool = False,
113114
strip_header: bool = False,
114115
check_dupes: bool = True):
115-
files = (f for f in input_path.glob(glob) if f.is_file())
116+
files = [f for f in input_path.glob(glob) if f.is_file()]
117+
if not files:
118+
raise ValueError(f"No files found under {input_path} with glob {glob}")
116119
with open(output_file, 'wt') as fout, fileinput.input(files) as fin:
117120
lines = set()
118121
for line in fin:
122+
year = Path(fin.filename()).stem[-4:]
119123
# Remove DOS EOF character (CRTL+Z)
120124
new_line = line.strip(DOS_EOF)
125+
original_line = new_line
121126
if not new_line or new_line.isspace():
122127
continue
123128
if fin.isfirstline() and strip_header:
124129
continue
125130
if prepend_filename:
126-
year = Path(fin.filename()).stem[-4:]
127-
new_line = "{},{}".format(year, new_line)
131+
new_line = f"{year},{new_line}"
128132
if new_line in lines:
129-
print("Duplicate row in {}: {}".format(fin.filename(), new_line), file=sys.stderr)
133+
print(f"Duplicate row in {fin.filename()}: {original_line.strip()}")
134+
continue
135+
# TODO: Fix NLB roster file shape in raw data
136+
if "roster" in output_file.name and len(new_line.split(",")) == 7:
137+
print(f"Fixing row in file {fin.filename()} with missing data: " + original_line.strip())
138+
new_line = new_line.strip() + ","
139+
elif "roster" in output_file.name and len(new_line.split(",")) < 7:
140+
print(f"Skipping row in file {fin.filename()} with missing data: " + original_line.strip())
130141
continue
131142
if check_dupes:
132143
lines.add(new_line)
133-
fout.write(new_line)
134-
return compress(output_file, OUTPUT_PATH)
144+
fout.write(new_line.strip() + "\n")
145+
return compress(output_file, OUTPUT_PATH)
135146

136147
retrosheet_base = Path(RETROSHEET_PATH)
137148
output_base = Path(OUTPUT_PATH)
138149
output_base.mkdir(exist_ok=True)
139150
subdirs = {subdir: retrosheet_base / subdir for subdir in RETROSHEET_SUBDIRS}
140151

141152
print("Writing simple files...")
142-
concat_files(subdirs["gamelog"], output_base / "gamelog.csv", glob="*.TXT", check_dupes=False)
143-
concat_files(subdirs["schedule"], output_base / "schedule.csv", glob="*.TXT")
144-
concat_files(subdirs["misc"], output_base / "park.csv", glob="parkcode.txt", strip_header=True)
153+
concat_files(subdirs["gamelogs"], output_base / "gamelog.csv", glob="gl*.txt", check_dupes=False)
154+
# TODO: Figure out how to integrate 2020-orig (leave out for now)
155+
concat_files(subdirs["schedules"], output_base / "schedule.csv", glob="*schedule.csv", strip_header=True)
156+
concat_files(retrosheet_base, output_base / "park.csv", glob="ballparks.csv", strip_header=True)
157+
concat_files(retrosheet_base, output_base / "bio.csv", glob="biofile.csv", strip_header=True)
145158
concat_files(subdirs["rosters"], output_base / "roster.csv", glob="*.ROS", prepend_filename=True)
146159

147160
@staticmethod
148161
def parse_event_types(use_parallel=True) -> None:
149162
def parse_events(output_type: str, clean_func: Callable = None):
150-
event_base = RETROSHEET_PATH / "event"
163+
event_base = RETROSHEET_PATH
151164
output_file = OUTPUT_PATH.joinpath(output_type).with_suffix(".csv")
152165
command_template = PARSE_FUNCS[output_type]
153166
f_out_inflated = open(output_file, 'w')
154167
for folder in EVENT_FOLDERS:
155-
print(output_type, folder)
168+
# Copy (not move) all teamfiles to each subdir
169+
for teamfile in event_base.glob("teams/TEAM*"):
170+
shutil.copy(teamfile, event_base.joinpath(folder))
156171
data_path = event_base.joinpath(folder)
157172
years = {re.match("[0-9]{4}", f.stem)[0] for f in data_path.iterdir()
158173
if re.match("[0-9]{4}", f.stem)}

extract/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
pyhumps==1.6.1
2-
zstandard==0.15.2
2+
zstandard==0.22.0

load/Dockerfile

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
ARG VERSION
22
FROM doublewick/boxball:ddl-${VERSION} as ddl
3+
FROM doublewick/boxball:csv-${VERSION} as csv
4+
FROM doublewick/boxball:parquet-${VERSION} as parquet
35

4-
FROM yandex/clickhouse-server:22.9.7.34 as clickhouse
6+
FROM clickhouse/clickhouse-server:23.11.2.11 as clickhouse
57
COPY z_load.sh /docker-entrypoint-initdb.d/
68
COPY --chown=clickhouse:clickhouse --from=ddl /ddl/clickhouse.sql /docker-entrypoint-initdb.d/
79
COPY --chown=clickhouse:clickhouse --from=parquet /transform/parquet /data
810

911
FROM drill/apache-drill:1.17.0 as drill
1012
COPY --from=parquet /transform/parquet /data
1113

12-
FROM mysql:8.0.31-debian as mysql
14+
FROM mysql:8.0.35-debian as mysql
1315
ENV MYSQL_ALLOW_EMPTY_PASSWORD=yes
1416
COPY my.cnf /etc/mysql/conf.d/
1517
COPY A_unzip_csvs.sh z_remove_csvs.sh /docker-entrypoint-initdb.d/
@@ -19,15 +21,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \
1921
COPY --chown=mysql:mysql --from=ddl /ddl/mysql.sql /docker-entrypoint-initdb.d/
2022
COPY --chown=mysql:mysql --from=csv /transform/csv /data
2123

22-
FROM postgres:15.1 as postgres
24+
FROM postgres:16.1-bookworm as postgres
2325
RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \
2426
apt-get clean && \
2527
rm -rf /var/lib/apt/lists/*
2628
COPY A_build_conf.sql z_run_conf.sql /docker-entrypoint-initdb.d/
2729
COPY --chown=postgres:postgres --from=ddl /ddl/postgres.sql /docker-entrypoint-initdb.d/
2830
COPY --chown=postgres:postgres --from=csv /transform/csv /data
2931

30-
FROM postgres:13.2 as postgres-cstore-fdw-build
32+
FROM postgres:13.13-bookworm as postgres-cstore-fdw-build
3133
RUN apt-get update && apt-get install -y --no-install-recommends postgresql-server-dev-13 build-essential zstd libprotobuf-c-dev protobuf-c-compiler wget ca-certificates unzip make gcc libpq-dev && \
3234
apt-get clean && \
3335
rm -rf /var/lib/apt/lists/*
@@ -45,7 +47,7 @@ RUN cat /docker-entrypoint-initdb.d/postgres_cstore_fdw.sql
4547

4648
FROM postgres-cstore-fdw-build as postgres-cstore-fdw
4749

48-
FROM alpine:3.17 as sqlite-build
50+
FROM alpine:3.19.0 as sqlite-build
4951
RUN apk add --no-cache \
5052
zstd \
5153
sqlite
@@ -60,10 +62,10 @@ RUN echo "Decompressing fies..." && \
6062
zstd --rm boxball.db
6163

6264

63-
FROM python:3.11-alpine3.17 AS sqlite
65+
FROM python:3.11-alpine3.19 AS sqlite
6466
RUN apk add --no-cache \
6567
zstd \
6668
sqlite
6769
RUN pip install sqlite-web==0.4.1
68-
COPY --from=build boxball.db.zst /tmp/
70+
COPY --from=sqlite-build boxball.db.zst /tmp/
6971
ENTRYPOINT zstd --rm -d /tmp/boxball.db.zst -fo /db/boxball.db && sqlite_web -H 0.0.0.0 -x /db/boxball.db

load/postgres_cstore_fdw/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ ARG VERSION
22
FROM doublewick/boxball:ddl-${VERSION} as ddl
33
FROM doublewick/boxball:csv-${VERSION} as csv
44

5-
FROM postgres:13.2 as build
5+
FROM postgres:13.13-bookworm as build
66
RUN apt-get update && apt-get install -y --no-install-recommends postgresql-server-dev-13 build-essential zstd libprotobuf-c-dev protobuf-c-compiler wget ca-certificates unzip make gcc libpq-dev && \
77
apt-get clean && \
88
rm -rf /var/lib/apt/lists/*

tests/test_transform.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from pathlib import Path
33

44
from src import OUTPUT_PATH
5-
from src.schemas import retrosheet_metadata, baseballdatabank_metadata, all_metadata
5+
from src.boxball_schemas import retrosheet_metadata, baseballdatabank_metadata, all_metadata
66
from src.ddl_factories import all_factories
77
from src.parquet import write_files, PARQUET_PREFIX
88

transform/csv.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33
ARG VERSION
44
FROM doublewick/boxball:extract-${VERSION} as extract
55

6-
FROM alpine:3.9.3
6+
FROM alpine:3.19.0
77
COPY --from=extract /extract /transform/csv

0 commit comments

Comments
 (0)