|
3 | 3 | import sys |
4 | 4 | from functools import lru_cache |
5 | 5 | from pathlib import Path |
| 6 | +import shutil |
6 | 7 |
|
7 | 8 | import fileinput |
8 | 9 | from typing import Callable, Set |
|
15 | 16 | RETROSHEET_PATH = Path("retrosheet") |
16 | 17 | CODE_TABLES_PATH = Path("code_tables") |
17 | 18 |
|
18 | | -RETROSHEET_SUBDIRS = "gamelog", "schedule", "misc", "rosters", "event" |
19 | | -EVENT_FOLDERS = "asg", "post", "regular" |
| 19 | +RETROSHEET_SUBDIRS = "gamelogs", "schedules", "rosters" |
| 20 | +EVENT_FOLDERS = "allstar", "postseason", "events" |
20 | 21 |
|
21 | 22 | PARSE_FUNCS = { |
22 | 23 | "daily": "cwdaily -q -y {year} {year}*", |
@@ -112,47 +113,61 @@ def concat_files(input_path: Path, output_file: Path, glob: str = "*", |
112 | 113 | prepend_filename: bool = False, |
113 | 114 | strip_header: bool = False, |
114 | 115 | check_dupes: bool = True): |
115 | | - files = (f for f in input_path.glob(glob) if f.is_file()) |
| 116 | + files = [f for f in input_path.glob(glob) if f.is_file()] |
| 117 | + if not files: |
| 118 | + raise ValueError(f"No files found under {input_path} with glob {glob}") |
116 | 119 | with open(output_file, 'wt') as fout, fileinput.input(files) as fin: |
117 | 120 | lines = set() |
118 | 121 | for line in fin: |
| 122 | + year = Path(fin.filename()).stem[-4:] |
119 | 123 | # Remove DOS EOF character (CTRL+Z) |
120 | 124 | new_line = line.strip(DOS_EOF) |
| 125 | + original_line = new_line |
121 | 126 | if not new_line or new_line.isspace(): |
122 | 127 | continue |
123 | 128 | if fin.isfirstline() and strip_header: |
124 | 129 | continue |
125 | 130 | if prepend_filename: |
126 | | - year = Path(fin.filename()).stem[-4:] |
127 | | - new_line = "{},{}".format(year, new_line) |
| 131 | + new_line = f"{year},{new_line}" |
128 | 132 | if new_line in lines: |
129 | | - print("Duplicate row in {}: {}".format(fin.filename(), new_line), file=sys.stderr) |
| 133 | + print(f"Duplicate row in {fin.filename()}: {original_line.strip()}") |
| 134 | + continue |
| 135 | + # TODO: Fix NLB roster file shape in raw data |
| 136 | + if "roster" in output_file.name and len(new_line.split(",")) == 7: |
| 137 | + print(f"Fixing row in file {fin.filename()} with missing data: " + original_line.strip()) |
| 138 | + new_line = new_line.strip() + "," |
| 139 | + elif "roster" in output_file.name and len(new_line.split(",")) < 7: |
| 140 | + print(f"Skipping row in file {fin.filename()} with missing data: " + original_line.strip()) |
130 | 141 | continue |
131 | 142 | if check_dupes: |
132 | 143 | lines.add(new_line) |
133 | | - fout.write(new_line) |
134 | | - return compress(output_file, OUTPUT_PATH) |
| 144 | + fout.write(new_line.strip() + "\n") |
| 145 | + return compress(output_file, OUTPUT_PATH) |
135 | 146 |
|
136 | 147 | retrosheet_base = Path(RETROSHEET_PATH) |
137 | 148 | output_base = Path(OUTPUT_PATH) |
138 | 149 | output_base.mkdir(exist_ok=True) |
139 | 150 | subdirs = {subdir: retrosheet_base / subdir for subdir in RETROSHEET_SUBDIRS} |
140 | 151 |
|
141 | 152 | print("Writing simple files...") |
142 | | - concat_files(subdirs["gamelog"], output_base / "gamelog.csv", glob="*.TXT", check_dupes=False) |
143 | | - concat_files(subdirs["schedule"], output_base / "schedule.csv", glob="*.TXT") |
144 | | - concat_files(subdirs["misc"], output_base / "park.csv", glob="parkcode.txt", strip_header=True) |
| 153 | + concat_files(subdirs["gamelogs"], output_base / "gamelog.csv", glob="gl*.txt", check_dupes=False) |
| 154 | + # TODO: Figure out how to integrate 2020-orig (leave out for now) |
| 155 | + concat_files(subdirs["schedules"], output_base / "schedule.csv", glob="*schedule.csv", strip_header=True) |
| 156 | + concat_files(retrosheet_base, output_base / "park.csv", glob="ballparks.csv", strip_header=True) |
| 157 | + concat_files(retrosheet_base, output_base / "bio.csv", glob="biofile.csv", strip_header=True) |
145 | 158 | concat_files(subdirs["rosters"], output_base / "roster.csv", glob="*.ROS", prepend_filename=True) |
146 | 159 |
|
147 | 160 | @staticmethod |
148 | 161 | def parse_event_types(use_parallel=True) -> None: |
149 | 162 | def parse_events(output_type: str, clean_func: Callable = None): |
150 | | - event_base = RETROSHEET_PATH / "event" |
| 163 | + event_base = RETROSHEET_PATH |
151 | 164 | output_file = OUTPUT_PATH.joinpath(output_type).with_suffix(".csv") |
152 | 165 | command_template = PARSE_FUNCS[output_type] |
153 | 166 | f_out_inflated = open(output_file, 'w') |
154 | 167 | for folder in EVENT_FOLDERS: |
155 | | - print(output_type, folder) |
| 168 | + # Copy (not move) all teamfiles to each subdir |
| 169 | + for teamfile in event_base.glob("teams/TEAM*"): |
| 170 | + shutil.copy(teamfile, event_base.joinpath(folder)) |
156 | 171 | data_path = event_base.joinpath(folder) |
157 | 172 | years = {re.match("[0-9]{4}", f.stem)[0] for f in data_path.iterdir() |
158 | 173 | if re.match("[0-9]{4}", f.stem)} |
|
0 commit comments