NOTE(review): line breaks and context-line indentation below were rebuilt from
a whitespace-collapsed copy of this patch -- confirm the YAML and shell context
lines against the repository before applying.  The blob id on the new-file
index line predates the reviewed script content.

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c9f3820d..817d5beb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,7 +10,7 @@ repos:
         types: [python]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: "v0.14.11"
+    rev: "v0.14.13"
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
diff --git a/util/archive_noaaport.sh b/util/archive_noaaport.sh
index e419c244..c61c6551 100644
--- a/util/archive_noaaport.sh
+++ b/util/archive_noaaport.sh
@@ -7,6 +7,9 @@ yyyymmdd=$(date --date "$1 day ago" +'%Y%m%d')
 yyyy=$(date --date "$1 day ago" +'%Y')
 mm=$(date --date "$1 day ago" +'%m')
 
+# Our pqact can't fix all the bad products
+/opt/miniconda3/envs/prod/bin/python clean_noaaport_text.py || exit 1
+
 cd /mesonet/tmp/offline/text/
 tar -czf ${yyyymmdd}.tgz ${yyyymmdd}??.txt
 rm -f ${yyyymmdd}??.txt
diff --git a/util/clean_noaaport_text.py b/util/clean_noaaport_text.py
new file mode 100644
index 00000000..7f290266
--- /dev/null
+++ b/util/clean_noaaport_text.py
@@ -0,0 +1,85 @@
+"""
+Bulky cruft creeps into the archive noaaport IDS files, so this culls those.
+
+Each hourly file is split into products at the ETX/SOH boundary line and any
+product containing a GRIB signature is dropped before the file is archived.
+"""
+
+from datetime import timedelta
+from io import BytesIO
+from pathlib import Path
+
+from pyiem.util import logger, utc
+
+LOG = logger()
+
+BASEDIR = Path("/mesonet/tmp/offline/text")
+# Line of ETX + SOH control bytes, treated as the boundary between products
+PRODUCT_BREAK = b"\003\001\r\r\n"
+# Any product containing this signature is culled
+GRIB_MAGIC = b"GRIB\x00"
+# The original file is only replaced when at least this many bytes survive
+MIN_GOOD_BYTES = 10_000_000
+
+
+def iter_products(fin):
+    """Yield the bytes of each product found in the binary file handle.
+
+    A product starts at every PRODUCT_BREAK line and that line is kept as the
+    head of the product it starts.  Whatever precedes the first break and the
+    final product after the last break are yielded as well.
+    """
+    bio = BytesIO()
+    for line in fin:
+        if line == PRODUCT_BREAK:
+            yield bio.getvalue()
+            bio = BytesIO()
+        bio.write(line)
+    yield bio.getvalue()
+
+
+def cull_file(fn, newfn):
+    """Copy fn to newfn, leaving out every product that contains GRIB_MAGIC.
+
+    Returns a (good_bytes, culled_bytes) tuple of bytes written and dropped.
+    """
+    good_bytes = 0
+    culled_bytes = 0
+    with open(fn, "rb") as fin, open(newfn, "wb") as fout:
+        for payload in iter_products(fin):
+            if GRIB_MAGIC in payload:
+                LOG.info(repr(payload[:30]))
+                culled_bytes += len(payload)
+                continue
+            fout.write(payload)
+            good_bytes += len(payload)
+    return good_bytes, culled_bytes
+
+
+def main():
+    """Runs for the previous UTC date."""
+    dt = utc().date() - timedelta(days=1)
+    for hr in range(24):
+        fn = BASEDIR / f"{dt:%Y%m%d}{hr:02d}.txt"
+        if not fn.exists():
+            LOG.warning("Missing file %s", fn)
+            continue
+        newfn = fn.with_suffix(".new")
+        good_bytes, culled_bytes = cull_file(fn, newfn)
+        LOG.info("Culled %s bytes, kept %s bytes", culled_bytes, good_bytes)
+        if good_bytes < MIN_GOOD_BYTES:
+            LOG.warning(
+                "Processing %s resulted in %s good, %s bad, skip save",
+                fn,
+                good_bytes,
+                culled_bytes,
+            )
+            # Remove the rejected copy so it does not linger next to the data
+            newfn.unlink()
+            continue
+        # Path.replace overwrites in one step, fn is never missing from disk
+        newfn.replace(fn)
+
+
+if __name__ == "__main__":
+    main()