Skip to content

Commit 621fef3

Browse files
author
UnexpectedFisting
committed
fix: skip non-posted payments (pending/scheduled/processing) in allocation extraction
Generalizes the cancelled-date skip to detect all non-posted payment statuses (pending, scheduled, processing, cancelled) so the scraper no longer tries to click into non-clickable rows. Single-row parse failures are now non-fatal (warn + continue) instead of aborting the entire run. If ALL rows fail, a RuntimeError is still raised as a safety net.
Closes #10
Made-with: Cursor
1 parent 775f972 commit 621fef3

2 files changed

Lines changed: 114 additions & 36 deletions

File tree

src/studentaid_monarch_sync/portal/client.py

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1852,13 +1852,13 @@ def _extract_payment_allocations(
18521852
# Best-effort: switch the history filter from "Last 12 Months" to "All" so older payments
18531853
# are visible when scanning. If this fails, we proceed with the default selection.
18541854
self._try_select_payment_activity_show_all(page)
1855-
cancelled_payment_dates: set[date] = set()
1855+
non_posted_dates: dict[date, str] = {}
18561856
try:
1857-
cancelled_payment_dates = self._cancelled_payment_dates_from_payment_activity_text(page.inner_text("body"))
1857+
non_posted_dates = self._non_posted_payment_dates_from_payment_activity_text(page.inner_text("body"))
18581858
except Exception:
1859-
cancelled_payment_dates = set()
1860-
if cancelled_payment_dates:
1861-
logger.info("Detected %d cancelled payment entries; skipping them.", len(cancelled_payment_dates))
1859+
non_posted_dates = {}
1860+
if non_posted_dates:
1861+
logger.info("Detected %d non-posted payment entries; skipping them.", len(non_posted_dates))
18621862

18631863
# Primary strategy: click the Payment Date links in the history table (they are the most stable entry point).
18641864
# These appear as links like "11/26/2025".
@@ -1916,8 +1916,12 @@ def _collect_date_texts() -> list[str]:
19161916
)
19171917
break
19181918

1919-
if payment_dt in cancelled_payment_dates:
1920-
logger.info("Skipping cancelled payment entry dated %s.", payment_dt.isoformat())
1919+
if payment_dt in non_posted_dates:
1920+
logger.info(
1921+
"Skipping non-posted payment entry dated %s (status=%s).",
1922+
payment_dt.isoformat(),
1923+
non_posted_dates[payment_dt],
1924+
)
19211925
continue
19221926

19231927
if opened >= max_payments_to_scan:
@@ -1951,11 +1955,19 @@ def _collect_date_texts() -> list[str]:
19511955
)
19521956
except Exception:
19531957
self._save_debug(page, debug_dir=debug_dir, name_prefix=f"payment_detail_{idx}_error")
1954-
raise
1958+
logger.warning(
1959+
"Failed to parse payment detail for date %s (idx=%d); skipping row.",
1960+
dt_str, idx, exc_info=True,
1961+
)
19551962
finally:
19561963
# Return to Payment Activity list without relying on browser history.
19571964
self._close_payment_detail(page)
19581965

1966+
if not allocations and opened > 0:
1967+
raise RuntimeError(
1968+
f"All {opened} payment detail rows failed to parse; aborting. "
1969+
"Check debug artifacts (payment_detail_*_error) for details."
1970+
)
19591971
return allocations
19601972

19611973
# Gather clickable \"View/Details\" elements.
@@ -2047,53 +2059,59 @@ def _try_select_payment_activity_show_all(self, page: Page) -> None:
20472059
except Exception:
20482060
pass
20492061

2050-
def _cancelled_payment_dates_from_payment_activity_text(self, body_text: str) -> set[date]:
2062+
_NON_POSTED_STATUS_RE = re.compile(
2063+
r"\b(cancel+l?ed|pending|scheduled|processing)\b", re.I
2064+
)
2065+
2066+
def _non_posted_payment_dates_from_payment_activity_text(
2067+
self, body_text: str
2068+
) -> dict[date, str]:
20512069
"""
2052-
Parse the Payment Activity list view and find payment dates whose status is "Cancelled".
2070+
Parse the Payment Activity list view and find payment dates whose status is non-posted
2071+
(cancelled, pending, scheduled, processing).
20532072
2054-
This is used to avoid clicking into cancelled entries, since we only want posted/successful
2055-
payment allocations.
2073+
Returns a dict mapping each non-posted date to the matched status keyword so callers
2074+
can log which status caused the skip.
20562075
"""
20572076
lines = [ln.strip() for ln in (body_text or "").splitlines() if ln.strip()]
20582077

20592078
date_start_re = re.compile(r"^(\d{1,2}/\d{1,2}/\d{4})\b")
2060-
cancelled_word_re = re.compile(r"\bcancel+l?ed\b", re.I)
20612079

2062-
def _block_is_cancelled(block_lines: list[str]) -> bool:
2063-
# Most commonly the status is its own line ("Cancelled"), but some layouts may inline it.
2064-
for ln in block_lines:
2065-
if re.fullmatch(r"cancel+l?ed", ln, re.I):
2066-
return True
2080+
def _non_posted_status(block_lines: list[str]) -> Optional[str]:
20672081
for ln in block_lines:
2068-
if cancelled_word_re.search(ln) and "$" in ln:
2069-
return True
2070-
return False
2082+
m = self._NON_POSTED_STATUS_RE.search(ln)
2083+
if m:
2084+
return m.group(1).lower()
2085+
return None
20712086

2072-
out: set[date] = set()
2087+
out: dict[date, str] = {}
20732088
current_date: Optional[str] = None
20742089
current_block: list[str] = []
20752090

20762091
for ln in lines:
20772092
m = date_start_re.match(ln)
20782093
if m:
2079-
# Finalize previous block.
2080-
if current_date and _block_is_cancelled(current_block):
2081-
try:
2082-
out.add(parse_us_date(current_date))
2083-
except Exception:
2084-
pass
2094+
if current_date:
2095+
status = _non_posted_status(current_block)
2096+
if status:
2097+
try:
2098+
out[parse_us_date(current_date)] = status
2099+
except Exception:
2100+
pass
20852101
current_date = m.group(1)
20862102
current_block = [ln]
20872103
continue
20882104

20892105
if current_date is not None:
20902106
current_block.append(ln)
20912107

2092-
if current_date and _block_is_cancelled(current_block):
2093-
try:
2094-
out.add(parse_us_date(current_date))
2095-
except Exception:
2096-
pass
2108+
if current_date:
2109+
status = _non_posted_status(current_block)
2110+
if status:
2111+
try:
2112+
out[parse_us_date(current_date)] = status
2113+
except Exception:
2114+
pass
20972115

20982116
return out
20992117

tests/test_parsing.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def test_parse_payment_allocations_parses_row_with_prefix_text_when_expected_gro
147147
assert all(a.payment_total_cents == (2571 + 1699) for a in allocs)
148148

149149

150-
def test_payment_activity_cancelled_detection_finds_cancelled_dates() -> None:
150+
def test_non_posted_detection_finds_cancelled_dates() -> None:
151151
c = _client()
152152
body = """
153153
Payment History
@@ -159,5 +159,65 @@ def test_payment_activity_cancelled_detection_finds_cancelled_dates() -> None:
159159
02/13/2025 $34,632.04 $15,606.93 $19,025.11 Electronic
160160
""".strip()
161161

162-
got = c._cancelled_payment_dates_from_payment_activity_text(body)
163-
assert got == {date(2025, 3, 1), date(2025, 2, 14)}
162+
got = c._non_posted_payment_dates_from_payment_activity_text(body)
163+
assert set(got.keys()) == {date(2025, 3, 1), date(2025, 2, 14)}
164+
assert all("cancel" in v for v in got.values())
165+
166+
167+
def test_non_posted_detection_finds_pending_dates() -> None:
168+
c = _client()
169+
body = """
170+
Payment Date Payment Amount Applied to Principal Applied to Interest Payment Type
171+
172+
01/15/2026 $500.00 $0.00 $0.00 Pending
173+
01/10/2026 $278.52 $184.12 $94.40 Electronic
174+
""".strip()
175+
176+
got = c._non_posted_payment_dates_from_payment_activity_text(body)
177+
assert date(2026, 1, 15) in got
178+
assert got[date(2026, 1, 15)] == "pending"
179+
assert date(2026, 1, 10) not in got
180+
181+
182+
def test_non_posted_detection_finds_scheduled_dates() -> None:
183+
c = _client()
184+
body = """
185+
02/01/2026 $250.00 $0.00 $0.00 Scheduled
186+
01/15/2026 $278.52 $184.12 $94.40 Electronic
187+
""".strip()
188+
189+
got = c._non_posted_payment_dates_from_payment_activity_text(body)
190+
assert date(2026, 2, 1) in got
191+
assert got[date(2026, 2, 1)] == "scheduled"
192+
assert date(2026, 1, 15) not in got
193+
194+
195+
def test_non_posted_detection_finds_processing_dates() -> None:
196+
c = _client()
197+
body = """
198+
02/15/2026 $300.00 $0.00 $0.00 Processing
199+
02/01/2026 $278.52 $184.12 $94.40 Electronic
200+
""".strip()
201+
202+
got = c._non_posted_payment_dates_from_payment_activity_text(body)
203+
assert date(2026, 2, 15) in got
204+
assert got[date(2026, 2, 15)] == "processing"
205+
assert date(2026, 2, 1) not in got
206+
207+
208+
def test_non_posted_detection_mixed_statuses() -> None:
209+
"""Multiple non-posted statuses in one body text are all detected."""
210+
c = _client()
211+
body = """
212+
03/01/2026 $100.00 $0.00 $0.00 Pending
213+
02/15/2026 $200.00 $0.00 $0.00 Processing
214+
02/01/2026 $150.00 $0.00 $0.00 Cancelled
215+
01/15/2026 $278.52 $184.12 $94.40 Electronic
216+
""".strip()
217+
218+
got = c._non_posted_payment_dates_from_payment_activity_text(body)
219+
assert len(got) == 3
220+
assert got[date(2026, 3, 1)] == "pending"
221+
assert got[date(2026, 2, 15)] == "processing"
222+
assert got[date(2026, 2, 1)] == "cancelled"
223+
assert date(2026, 1, 15) not in got

0 commit comments

Comments
 (0)