Skip to content

Commit 89476e6

Browse files
rluvatonedgarrmondragonpre-commit-ci[bot]
authored
fix: String to int conversion in scraping (#474)
Hey, I'm not familiar with the codebase at all, but I ran into this error, so I'm creating a PR. If someone could take it over, that would be great!

Fix for:
```
2025-10-13T12:47:24.737839Z [info] An unhandled error occurred while syncing 'dependents'
2025-10-13T12:47:24.738171Z [info] An unhandled error occurred while syncing 'repositories'
2025-10-13T12:47:24.743866Z [info] invalid literal for int() with base 10: '1,808'
2025-10-13T12:47:24.743946Z [info] Traceback (most recent call last):
2025-10-13T12:47:24.744036Z [info] File "tap-github", line 12, in <module>
2025-10-13T12:47:24.744184Z [info] sys.exit(cli())
2025-10-13T12:47:24.744372Z [info] File "site-packages/click/core.py", line 1462, in __call__
2025-10-13T12:47:24.744444Z [info] return self.main(*args, **kwargs)
2025-10-13T12:47:24.744582Z [info] File "site-packages/click/core.py", line 1383, in main
2025-10-13T12:47:24.744647Z [info] rv = self.invoke(ctx)
2025-10-13T12:47:24.744781Z [info] File "site-packages/singer_sdk/plugin_base.py", line 150, in invoke
2025-10-13T12:47:24.744844Z [info] return super().invoke(ctx)
2025-10-13T12:47:24.744969Z [info] File "site-packages/click/core.py", line 1246, in invoke
2025-10-13T12:47:24.745031Z [info] return ctx.invoke(self.callback, **ctx.params)
2025-10-13T12:47:24.745153Z [info] File "site-packages/click/core.py", line 814, in invoke
2025-10-13T12:47:24.745214Z [info] return callback(*args, **kwargs)
2025-10-13T12:47:24.745335Z [info] File "site-packages/singer_sdk/tap_base.py", line 554, in invoke
2025-10-13T12:47:24.745396Z [info] tap.sync_all()
2025-10-13T12:47:24.745456Z [info] File "site-packages/singer_sdk/tap_base.py", line 495, in sync_all
2025-10-13T12:47:24.745517Z [info] stream.sync()
2025-10-13T12:47:24.745577Z [info] File "site-packages/singer_sdk/streams/core.py", line 1354, in sync
2025-10-13T12:47:24.745638Z [info] for _ in self._sync_records(context=context):
2025-10-13T12:47:24.745757Z [info] File "site-packages/singer_sdk/streams/core.py", line 1251, in _sync_records
2025-10-13T12:47:24.745819Z [info] self._process_record(
2025-10-13T12:47:24.745881Z [info] File "site-packages/singer_sdk/streams/core.py", line 1180, in _process_record
2025-10-13T12:47:24.745941Z [info] self._sync_children(copy.copy(context))
2025-10-13T12:47:24.746001Z [info] File "site-packages/singer_sdk/streams/core.py", line 1376, in _sync_children
2025-10-13T12:47:24.746061Z [info] child_stream.sync(context=child_context)
2025-10-13T12:47:24.746120Z [info] File "site-packages/singer_sdk/streams/core.py", line 1354, in sync
2025-10-13T12:47:24.746180Z [info] for _ in self._sync_records(context=context):
2025-10-13T12:47:24.746299Z [info] File "site-packages/singer_sdk/streams/core.py", line 1229, in _sync_records
2025-10-13T12:47:24.746360Z [info] for idx, record_result in enumerate(self.get_records(current_context)):
2025-10-13T12:47:24.746480Z [info] File "site-packages/singer_sdk/streams/rest.py", line 631, in get_records
2025-10-13T12:47:24.746540Z [info] yield from self.request_records(context)
2025-10-13T12:47:24.746600Z [info] File "site-packages/singer_sdk/streams/rest.py", line 466, in request_records
2025-10-13T12:47:24.746659Z [info] first_record = next(records)
2025-10-13T12:47:24.746781Z [info] File "site-packages/tap_github/repository_streams.py", line 3189, in parse_response
2025-10-13T12:47:24.746842Z [info] yield from scrape_dependents(response, self.logger)
2025-10-13T12:47:24.746900Z [info] File "site-packages/tap_github/scraping.py", line 42, in scrape_dependents
2025-10-13T12:47:24.746960Z [info] yield from _scrape_dependents(f"https://{base_url}/{link}", logger)
2025-10-13T12:47:24.747019Z [info] File "site-packages/tap_github/scraping.py", line 61, in _scrape_dependents
2025-10-13T12:47:24.747079Z [info] int(s.next_sibling.strip())
2025-10-13T12:47:24.747140Z [info] ValueError: invalid literal for int() with base 10: '1,808'
```

---------

Co-authored-by: Edgar Ramírez Mondragón <edgarrm358@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ae8823b commit 89476e6

1 file changed

Lines changed: 8 additions & 3 deletions

File tree

tap_github/scraping.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
contributors_regex = re.compile(" {3}Contributors ")
2424

2525

26+
def parse_int(s: str) -> int:
    """Convert a human-formatted counter string into an ``int``.

    Surrounding whitespace is ignored, and every comma (thousands
    separator) and plus sign is removed before the conversion.

    Args:
        s: Counter text, for example ``"1,808"`` or ``" 5,000+ "``.

    Returns:
        The integer value, for example ``1808``.

    Raises:
        ValueError: If the cleaned-up text is not a valid base-10 integer.

    >>> parse_int("1,808")
    1808
    >>> parse_int(" 5,000+ ")
    5000
    """
    # A single translate() pass deletes all "," and "+" characters instead
    # of chaining one replace() call per character.
    return int(s.translate(str.maketrans("", "", ",+")).strip())
29+
30+
2631
def scrape_dependents(
2732
response: requests.Response, logger: logging.Logger | None = None
2833
) -> Iterable[dict[str, Any]]:
@@ -58,11 +63,11 @@ def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[dict[str, A
5863
for a in soup.select("a[data-hovercard-type=repository]")
5964
]
6065
stars = [
61-
int(s.next_sibling.strip())
66+
parse_int(s.next_sibling)
6267
for s in soup.find_all("svg", {"class": "octicon octicon-star"})
6368
]
6469
forks = [
65-
int(s.next_sibling.strip())
70+
parse_int(s.next_sibling)
6671
for s in soup.find_all("svg", {"class": "octicon octicon-repo-forked"})
6772
]
6873

@@ -111,7 +116,7 @@ def parse_counter(tag: Tag | NavigableString | None) -> int:
111116
title_string = cast("str", title)
112117
else:
113118
title_string = cast("str", title[0])
114-
return int(title_string.strip().replace(",", "").replace("+", ""))
119+
return parse_int(title_string)
115120
except (KeyError, ValueError) as e:
116121
raise IndexError(
117122
f"Could not parse counter {tag}. Maybe the GitHub page format has changed?"

0 commit comments

Comments
 (0)