Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions linkedin_scraper/core/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,22 +259,40 @@ async def is_logged_in(page: Page) -> bool:
auth_blockers = ['/login', '/authwall', '/checkpoint', '/challenge', '/uas/login', '/uas/consumer-email-challenge']
if any(pattern in current_url for pattern in auth_blockers):
return False

# Step 2: Selector check (PRIMARY) - check for nav elements

# Step 2: Require LinkedIn auth cookie to avoid false positives on
# public profile pages that still render nav-like elements.
cookies = await page.context.cookies(["https://www.linkedin.com"])
has_auth_cookie = any(
cookie.get("name") == "li_at" and bool(cookie.get("value"))
for cookie in cookies
)
if not has_auth_cookie:
return False

# Step 3: Selector check (PRIMARY) - check for logged-in nav elements
old_selectors = '.global-nav__primary-link, [data-control-name="nav.settings"]'
old_count = await page.locator(old_selectors).count()

new_selectors = 'nav a[href*="/feed"], nav button:has-text("Home"), nav a[href*="/mynetwork"]'
new_count = await page.locator(new_selectors).count()

has_nav_elements = old_count > 0 or new_count > 0

# Step 3: URL fallback - check for authenticated-only pages

# Step 4: Fail-safe against guest-mode UI
guest_selectors = (
'a[href*="/signup"], a[href*="/login"], '
'button:has-text("Sign in"), button:has-text("Join now")'
)
has_guest_cta = await page.locator(guest_selectors).count() > 0

# Step 5: URL fallback - check for authenticated-only pages
authenticated_only_pages = ['/feed', '/mynetwork', '/messaging', '/notifications']
is_authenticated_page = any(pattern in current_url for pattern in authenticated_only_pages)

# Return True if either nav elements found or on authenticated page
return has_nav_elements or is_authenticated_page

# Return True only when auth cookie exists and page does not look like
# guest mode.
return (has_nav_elements or is_authenticated_page) and not has_guest_cta
except Exception:
return False

Expand Down
66 changes: 56 additions & 10 deletions linkedin_scraper/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import asyncio
import functools
import logging
import os
from typing import Any, Callable, Optional, TypeVar, cast
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError

Expand Down Expand Up @@ -64,11 +65,27 @@ async def detect_rate_limit(page: Page) -> None:
Raises:
RateLimitError: If rate limiting is detected
"""
# Allow disabling detection when debugging false positives.
if os.getenv("LINKEDIN_SCRAPER_DISABLE_RATE_LIMIT_CHECK", "").lower() in {
"1",
"true",
"yes",
}:
return

# Check for common rate limit indicators

# Check URL for security challenges
current_url = page.url
if 'linkedin.com/checkpoint' in current_url or 'authwall' in current_url:
if any(
marker in current_url
for marker in (
"linkedin.com/checkpoint",
"authwall",
"/challenge/",
"captcha",
)
):
raise RateLimitError(
"LinkedIn security checkpoint detected. "
"You may need to verify your identity or wait before continuing.",
Expand All @@ -77,26 +94,55 @@ async def detect_rate_limit(page: Page) -> None:

# Check for CAPTCHA
try:
captcha = await page.locator('iframe[title*="captcha" i], iframe[src*="captcha" i]').count()
captcha = await page.locator(
'iframe[title*="captcha" i], iframe[src*="captcha" i]'
).count()
if captcha > 0:
raise RateLimitError(
"CAPTCHA challenge detected. Manual intervention required.",
suggested_wait_time=3600
suggested_wait_time=3600,
)
except Exception:
except PlaywrightTimeoutError:
pass

# Check for rate limit messages
try:
title_text = (await page.title()).lower()
body_text = await page.locator('body').text_content(timeout=1000)
if body_text:
body_lower = body_text.lower()
if any(phrase in body_lower for phrase in [
'too many requests',
'rate limit',
'slow down',
'try again later'
]):
strong_indicators = [
"looks like you may be using automation tools",
"we've restricted your account",
"temporarily restricted",
"security verification",
"verify your identity",
"unusual activity",
"complete this security check",
]
soft_indicators = [
"too many requests",
"rate limit",
"slow down",
"try again later",
]

strong_hit = any(
phrase in body_lower or phrase in title_text
for phrase in strong_indicators
)
soft_hit = any(
phrase in body_lower or phrase in title_text
for phrase in soft_indicators
)
has_linkedin_security_context = any(
phrase in body_lower
for phrase in ["linkedin", "security", "verification", "captcha"]
)

# Only treat soft phrases as rate limits when page context also
# points to LinkedIn security/throttling behavior.
if strong_hit or (soft_hit and has_linkedin_security_context):
raise RateLimitError(
"Rate limit message detected on page.",
suggested_wait_time=1800 # 30 minutes
Expand Down