Skip to content
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ jobs = scrape_jobs(
results_wanted=20,
hours_old=72,
country_indeed='USA',
rate_delay_min=1, # in seconds
rate_delay_max=2, # in seconds

# linkedin_fetch_description=True # gets more info such as description, direct job url (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],
Expand Down Expand Up @@ -118,6 +120,12 @@ Optional
|
├── ca_cert (str)
| path to CA Certificate file for proxies
|
├── rate_delay_min (int | float)
| (optional) minimum delay, in seconds, between network requests (must be specified together with rate_delay_max)
|
├── rate_delay_max (int | float)
| (optional) maximum delay, in seconds, between network requests (must be specified together with rate_delay_min)
```

```
Expand Down
6 changes: 4 additions & 2 deletions jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def scrape_jobs(
enforce_annual_salary: bool = False,
verbose: int = 0,
user_agent: str = None,
rate_delay_min: int | float | None = None,
rate_delay_max: int | float | None = None,
**kwargs,
) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -103,7 +105,7 @@ def get_site_type():

def scrape_site(site: Site) -> Tuple[str, JobResponse]:
scraper_class = SCRAPER_MAPPING[site]
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent)
scraper = scraper_class(proxies=proxies, ca_cert=ca_cert, user_agent=user_agent, rate_delay_min=rate_delay_min, rate_delay_max=rate_delay_max)
scraped_data: JobResponse = scraper.scrape(scraper_input)
cap_name = site.value.capitalize()
site_name = "ZipRecruiter" if cap_name == "Zip_recruiter" else cap_name
Expand Down Expand Up @@ -224,4 +226,4 @@ def worker(site):
# Add BDJobs to __all__
__all__ = [
"BDJobs",
]
]
6 changes: 4 additions & 2 deletions jobspy/bayt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,19 @@ class BaytScraper(Scraper):
band_delay = 3

def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
super().__init__(Site.BAYT, proxies=proxies, ca_cert=ca_cert)
self.scraper_input = None
self.session = None
self.country = "worldwide"
self.rate_delay_min = rate_delay_min
self.rate_delay_max = rate_delay_max

def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.scraper_input = scraper_input
self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True, rate_delay_min=self.rate_delay_min, rate_delay_max=self.rate_delay_max
)
job_list: list[JobPost] = []
page = 1
Expand Down
6 changes: 4 additions & 2 deletions jobspy/bdjobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class BDJobs(Scraper):
band_delay = 3

def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes BDJobsScraper with the BDJobs job search url
Expand All @@ -57,8 +57,10 @@ def __init__(
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
backoff_factor=5,
clear_cookies=True,
rate_delay_min=rate_delay_min,
rate_delay_max=rate_delay_max,
)
self.session.headers.update(headers)
self.scraper_input = None
Expand Down
6 changes: 4 additions & 2 deletions jobspy/glassdoor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

class Glassdoor(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes GlassdoorScraper with the Glassdoor job search url
Expand All @@ -48,6 +48,8 @@ def __init__(
self.scraper_input = None
self.jobs_per_page = 30
self.max_pages = 30
self.rate_delay_min = rate_delay_min
self.rate_delay_max = rate_delay_max
self.seen_urls = set()

def scrape(self, scraper_input: ScraperInput) -> JobResponse:
Expand All @@ -61,7 +63,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.base_url = self.scraper_input.country.get_glassdoor_url()

self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True
proxies=self.proxies, ca_cert=self.ca_cert, has_retry=True, rate_delay_min=self.rate_delay_min, rate_delay_max=self.rate_delay_max
)
token = self._get_csrf_token()
headers["gd-csrf-token"] = token if token else fallback_token
Expand Down
6 changes: 4 additions & 2 deletions jobspy/google/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

class Google(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes Google Scraper with the Google jobs search url
Expand All @@ -37,6 +37,8 @@ def __init__(
self.seen_urls = set()
self.url = "https://www.google.com/search"
self.jobs_url = "https://www.google.com/async/callback:550"
self.rate_delay_min = rate_delay_min
self.rate_delay_max = rate_delay_max

def scrape(self, scraper_input: ScraperInput) -> JobResponse:
"""
Expand All @@ -48,7 +50,7 @@ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
self.scraper_input.results_wanted = min(900, scraper_input.results_wanted)

self.session = create_session(
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True
proxies=self.proxies, ca_cert=self.ca_cert, is_tls=False, has_retry=True, rate_delay_min=self.rate_delay_min, rate_delay_max=self.rate_delay_max
)
forward_cursor, job_list = self._get_initial_cursor_and_jobs()
if forward_cursor is None:
Expand Down
4 changes: 2 additions & 2 deletions jobspy/indeed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@

class Indeed(Scraper):
def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes IndeedScraper with the Indeed API url
"""
super().__init__(Site.INDEED, proxies=proxies)

self.session = create_session(
proxies=self.proxies, ca_cert=ca_cert, is_tls=False
proxies=self.proxies, ca_cert=ca_cert, is_tls=False, rate_delay_min=rate_delay_min, rate_delay_max=rate_delay_max
)
self.scraper_input = None
self.jobs_per_page = 100
Expand Down
6 changes: 4 additions & 2 deletions jobspy/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class LinkedIn(Scraper):
jobs_per_page = 25

def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes LinkedInScraper with the LinkedIn job search url
Expand All @@ -62,8 +62,10 @@ def __init__(
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
backoff_factor=5,
clear_cookies=True,
rate_delay_min=rate_delay_min,
rate_delay_max=rate_delay_max,
)
self.session.headers.update(headers)
self.scraper_input = None
Expand Down
8 changes: 5 additions & 3 deletions jobspy/naukri/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class Naukri(Scraper):
jobs_per_page = 20

def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes NaukriScraper with the Naukri API URL
Expand All @@ -55,8 +55,10 @@ def __init__(
ca_cert=ca_cert,
is_tls=False,
has_retry=True,
delay=5,
backoff_factor=5,
clear_cookies=True,
rate_delay_min=rate_delay_min,
rate_delay_max=rate_delay_max,
)
self.session.headers.update(naukri_headers)
self.scraper_input = None
Expand Down Expand Up @@ -301,4 +303,4 @@ def _infer_work_from_home_type(self, placeholders: list[dict], title: str, descr
return "Remote"
elif "work from office" in description.lower() or not ("remote" in description.lower() or "hybrid" in description.lower()):
return "Work from office"
return None
return None
70 changes: 62 additions & 8 deletions jobspy/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import logging
import re
from itertools import cycle
import random
import threading
import time

import numpy as np
import requests
Expand Down Expand Up @@ -51,29 +54,69 @@ def format_proxy(proxy):
return {"http": proxy, "https": proxy}
return {"http": f"http://{proxy}", "https": f"http://{proxy}"}

class RateLimiter:
"""
A thread-safe rate limiter to enforce a delay between operations.

Args:
rate_delay_min (int | float | None):
The minimum time in seconds to wait since the last request.
rate_delay_max (int | float | None):
The maximum time in seconds to wait since the last request.
"""
def __init__(self, rate_delay_min: int | float | None, rate_delay_max: int | float | None):
self.rate_delay_min = rate_delay_min
self.rate_delay_max = rate_delay_max
self.rate_delay_lock = threading.Lock()
self.last_request_time = 0.0

def enforce_delay(self):
"""
Enforces a delay to meet the configured rate limit.

This method is thread-safe. It calculates the required sleep time based on
the time elapsed since the last request and waits if necessary.
"""

if not isinstance(self.rate_delay_min, (int, float)) or not isinstance(self.rate_delay_max, (int, float)):
return

delay_seconds = random.uniform(self.rate_delay_min, self.rate_delay_max)
with self.rate_delay_lock:
time_elapsed = time.monotonic() - self.last_request_time
sleep_time = delay_seconds - time_elapsed
if sleep_time > 0:
time.sleep(sleep_time)

self.last_request_time = time.monotonic()



class RequestsRotating(RotatingProxySession, requests.Session):
def __init__(self, proxies=None, has_retry=False, delay=1, clear_cookies=False):
def __init__(self, proxies=None, has_retry=False, backoff_factor=1, clear_cookies=False, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None):
RotatingProxySession.__init__(self, proxies=proxies)
requests.Session.__init__(self)
self.clear_cookies = clear_cookies
self.allow_redirects = True
self.setup_session(has_retry, delay)
self.setup_session(has_retry, backoff_factor)
self.rate_limiter = RateLimiter(rate_delay_min, rate_delay_max)

def setup_session(self, has_retry, delay):
def setup_session(self, has_retry, backoff_factor):
if has_retry:
retries = Retry(
total=3,
connect=3,
status=3,
status_forcelist=[500, 502, 503, 504, 429],
backoff_factor=delay,
backoff_factor=backoff_factor,
)
adapter = HTTPAdapter(max_retries=retries)
self.mount("http://", adapter)
self.mount("https://", adapter)

def request(self, method, url, **kwargs):
self.rate_limiter.enforce_delay()

if self.clear_cookies:
self.cookies.clear()

Expand All @@ -87,11 +130,14 @@ def request(self, method, url, **kwargs):


class TLSRotating(RotatingProxySession, tls_client.Session):
def __init__(self, proxies=None):
def __init__(self, proxies=None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None):
RotatingProxySession.__init__(self, proxies=proxies)
tls_client.Session.__init__(self, random_tls_extension_order=True)
self.rate_limiter = RateLimiter(rate_delay_min, rate_delay_max)

def execute_request(self, *args, **kwargs):
self.rate_limiter.enforce_delay()

if self.proxy_cycle:
next_proxy = next(self.proxy_cycle)
if next_proxy["http"] != "http://localhost":
Expand All @@ -109,21 +155,29 @@ def create_session(
ca_cert: str | None = None,
is_tls: bool = True,
has_retry: bool = False,
delay: int = 1,
backoff_factor: int = 1,
clear_cookies: bool = False,
rate_delay_min: int | float | None = None,
rate_delay_max: int | float | None = None,
) -> requests.Session:
"""
Creates a requests session with optional tls, proxy, and retry settings.
:return: A session object
"""
if is_tls:
session = TLSRotating(proxies=proxies)
session = TLSRotating(
proxies=proxies,
rate_delay_min=rate_delay_min,
rate_delay_max=rate_delay_max,
)
else:
session = RequestsRotating(
proxies=proxies,
has_retry=has_retry,
delay=delay,
backoff_factor=backoff_factor,
clear_cookies=clear_cookies,
rate_delay_min=rate_delay_min,
rate_delay_max=rate_delay_max,
)

if ca_cert:
Expand Down
4 changes: 2 additions & 2 deletions jobspy/ziprecruiter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ class ZipRecruiter(Scraper):
api_url = "https://api.ziprecruiter.com"

def __init__(
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None
self, proxies: list[str] | str | None = None, ca_cert: str | None = None, user_agent: str | None = None, rate_delay_min: int | float | None = None, rate_delay_max: int | float | None = None
):
"""
Initializes ZipRecruiterScraper with the ZipRecruiter job search url
"""
super().__init__(Site.ZIP_RECRUITER, proxies=proxies)

self.scraper_input = None
self.session = create_session(proxies=proxies, ca_cert=ca_cert)
self.session = create_session(proxies=proxies, ca_cert=ca_cert, rate_delay_min=rate_delay_min, rate_delay_max=rate_delay_max)
self.session.headers.update(headers)
self._get_cookies()

Expand Down