diff --git a/README.md b/README.md index 6754764..545ea79 100644 --- a/README.md +++ b/README.md @@ -284,7 +284,8 @@ therefore integrated it into this module, to make setting it up as easy as possi Once you have created a [Webshare account](https://www.webshare.io/?referral_code=w0xno53eb50g) and purchased a "Residential" proxy package that suits your workload (make sure NOT to purchase "Proxy Server" or -"Static Residential"!), open the [Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings) to retrieve +"Static Residential"!), open the +[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings?referral_code=w0xno53eb50g) to retrieve your "Proxy Username" and "Proxy Password". Using this information you can initialize the `YouTubeTranscriptApi` as follows: @@ -306,8 +307,8 @@ ytt_api.fetch(video_id) Using the `WebshareProxyConfig` will default to using rotating residential proxies and requires no further configuration. -Note that referral links are used here and any purchases made through these links will support this Open Source -project, which is very much appreciated! 💖😊🙏💖 +Note that [referral links are used here](https://www.webshare.io/?referral_code=w0xno53eb50g) and any purchases +made through these links will support this Open Source project, which is very much appreciated! 💖😊🙏💖 However, you are of course free to integrate your own proxy solution using the `GenericProxyConfig` class, if you prefer using another provider or want to implement your own solution, as covered by the following section. @@ -511,7 +512,7 @@ using residential proxies as explained in create a [Webshare account](https://www.webshare.io/?referral_code=w0xno53eb50g) and purchase a "Residential" proxy package that suits your workload (make sure NOT to purchase "Proxy Server" or "Static Residential"!). Then you can use the "Proxy Username" and "Proxy Password" which you can find in your -[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings), to run the following command: +[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings?referral_code=w0xno53eb50g), to run the following command: ``` youtube_transcript_api --webshare-proxy-username "username" --webshare-proxy-password "password" diff --git a/pyproject.toml b/pyproject.toml index 60828f4..be7c0e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "youtube-transcript-api" -version = "1.0.1" +version = "1.0.2" description = "This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do!" readme = "README.md" license = "MIT" diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 82649d3..888a971 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -48,9 +48,9 @@ def __init__( http_client.cookies = _load_cookie_jar(cookie_path) if proxy_config is not None: http_client.proxies = proxy_config.to_requests_dict() - if proxy_config.prevent_keeping_connections_alive(): + if proxy_config.prevent_keeping_connections_alive: http_client.headers.update({"Connection": "close"}) - self._fetcher = TranscriptListFetcher(http_client) + self._fetcher = TranscriptListFetcher(http_client, proxy_config=proxy_config) def fetch( self, diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 7128aae..bd61855 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -4,6 +4,7 @@ from requests import HTTPError from ._settings import WATCH_URL +from .proxies import ProxyConfig, GenericProxyConfig, WebshareProxyConfig class YouTubeTranscriptApiException(Exception): @@ -45,7 +46,7 @@ class CouldNotRetrieveTranscript(YouTubeTranscriptApiException): def __init__(self, video_id: str): self.video_id = video_id - super().__init__(self._build_error_message()) + super().__init__() def _build_error_message(self) -> str: error_message = self.ERROR_MESSAGE.format( @@ -64,6 +65,9 @@ def _build_error_message(self) -> str: def cause(self) -> str: return self.CAUSE_MESSAGE + def __str__(self) -> str: + return self._build_error_message() + class YouTubeRequestFailed(CouldNotRetrieveTranscript): CAUSE_MESSAGE = "Request to YouTube failed: {reason}" @@ -135,6 +139,51 @@ class RequestBlocked(CouldNotRetrieveTranscript): "eventually permanently ban the account that you have used to authenticate " "with! So only do this if you don't mind your account being banned!" ) + WITH_GENERIC_PROXY_CAUSE_MESSAGE = ( + "YouTube is blocking your requests, despite you using proxies. Keep in mind " + "a proxy is just a way to hide your real IP behind the IP of that proxy, but " + "there is no guarantee that the IP of that proxy won't be blocked as well.\n\n" + "The only truly reliable way to prevent IP blocks is rotating through a large " + "pool of residential IPs, by using a provider like Webshare " + "(https://www.webshare.io/?referral_code=w0xno53eb50g), which provides you " + "with a pool of >30M residential IPs (make sure to purchase " + '"Residential" proxies, NOT "Proxy Server" or "Static Residential"!).\n\n' + "You will find more information on how to easily integrate Webshare here: " + "https://github.com/jdepoix/youtube-transcript-api" + "?tab=readme-ov-file#using-webshare" + ) + WITH_WEBSHARE_PROXY_CAUSE_MESSAGE = ( + "YouTube is blocking your requests, despite you using Webshare proxies. " + 'Please make sure that you have purchased "Residential" proxies and ' + 'NOT "Proxy Server" or "Static Residential", as those won\'t work as ' + 'reliably! The free tier also uses "Proxy Server" and will NOT work!\n\n' + 'The only reliable option is using "Residential" proxies (not "Static ' + 'Residential"), as this allows you to rotate through a pool of over 30M IPs, ' + "which means you will always find an IP that hasn't been blocked by YouTube " + "yet!\n\n" + "You can support the development of this open source project by making your " + "Webshare purchases through this affiliate link: " + "https://www.webshare.io/?referral_code=w0xno53eb50g \n\n" + "Thank you for your support! <3" + ) + + def __init__(self, video_id: str): + self._proxy_config = None + super().__init__(video_id) + + def with_proxy_config( + self, proxy_config: Optional[ProxyConfig] + ) -> "RequestBlocked": + self._proxy_config = proxy_config + return self + + @property + def cause(self) -> str: + if isinstance(self._proxy_config, WebshareProxyConfig): + return self.WITH_WEBSHARE_PROXY_CAUSE_MESSAGE + if isinstance(self._proxy_config, GenericProxyConfig): + return self.WITH_GENERIC_PROXY_CAUSE_MESSAGE + return super().cause class IpBlocked(RequestBlocked): diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 1e0c3e6..a3f2761 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -4,7 +4,7 @@ from itertools import chain from html import unescape -from typing import List, Dict, Iterator, Iterable, Pattern +from typing import List, Dict, Iterator, Iterable, Pattern, Optional from defusedxml import ElementTree @@ -12,6 +12,7 @@ from requests import HTTPError, Session, Response +from .proxies import ProxyConfig from ._errors import ( VideoUnavailable, YouTubeRequestFailed, @@ -339,16 +340,32 @@ def _get_language_description(self, transcript_strings: Iterable[str]) -> str: class TranscriptListFetcher: - def __init__(self, http_client: Session): + def __init__(self, http_client: Session, proxy_config: Optional[ProxyConfig]): self._http_client = http_client + self._proxy_config = proxy_config def fetch(self, video_id: str) -> TranscriptList: return TranscriptList.build( self._http_client, video_id, - self._extract_captions_json(self._fetch_video_html(video_id), video_id), + self._fetch_captions_json(video_id), ) + def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict: + try: + return self._extract_captions_json( + self._fetch_video_html(video_id), video_id + ) + except RequestBlocked as exception: + retries = ( + 0 + if self._proxy_config is None + else self._proxy_config.retries_when_blocked + ) + if try_number + 1 < retries: + return self._fetch_captions_json(video_id, try_number=try_number + 1) + raise exception.with_proxy_config(self._proxy_config) + def _extract_captions_json(self, html: str, video_id: str) -> Dict: splitted_html = html.split("var ytInitialPlayerResponse = ") diff --git a/youtube_transcript_api/proxies.py b/youtube_transcript_api/proxies.py index c4e85aa..b50a182 100644 --- a/youtube_transcript_api/proxies.py +++ b/youtube_transcript_api/proxies.py @@ -32,6 +32,7 @@ def to_requests_dict(self) -> RequestsProxyConfigDict: """ pass + @property def prevent_keeping_connections_alive(self) -> bool: """ If you are using rotating proxies, it can be useful to prevent the HTTP @@ -40,6 +41,16 @@ def prevent_keeping_connections_alive(self) -> bool: """ return False + @property + def retries_when_blocked(self) -> int: + """ + Defines how many times we should retry if a request is blocked. When using + rotating residential proxies with a large IP pool it can make sense to retry a + couple of times when a blocked IP is encountered, since a retry will trigger + an IP rotation and the next IP might not be blocked. + """ + return 0 + class GenericProxyConfig(ProxyConfig): """ @@ -83,8 +94,9 @@ class WebshareProxyConfig(GenericProxyConfig): most reliable way to work around being blocked by YouTube. If you don't have a Webshare account yet, you will have to create one - at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a residential - proxy package that suits your workload, to be able to use this proxy config. + at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a "Residential" + proxy package that suits your workload, to be able to use this proxy config (make + sure NOT to purchase "Proxy Server" or "Static Residential"!). Once you have created an account you only need the "Proxy Username" and "Proxy Password" that you can find in your Webshare settings @@ -105,24 +117,33 @@ def __init__( self, proxy_username: str, proxy_password: str, + retries_when_blocked: int = 10, domain_name: str = DEFAULT_DOMAIN_NAME, proxy_port: int = DEFAULT_PORT, ): """ Once you have created a Webshare account at - https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a residential - proxy package, this config class allows you to easily use it, by defaulting to - the most reliable proxy settings (rotating residential proxies). + https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a + "Residential" package (make sure NOT to purchase "Proxy Server" or + "Static Residential"!), this config class allows you to easily use it, + by defaulting to the most reliable proxy settings (rotating residential + proxies). :param proxy_username: "Proxy Username" found at https://dashboard.webshare.io/proxy/settings :param proxy_password: "Proxy Password" found at https://dashboard.webshare.io/proxy/settings + :param retries_when_blocked: Define how many times we should retry if a request + is blocked. When using rotating residential proxies with a large IP pool it + makes sense to retry a couple of times when a blocked IP is encountered, + since a retry will trigger an IP rotation and the next IP might not be + blocked. Defaults to 10. """ self.proxy_username = proxy_username self.proxy_password = proxy_password self.domain_name = domain_name self.proxy_port = proxy_port + self._retries_when_blocked = retries_when_blocked @property def url(self) -> str: @@ -139,5 +160,10 @@ def http_url(self) -> str: def https_url(self) -> str: return self.url + @property def prevent_keeping_connections_alive(self) -> bool: return True + + @property + def retries_when_blocked(self) -> int: + return self._retries_when_blocked diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 5ce02c5..ed7b5e6 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -247,9 +247,11 @@ def test_fetch__exception_if_youtube_request_fails(self): httpretty.GET, "https://www.youtube.com/watch", status=500 ) - with self.assertRaises(YouTubeRequestFailed): + with self.assertRaises(YouTubeRequestFailed) as cm: YouTubeTranscriptApi().fetch("abc") + self.assertIn("Request to YouTube failed: ", str(cm.exception)) + def test_fetch__exception_if_age_restricted(self): httpretty.register_uri( httpretty.GET, @@ -277,9 +279,11 @@ def test_fetch__exception_request_blocked(self): body=load_asset("youtube_request_blocked.html.static"), ) - with self.assertRaises(RequestBlocked): + with self.assertRaises(RequestBlocked) as cm: YouTubeTranscriptApi().fetch("Njp5uhTorCo") + self.assertIn("YouTube is blocking requests from your IP", str(cm.exception)) + def test_fetch__exception_unplayable(self): httpretty.register_uri( httpretty.GET, @@ -287,11 +291,12 @@ def test_fetch__exception_unplayable(self): body=load_asset("youtube_unplayable.html.static"), ) - with self.assertRaises(VideoUnplayable) as error: + with self.assertRaises(VideoUnplayable) as cm: YouTubeTranscriptApi().fetch("Njp5uhTorCo") - error = error.exception - self.assertEqual(error.reason, "Custom Reason") - self.assertEqual(error.sub_reasons, ["Sub Reason 1", "Sub Reason 2"]) + exception = cm.exception + self.assertEqual(exception.reason, "Custom Reason") + self.assertEqual(exception.sub_reasons, ["Sub Reason 1", "Sub Reason 2"]) + self.assertIn("Custom Reason", str(exception)) def test_fetch__exception_if_transcripts_disabled(self): httpretty.register_uri( @@ -312,9 +317,11 @@ def test_fetch__exception_if_transcripts_disabled(self): YouTubeTranscriptApi().fetch("Fjg5lYqvzUs") def test_fetch__exception_if_language_unavailable(self): - with self.assertRaises(NoTranscriptFound): + with self.assertRaises(NoTranscriptFound) as cm: YouTubeTranscriptApi().fetch("GJLlxj_dtq8", languages=["cz"]) + self.assertIn("No transcripts were found for", str(cm.exception)) + @patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict") def test_fetch__with_proxy(self, to_requests_dict): proxy_config = GenericProxyConfig( @@ -341,6 +348,64 @@ def test_fetch__with_proxy_prevent_alive_connections(self, to_requests_dict): request = httpretty.last_request() self.assertEqual(request.headers.get("Connection"), "close") + @patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict") + def test_fetch__with_proxy_retry_when_blocked(self, to_requests_dict): + for _ in range(3): + httpretty.register_uri( + httpretty.GET, + "https://www.youtube.com/watch", + body=load_asset("youtube_request_blocked.html.static"), + ) + proxy_config = WebshareProxyConfig( + proxy_username="username", + proxy_password="password", + ) + + YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo") + + self.assertEqual(len(httpretty.latest_requests()), 3 + 2) + + @patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict") + def test_fetch__with_webshare_proxy_reraise_when_blocked(self, to_requests_dict): + retries = 5 + for _ in range(retries): + httpretty.register_uri( + httpretty.GET, + "https://www.youtube.com/watch", + body=load_asset("youtube_request_blocked.html.static"), + ) + proxy_config = WebshareProxyConfig( + proxy_username="username", + proxy_password="password", + retries_when_blocked=retries, + ) + + with self.assertRaises(RequestBlocked) as cm: + YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo") + + self.assertEqual(len(httpretty.latest_requests()), retries) + self.assertEqual(cm.exception._proxy_config, proxy_config) + self.assertIn("Webshare", str(cm.exception)) + + @patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict") + def test_fetch__with_generic_proxy_reraise_when_blocked(self, to_requests_dict): + httpretty.register_uri( + httpretty.GET, + "https://www.youtube.com/watch", + body=load_asset("youtube_request_blocked.html.static"), + ) + proxy_config = GenericProxyConfig( + http_url="http://localhost:8080", + https_url="http://localhost:8080", + ) + + with self.assertRaises(RequestBlocked) as cm: + YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo") + + self.assertEqual(len(httpretty.latest_requests()), 1) + self.assertEqual(cm.exception._proxy_config, proxy_config) + self.assertIn("YouTube is blocking your requests", str(cm.exception)) + def test_fetch__with_cookies(self): cookie_path = get_asset_path("example_cookies.txt") transcript = YouTubeTranscriptApi(cookie_path=cookie_path).fetch("GJLlxj_dtq8")