Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,8 @@ therefore integrated it into this module, to make setting it up as easy as possi

Once you have created a [Webshare account](https://www.webshare.io/?referral_code=w0xno53eb50g) and purchased a
"Residential" proxy package that suits your workload (make sure NOT to purchase "Proxy Server" or
"Static Residential"!), open the [Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings) to retrieve
"Static Residential"!), open the
[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings?referral_code=w0xno53eb50g) to retrieve
your "Proxy Username" and "Proxy Password". Using this information you can initialize the `YouTubeTranscriptApi` as
follows:

Expand All @@ -306,8 +307,8 @@ ytt_api.fetch(video_id)
Using the `WebshareProxyConfig` will default to using rotating residential proxies and requires no further
configuration.

Note that referral links are used here and any purchases made through these links will support this Open Source
project, which is very much appreciated! 💖😊🙏💖
Note that [referral links are used here](https://www.webshare.io/?referral_code=w0xno53eb50g) and any purchases
made through these links will support this Open Source project, which is very much appreciated! 💖😊🙏💖

However, you are of course free to integrate your own proxy solution using the `GenericProxyConfig` class, if you
prefer using another provider or want to implement your own solution, as covered by the following section.
Expand Down Expand Up @@ -511,7 +512,7 @@ using residential proxies as explained in
create a [Webshare account](https://www.webshare.io/?referral_code=w0xno53eb50g) and purchase a "Residential" proxy
package that suits your workload (make sure NOT to purchase "Proxy Server" or "Static Residential"!). Then you can use
the "Proxy Username" and "Proxy Password" which you can find in your
[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings), to run the following command:
[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings?referral_code=w0xno53eb50g), to run the following command:

```
youtube_transcript_api <first_video_id> <second_video_id> --webshare-proxy-username "username" --webshare-proxy-password "password"
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "youtube-transcript-api"
version = "1.0.1"
version = "1.0.2"
description = "This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do!"
readme = "README.md"
license = "MIT"
Expand Down
4 changes: 2 additions & 2 deletions youtube_transcript_api/_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def __init__(
http_client.cookies = _load_cookie_jar(cookie_path)
if proxy_config is not None:
http_client.proxies = proxy_config.to_requests_dict()
if proxy_config.prevent_keeping_connections_alive():
if proxy_config.prevent_keeping_connections_alive:
http_client.headers.update({"Connection": "close"})
self._fetcher = TranscriptListFetcher(http_client)
self._fetcher = TranscriptListFetcher(http_client, proxy_config=proxy_config)

def fetch(
self,
Expand Down
51 changes: 50 additions & 1 deletion youtube_transcript_api/_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from requests import HTTPError

from ._settings import WATCH_URL
from .proxies import ProxyConfig, GenericProxyConfig, WebshareProxyConfig


class YouTubeTranscriptApiException(Exception):
Expand Down Expand Up @@ -45,7 +46,7 @@ class CouldNotRetrieveTranscript(YouTubeTranscriptApiException):

def __init__(self, video_id: str):
self.video_id = video_id
super().__init__(self._build_error_message())
super().__init__()

def _build_error_message(self) -> str:
error_message = self.ERROR_MESSAGE.format(
Expand All @@ -64,6 +65,9 @@ def _build_error_message(self) -> str:
def cause(self) -> str:
return self.CAUSE_MESSAGE

def __str__(self) -> str:
    """
    Return the message produced by `_build_error_message()`.

    The message is built on every call instead of being stored once, so it
    reflects the state of the instance at the time it is rendered.
    """
    message = self._build_error_message()
    return message


class YouTubeRequestFailed(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = "Request to YouTube failed: {reason}"
Expand Down Expand Up @@ -135,6 +139,51 @@ class RequestBlocked(CouldNotRetrieveTranscript):
"eventually permanently ban the account that you have used to authenticate "
"with! So only do this if you don't mind your account being banned!"
)
WITH_GENERIC_PROXY_CAUSE_MESSAGE = (
"YouTube is blocking your requests, despite you using proxies. Keep in mind "
"a proxy is just a way to hide your real IP behind the IP of that proxy, but "
"there is no guarantee that the IP of that proxy won't be blocked as well.\n\n"
"The only truly reliable way to prevent IP blocks is rotating through a large "
"pool of residential IPs, by using a provider like Webshare "
"(https://www.webshare.io/?referral_code=w0xno53eb50g), which provides you "
"with a pool of >30M residential IPs (make sure to purchase "
'"Residential" proxies, NOT "Proxy Server" or "Static Residential"!).\n\n'
"You will find more information on how to easily integrate Webshare here: "
"https://github.com/jdepoix/youtube-transcript-api"
"?tab=readme-ov-file#using-webshare"
)
WITH_WEBSHARE_PROXY_CAUSE_MESSAGE = (
"YouTube is blocking your requests, despite you using Webshare proxies. "
'Please make sure that you have purchased "Residential" proxies and '
'NOT "Proxy Server" or "Static Residential", as those won\'t work as '
'reliably! The free tier also uses "Proxy Server" and will NOT work!\n\n'
'The only reliable option is using "Residential" proxies (not "Static '
'Residential"), as this allows you to rotate through a pool of over 30M IPs, '
"which means you will always find an IP that hasn't been blocked by YouTube "
"yet!\n\n"
"You can support the development of this open source project by making your "
"Webshare purchases through this affiliate link: "
"https://www.webshare.io/?referral_code=w0xno53eb50g \n\n"
"Thank you for your support! <3"
)

def __init__(self, video_id: str):
    """
    :param video_id: passed on unchanged to the parent constructor
    """
    # No proxy config is known at construction time; `with_proxy_config` can
    # attach one afterwards.
    self._proxy_config: Optional[ProxyConfig] = None
    super().__init__(video_id)

def with_proxy_config(
    self, proxy_config: Optional[ProxyConfig]
) -> "RequestBlocked":
    """
    Store the proxy config on this exception and hand the exception back.

    The stored config is read by the `cause` property to pick the message.
    Because the same instance is returned, the call can be used directly as
    the operand of a `raise` statement.

    :param proxy_config: the proxy config to attach, or None
    :return: this exception instance
    """
    self._proxy_config = proxy_config
    return self

@property
def cause(self) -> str:
    """
    Pick the cause message matching the stored proxy config.

    :return: the Webshare message for a `WebshareProxyConfig`, the generic
        proxy message for any other `GenericProxyConfig`, and the parent
        class's `cause` otherwise (including when no config is stored)
    """
    active_config = self._proxy_config
    # The Webshare check has to come first: WebshareProxyConfig derives from
    # GenericProxyConfig, so the generic check would match it as well.
    if isinstance(active_config, WebshareProxyConfig):
        message = self.WITH_WEBSHARE_PROXY_CAUSE_MESSAGE
    elif isinstance(active_config, GenericProxyConfig):
        message = self.WITH_GENERIC_PROXY_CAUSE_MESSAGE
    else:
        message = super().cause
    return message


class IpBlocked(RequestBlocked):
Expand Down
23 changes: 20 additions & 3 deletions youtube_transcript_api/_transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from itertools import chain

from html import unescape
from typing import List, Dict, Iterator, Iterable, Pattern
from typing import List, Dict, Iterator, Iterable, Pattern, Optional

from defusedxml import ElementTree

import re

from requests import HTTPError, Session, Response

from .proxies import ProxyConfig
from ._errors import (
VideoUnavailable,
YouTubeRequestFailed,
Expand Down Expand Up @@ -339,16 +340,32 @@ def _get_language_description(self, transcript_strings: Iterable[str]) -> str:


class TranscriptListFetcher:
def __init__(self, http_client: Session):
def __init__(self, http_client: Session, proxy_config: Optional[ProxyConfig]):
self._http_client = http_client
self._proxy_config = proxy_config

def fetch(self, video_id: str) -> TranscriptList:
return TranscriptList.build(
self._http_client,
video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
self._fetch_captions_json(video_id),
)

def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict:
try:
return self._extract_captions_json(
self._fetch_video_html(video_id), video_id
)
except RequestBlocked as exception:
retries = (
0
if self._proxy_config is None
else self._proxy_config.retries_when_blocked
)
if try_number + 1 < retries:
return self._fetch_captions_json(video_id, try_number=try_number + 1)
raise exception.with_proxy_config(self._proxy_config)

def _extract_captions_json(self, html: str, video_id: str) -> Dict:
splitted_html = html.split("var ytInitialPlayerResponse = ")

Expand Down
36 changes: 31 additions & 5 deletions youtube_transcript_api/proxies.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def to_requests_dict(self) -> RequestsProxyConfigDict:
"""
pass

@property
def prevent_keeping_connections_alive(self) -> bool:
"""
If you are using rotating proxies, it can be useful to prevent the HTTP
Expand All @@ -40,6 +41,16 @@ def prevent_keeping_connections_alive(self) -> bool:
"""
return False

@property
def retries_when_blocked(self) -> int:
    """
    Number of times a blocked request should be retried.

    Retrying can pay off with rotating residential proxies that draw from a
    large IP pool: a retry triggers an IP rotation, and the next IP might not
    be blocked. This base implementation returns 0.
    """
    return 0


class GenericProxyConfig(ProxyConfig):
"""
Expand Down Expand Up @@ -83,8 +94,9 @@ class WebshareProxyConfig(GenericProxyConfig):
most reliable way to work around being blocked by YouTube.

If you don't have a Webshare account yet, you will have to create one
at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a residential
proxy package that suits your workload, to be able to use this proxy config.
at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a "Residential"
proxy package that suits your workload, to be able to use this proxy config (make
sure NOT to purchase "Proxy Server" or "Static Residential"!).

Once you have created an account you only need the "Proxy Username" and
"Proxy Password" that you can find in your Webshare settings
Expand All @@ -105,24 +117,33 @@ def __init__(
self,
proxy_username: str,
proxy_password: str,
retries_when_blocked: int = 10,
domain_name: str = DEFAULT_DOMAIN_NAME,
proxy_port: int = DEFAULT_PORT,
):
"""
Once you have created a Webshare account at
https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a residential
proxy package, this config class allows you to easily use it, by defaulting to
the most reliable proxy settings (rotating residential proxies).
https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a
"Residential" package (make sure NOT to purchase "Proxy Server" or
"Static Residential"!), this config class allows you to easily use it,
by defaulting to the most reliable proxy settings (rotating residential
proxies).

:param proxy_username: "Proxy Username" found at
https://dashboard.webshare.io/proxy/settings
:param proxy_password: "Proxy Password" found at
https://dashboard.webshare.io/proxy/settings
:param retries_when_blocked: Define how many times we should retry if a request
is blocked. When using rotating residential proxies with a large IP pool it
makes sense to retry a couple of times when a blocked IP is encountered,
since a retry will trigger an IP rotation and the next IP might not be
blocked. Defaults to 10.
"""
self.proxy_username = proxy_username
self.proxy_password = proxy_password
self.domain_name = domain_name
self.proxy_port = proxy_port
self._retries_when_blocked = retries_when_blocked

@property
def url(self) -> str:
Expand All @@ -139,5 +160,10 @@ def http_url(self) -> str:
def https_url(self) -> str:
return self.url

@property
def prevent_keeping_connections_alive(self) -> bool:
    """Always True for this proxy config."""
    return True

@property
def retries_when_blocked(self) -> int:
    """Return the value stored in `_retries_when_blocked`."""
    configured_retries = self._retries_when_blocked
    return configured_retries
79 changes: 72 additions & 7 deletions youtube_transcript_api/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,9 +247,11 @@ def test_fetch__exception_if_youtube_request_fails(self):
httpretty.GET, "https://www.youtube.com/watch", status=500
)

with self.assertRaises(YouTubeRequestFailed):
with self.assertRaises(YouTubeRequestFailed) as cm:
YouTubeTranscriptApi().fetch("abc")

self.assertIn("Request to YouTube failed: ", str(cm.exception))

def test_fetch__exception_if_age_restricted(self):
httpretty.register_uri(
httpretty.GET,
Expand Down Expand Up @@ -277,21 +279,24 @@ def test_fetch__exception_request_blocked(self):
body=load_asset("youtube_request_blocked.html.static"),
)

with self.assertRaises(RequestBlocked):
with self.assertRaises(RequestBlocked) as cm:
YouTubeTranscriptApi().fetch("Njp5uhTorCo")

self.assertIn("YouTube is blocking requests from your IP", str(cm.exception))

def test_fetch__exception_unplayable(self):
httpretty.register_uri(
httpretty.GET,
"https://www.youtube.com/watch",
body=load_asset("youtube_unplayable.html.static"),
)

with self.assertRaises(VideoUnplayable) as error:
with self.assertRaises(VideoUnplayable) as cm:
YouTubeTranscriptApi().fetch("Njp5uhTorCo")
error = error.exception
self.assertEqual(error.reason, "Custom Reason")
self.assertEqual(error.sub_reasons, ["Sub Reason 1", "Sub Reason 2"])
exception = cm.exception
self.assertEqual(exception.reason, "Custom Reason")
self.assertEqual(exception.sub_reasons, ["Sub Reason 1", "Sub Reason 2"])
self.assertIn("Custom Reason", str(exception))

def test_fetch__exception_if_transcripts_disabled(self):
httpretty.register_uri(
Expand All @@ -312,9 +317,11 @@ def test_fetch__exception_if_transcripts_disabled(self):
YouTubeTranscriptApi().fetch("Fjg5lYqvzUs")

def test_fetch__exception_if_language_unavailable(self):
with self.assertRaises(NoTranscriptFound):
with self.assertRaises(NoTranscriptFound) as cm:
YouTubeTranscriptApi().fetch("GJLlxj_dtq8", languages=["cz"])

self.assertIn("No transcripts were found for", str(cm.exception))

@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
def test_fetch__with_proxy(self, to_requests_dict):
proxy_config = GenericProxyConfig(
Expand All @@ -341,6 +348,64 @@ def test_fetch__with_proxy_prevent_alive_connections(self, to_requests_dict):
request = httpretty.last_request()
self.assertEqual(request.headers.get("Connection"), "close")

# A fetch through a Webshare proxy config must succeed even though the
# "request blocked" page has been registered for the watch URL.
# NOTE(review): `to_requests_dict` is patched but the mock is never referenced in
# the body - presumably this keeps the fake proxy out of the requests; confirm.
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
def test_fetch__with_proxy_retry_when_blocked(self, to_requests_dict):
# Register the "request blocked" page for the watch URL three times.
for _ in range(3):
httpretty.register_uri(
httpretty.GET,
"https://www.youtube.com/watch",
body=load_asset("youtube_request_blocked.html.static"),
)
# `retries_when_blocked` is not passed, so the config's default applies.
proxy_config = WebshareProxyConfig(
proxy_username="username",
proxy_password="password",
)

# The call is expected to return without raising.
YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo")

# NOTE(review): 3 + 2 presumably stands for the 3 blocked responses registered
# above plus 2 requests made by a successful fetch - confirm against the
# fixtures registered in the test setup.
self.assertEqual(len(httpretty.latest_requests()), 3 + 2)

# When every attempt through a Webshare proxy config is blocked, RequestBlocked
# must be raised, carrying the proxy config and a Webshare-specific message.
# NOTE(review): `to_requests_dict` is patched but the mock is never referenced in
# the body - presumably this keeps the fake proxy out of the requests; confirm.
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
def test_fetch__with_webshare_proxy_reraise_when_blocked(self, to_requests_dict):
retries = 5
# Register the "request blocked" page once per configured retry.
for _ in range(retries):
httpretty.register_uri(
httpretty.GET,
"https://www.youtube.com/watch",
body=load_asset("youtube_request_blocked.html.static"),
)
proxy_config = WebshareProxyConfig(
proxy_username="username",
proxy_password="password",
retries_when_blocked=retries,
)

with self.assertRaises(RequestBlocked) as cm:
YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo")

# The number of requests made must equal `retries`, i.e. the setting is
# treated as the total number of attempts.
self.assertEqual(len(httpretty.latest_requests()), retries)
# The raised exception must carry the proxy config that was in use ...
self.assertEqual(cm.exception._proxy_config, proxy_config)
# ... and its message must mention Webshare.
self.assertIn("Webshare", str(cm.exception))

# A blocked request through a generic proxy config must raise RequestBlocked
# after a single request, carrying the proxy config.
# NOTE(review): `to_requests_dict` is patched but the mock is never referenced in
# the body - presumably this keeps the fake proxy out of the requests; confirm.
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
def test_fetch__with_generic_proxy_reraise_when_blocked(self, to_requests_dict):
httpretty.register_uri(
httpretty.GET,
"https://www.youtube.com/watch",
body=load_asset("youtube_request_blocked.html.static"),
)
proxy_config = GenericProxyConfig(
http_url="http://localhost:8080",
https_url="http://localhost:8080",
)

with self.assertRaises(RequestBlocked) as cm:
YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo")

# Exactly one request is expected, i.e. no retry took place.
# NOTE(review): presumably because GenericProxyConfig keeps the base
# `retries_when_blocked` of 0 - confirm it is not overridden.
self.assertEqual(len(httpretty.latest_requests()), 1)
self.assertEqual(cm.exception._proxy_config, proxy_config)
self.assertIn("YouTube is blocking your requests", str(cm.exception))

def test_fetch__with_cookies(self):
cookie_path = get_asset_path("example_cookies.txt")
transcript = YouTubeTranscriptApi(cookie_path=cookie_path).fetch("GJLlxj_dtq8")
Expand Down