Skip to content

Commit dc08c3f

Browse files
authored
Merge pull request #398 from jdepoix/feature/rotate-proxy-ip-on-block
Feature/rotate proxy ip on block
2 parents 53cd2b0 + 25db2d8 commit dc08c3f

File tree

7 files changed

+181
-23
lines changed

7 files changed

+181
-23
lines changed

README.md

+5-4
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,8 @@ therefore integrated it into this module, to make setting it up as easy as possi
284284

285285
Once you have created a [Webshare account](https://www.webshare.io/?referral_code=w0xno53eb50g) and purchased a
286286
"Residential" proxy package that suits your workload (make sure NOT to purchase "Proxy Server" or
287-
"Static Residential"!), open the [Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings) to retrieve
287+
"Static Residential"!), open the
288+
[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings?referral_code=w0xno53eb50g) to retrieve
288289
your "Proxy Username" and "Proxy Password". Using this information you can initialize the `YouTubeTranscriptApi` as
289290
follows:
290291

@@ -306,8 +307,8 @@ ytt_api.fetch(video_id)
306307
Using the `WebshareProxyConfig` will default to using rotating residential proxies and requires no further
307308
configuration.
308309

309-
Note that referral links are used here and any purchases made through these links will support this Open Source
310-
project, which is very much appreciated! 💖😊🙏💖
310+
Note that [referral links are used here](https://www.webshare.io/?referral_code=w0xno53eb50g) and any purchases
311+
made through these links will support this Open Source project, which is very much appreciated! 💖😊🙏💖
311312

312313
However, you are of course free to integrate your own proxy solution using the `GenericProxyConfig` class, if you
313314
prefer using another provider or want to implement your own solution, as covered by the following section.
@@ -511,7 +512,7 @@ using residential proxies as explained in
511512
create a [Webshare account](https://www.webshare.io/?referral_code=w0xno53eb50g) and purchase a "Residential" proxy
512513
package that suits your workload (make sure NOT to purchase "Proxy Server" or "Static Residential"!). Then you can use
513514
the "Proxy Username" and "Proxy Password" which you can find in your
514-
[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings), to run the following command:
515+
[Webshare Proxy Settings](https://dashboard.webshare.io/proxy/settings?referral_code=w0xno53eb50g), to run the following command:
515516

516517
```
517518
youtube_transcript_api <first_video_id> <second_video_id> --webshare-proxy-username "username" --webshare-proxy-password "password"

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
66
name = "youtube-transcript-api"
7-
version = "1.0.1"
7+
version = "1.0.2"
88
description = "This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other Selenium-based solutions do!"
99
readme = "README.md"
1010
license = "MIT"

youtube_transcript_api/_api.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ def __init__(
4848
http_client.cookies = _load_cookie_jar(cookie_path)
4949
if proxy_config is not None:
5050
http_client.proxies = proxy_config.to_requests_dict()
51-
if proxy_config.prevent_keeping_connections_alive():
51+
if proxy_config.prevent_keeping_connections_alive:
5252
http_client.headers.update({"Connection": "close"})
53-
self._fetcher = TranscriptListFetcher(http_client)
53+
self._fetcher = TranscriptListFetcher(http_client, proxy_config=proxy_config)
5454

5555
def fetch(
5656
self,

youtube_transcript_api/_errors.py

+50-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from requests import HTTPError
55

66
from ._settings import WATCH_URL
7+
from .proxies import ProxyConfig, GenericProxyConfig, WebshareProxyConfig
78

89

910
class YouTubeTranscriptApiException(Exception):
@@ -45,7 +46,7 @@ class CouldNotRetrieveTranscript(YouTubeTranscriptApiException):
4546

4647
def __init__(self, video_id: str):
4748
self.video_id = video_id
48-
super().__init__(self._build_error_message())
49+
super().__init__()
4950

5051
def _build_error_message(self) -> str:
5152
error_message = self.ERROR_MESSAGE.format(
@@ -64,6 +65,9 @@ def _build_error_message(self) -> str:
6465
def cause(self) -> str:
6566
return self.CAUSE_MESSAGE
6667

68+
def __str__(self) -> str:
69+
return self._build_error_message()
70+
6771

6872
class YouTubeRequestFailed(CouldNotRetrieveTranscript):
6973
CAUSE_MESSAGE = "Request to YouTube failed: {reason}"
@@ -135,6 +139,51 @@ class RequestBlocked(CouldNotRetrieveTranscript):
135139
"eventually permanently ban the account that you have used to authenticate "
136140
"with! So only do this if you don't mind your account being banned!"
137141
)
142+
WITH_GENERIC_PROXY_CAUSE_MESSAGE = (
143+
"YouTube is blocking your requests, despite you using proxies. Keep in mind "
144+
"a proxy is just a way to hide your real IP behind the IP of that proxy, but "
145+
"there is no guarantee that the IP of that proxy won't be blocked as well.\n\n"
146+
"The only truly reliable way to prevent IP blocks is rotating through a large "
147+
"pool of residential IPs, by using a provider like Webshare "
148+
"(https://www.webshare.io/?referral_code=w0xno53eb50g), which provides you "
149+
"with a pool of >30M residential IPs (make sure to purchase "
150+
'"Residential" proxies, NOT "Proxy Server" or "Static Residential"!).\n\n'
151+
"You will find more information on how to easily integrate Webshare here: "
152+
"https://github.com/jdepoix/youtube-transcript-api"
153+
"?tab=readme-ov-file#using-webshare"
154+
)
155+
WITH_WEBSHARE_PROXY_CAUSE_MESSAGE = (
156+
"YouTube is blocking your requests, despite you using Webshare proxies. "
157+
'Please make sure that you have purchased "Residential" proxies and '
158+
'NOT "Proxy Server" or "Static Residential", as those won\'t work as '
159+
'reliably! The free tier also uses "Proxy Server" and will NOT work!\n\n'
160+
'The only reliable option is using "Residential" proxies (not "Static '
161+
'Residential"), as this allows you to rotate through a pool of over 30M IPs, '
162+
"which means you will always find an IP that hasn't been blocked by YouTube "
163+
"yet!\n\n"
164+
"You can support the development of this open source project by making your "
165+
"Webshare purchases through this affiliate link: "
166+
"https://www.webshare.io/?referral_code=w0xno53eb50g \n\n"
167+
"Thank you for your support! <3"
168+
)
169+
170+
def __init__(self, video_id: str):
171+
self._proxy_config = None
172+
super().__init__(video_id)
173+
174+
def with_proxy_config(
175+
self, proxy_config: Optional[ProxyConfig]
176+
) -> "RequestBlocked":
177+
self._proxy_config = proxy_config
178+
return self
179+
180+
@property
181+
def cause(self) -> str:
182+
if isinstance(self._proxy_config, WebshareProxyConfig):
183+
return self.WITH_WEBSHARE_PROXY_CAUSE_MESSAGE
184+
if isinstance(self._proxy_config, GenericProxyConfig):
185+
return self.WITH_GENERIC_PROXY_CAUSE_MESSAGE
186+
return super().cause
138187

139188

140189
class IpBlocked(RequestBlocked):

youtube_transcript_api/_transcripts.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
from itertools import chain
55

66
from html import unescape
7-
from typing import List, Dict, Iterator, Iterable, Pattern
7+
from typing import List, Dict, Iterator, Iterable, Pattern, Optional
88

99
from defusedxml import ElementTree
1010

1111
import re
1212

1313
from requests import HTTPError, Session, Response
1414

15+
from .proxies import ProxyConfig
1516
from ._errors import (
1617
VideoUnavailable,
1718
YouTubeRequestFailed,
@@ -339,16 +340,32 @@ def _get_language_description(self, transcript_strings: Iterable[str]) -> str:
339340

340341

341342
class TranscriptListFetcher:
342-
def __init__(self, http_client: Session):
343+
def __init__(self, http_client: Session, proxy_config: Optional[ProxyConfig]):
343344
self._http_client = http_client
345+
self._proxy_config = proxy_config
344346

345347
def fetch(self, video_id: str) -> TranscriptList:
346348
return TranscriptList.build(
347349
self._http_client,
348350
video_id,
349-
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
351+
self._fetch_captions_json(video_id),
350352
)
351353

354+
def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict:
355+
try:
356+
return self._extract_captions_json(
357+
self._fetch_video_html(video_id), video_id
358+
)
359+
except RequestBlocked as exception:
360+
retries = (
361+
0
362+
if self._proxy_config is None
363+
else self._proxy_config.retries_when_blocked
364+
)
365+
if try_number + 1 < retries:
366+
return self._fetch_captions_json(video_id, try_number=try_number + 1)
367+
raise exception.with_proxy_config(self._proxy_config)
368+
352369
def _extract_captions_json(self, html: str, video_id: str) -> Dict:
353370
splitted_html = html.split("var ytInitialPlayerResponse = ")
354371

youtube_transcript_api/proxies.py

+31-5
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def to_requests_dict(self) -> RequestsProxyConfigDict:
3232
"""
3333
pass
3434

35+
@property
3536
def prevent_keeping_connections_alive(self) -> bool:
3637
"""
3738
If you are using rotating proxies, it can be useful to prevent the HTTP
@@ -40,6 +41,16 @@ def prevent_keeping_connections_alive(self) -> bool:
4041
"""
4142
return False
4243

44+
@property
45+
def retries_when_blocked(self) -> int:
46+
"""
47+
Defines the maximum number of attempts (counting the initial request) made when a request is blocked. When using
48+
rotating residential proxies with a large IP pool it can make sense to retry a
49+
couple of times when a blocked IP is encountered, since a retry will trigger
50+
an IP rotation and the next IP might not be blocked.
51+
"""
52+
return 0
53+
4354

4455
class GenericProxyConfig(ProxyConfig):
4556
"""
@@ -83,8 +94,9 @@ class WebshareProxyConfig(GenericProxyConfig):
8394
most reliable way to work around being blocked by YouTube.
8495
8596
If you don't have a Webshare account yet, you will have to create one
86-
at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a residential
87-
proxy package that suits your workload, to be able to use this proxy config.
97+
at https://www.webshare.io/?referral_code=w0xno53eb50g and purchase a "Residential"
98+
proxy package that suits your workload, to be able to use this proxy config (make
99+
sure NOT to purchase "Proxy Server" or "Static Residential"!).
88100
89101
Once you have created an account you only need the "Proxy Username" and
90102
"Proxy Password" that you can find in your Webshare settings
@@ -105,24 +117,33 @@ def __init__(
105117
self,
106118
proxy_username: str,
107119
proxy_password: str,
120+
retries_when_blocked: int = 10,
108121
domain_name: str = DEFAULT_DOMAIN_NAME,
109122
proxy_port: int = DEFAULT_PORT,
110123
):
111124
"""
112125
Once you have created a Webshare account at
113-
https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a residential
114-
proxy package, this config class allows you to easily use it, by defaulting to
115-
the most reliable proxy settings (rotating residential proxies).
126+
https://www.webshare.io/?referral_code=w0xno53eb50g and purchased a
127+
"Residential" package (make sure NOT to purchase "Proxy Server" or
128+
"Static Residential"!), this config class allows you to easily use it,
129+
by defaulting to the most reliable proxy settings (rotating residential
130+
proxies).
116131
117132
:param proxy_username: "Proxy Username" found at
118133
https://dashboard.webshare.io/proxy/settings
119134
:param proxy_password: "Proxy Password" found at
120135
https://dashboard.webshare.io/proxy/settings
136+
:param retries_when_blocked: Defines how many times we should retry if a request
137+
is blocked. When using rotating residential proxies with a large IP pool it
138+
makes sense to retry a couple of times when a blocked IP is encountered,
139+
since a retry will trigger an IP rotation and the next IP might not be
140+
blocked. Defaults to 10.
121141
"""
122142
self.proxy_username = proxy_username
123143
self.proxy_password = proxy_password
124144
self.domain_name = domain_name
125145
self.proxy_port = proxy_port
146+
self._retries_when_blocked = retries_when_blocked
126147

127148
@property
128149
def url(self) -> str:
@@ -139,5 +160,10 @@ def http_url(self) -> str:
139160
def https_url(self) -> str:
140161
return self.url
141162

163+
@property
142164
def prevent_keeping_connections_alive(self) -> bool:
143165
return True
166+
167+
@property
168+
def retries_when_blocked(self) -> int:
169+
return self._retries_when_blocked

youtube_transcript_api/test/test_api.py

+72-7
Original file line numberDiff line numberDiff line change
@@ -247,9 +247,11 @@ def test_fetch__exception_if_youtube_request_fails(self):
247247
httpretty.GET, "https://www.youtube.com/watch", status=500
248248
)
249249

250-
with self.assertRaises(YouTubeRequestFailed):
250+
with self.assertRaises(YouTubeRequestFailed) as cm:
251251
YouTubeTranscriptApi().fetch("abc")
252252

253+
self.assertIn("Request to YouTube failed: ", str(cm.exception))
254+
253255
def test_fetch__exception_if_age_restricted(self):
254256
httpretty.register_uri(
255257
httpretty.GET,
@@ -277,21 +279,24 @@ def test_fetch__exception_request_blocked(self):
277279
body=load_asset("youtube_request_blocked.html.static"),
278280
)
279281

280-
with self.assertRaises(RequestBlocked):
282+
with self.assertRaises(RequestBlocked) as cm:
281283
YouTubeTranscriptApi().fetch("Njp5uhTorCo")
282284

285+
self.assertIn("YouTube is blocking requests from your IP", str(cm.exception))
286+
283287
def test_fetch__exception_unplayable(self):
284288
httpretty.register_uri(
285289
httpretty.GET,
286290
"https://www.youtube.com/watch",
287291
body=load_asset("youtube_unplayable.html.static"),
288292
)
289293

290-
with self.assertRaises(VideoUnplayable) as error:
294+
with self.assertRaises(VideoUnplayable) as cm:
291295
YouTubeTranscriptApi().fetch("Njp5uhTorCo")
292-
error = error.exception
293-
self.assertEqual(error.reason, "Custom Reason")
294-
self.assertEqual(error.sub_reasons, ["Sub Reason 1", "Sub Reason 2"])
296+
exception = cm.exception
297+
self.assertEqual(exception.reason, "Custom Reason")
298+
self.assertEqual(exception.sub_reasons, ["Sub Reason 1", "Sub Reason 2"])
299+
self.assertIn("Custom Reason", str(exception))
295300

296301
def test_fetch__exception_if_transcripts_disabled(self):
297302
httpretty.register_uri(
@@ -312,9 +317,11 @@ def test_fetch__exception_if_transcripts_disabled(self):
312317
YouTubeTranscriptApi().fetch("Fjg5lYqvzUs")
313318

314319
def test_fetch__exception_if_language_unavailable(self):
315-
with self.assertRaises(NoTranscriptFound):
320+
with self.assertRaises(NoTranscriptFound) as cm:
316321
YouTubeTranscriptApi().fetch("GJLlxj_dtq8", languages=["cz"])
317322

323+
self.assertIn("No transcripts were found for", str(cm.exception))
324+
318325
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
319326
def test_fetch__with_proxy(self, to_requests_dict):
320327
proxy_config = GenericProxyConfig(
@@ -341,6 +348,64 @@ def test_fetch__with_proxy_prevent_alive_connections(self, to_requests_dict):
341348
request = httpretty.last_request()
342349
self.assertEqual(request.headers.get("Connection"), "close")
343350

351+
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
352+
def test_fetch__with_proxy_retry_when_blocked(self, to_requests_dict):
353+
for _ in range(3):
354+
httpretty.register_uri(
355+
httpretty.GET,
356+
"https://www.youtube.com/watch",
357+
body=load_asset("youtube_request_blocked.html.static"),
358+
)
359+
proxy_config = WebshareProxyConfig(
360+
proxy_username="username",
361+
proxy_password="password",
362+
)
363+
364+
YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo")
365+
366+
self.assertEqual(len(httpretty.latest_requests()), 3 + 2)
367+
368+
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
369+
def test_fetch__with_webshare_proxy_reraise_when_blocked(self, to_requests_dict):
370+
retries = 5
371+
for _ in range(retries):
372+
httpretty.register_uri(
373+
httpretty.GET,
374+
"https://www.youtube.com/watch",
375+
body=load_asset("youtube_request_blocked.html.static"),
376+
)
377+
proxy_config = WebshareProxyConfig(
378+
proxy_username="username",
379+
proxy_password="password",
380+
retries_when_blocked=retries,
381+
)
382+
383+
with self.assertRaises(RequestBlocked) as cm:
384+
YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo")
385+
386+
self.assertEqual(len(httpretty.latest_requests()), retries)
387+
self.assertEqual(cm.exception._proxy_config, proxy_config)
388+
self.assertIn("Webshare", str(cm.exception))
389+
390+
@patch("youtube_transcript_api.proxies.GenericProxyConfig.to_requests_dict")
391+
def test_fetch__with_generic_proxy_reraise_when_blocked(self, to_requests_dict):
392+
httpretty.register_uri(
393+
httpretty.GET,
394+
"https://www.youtube.com/watch",
395+
body=load_asset("youtube_request_blocked.html.static"),
396+
)
397+
proxy_config = GenericProxyConfig(
398+
http_url="http://localhost:8080",
399+
https_url="http://localhost:8080",
400+
)
401+
402+
with self.assertRaises(RequestBlocked) as cm:
403+
YouTubeTranscriptApi(proxy_config=proxy_config).fetch("Njp5uhTorCo")
404+
405+
self.assertEqual(len(httpretty.latest_requests()), 1)
406+
self.assertEqual(cm.exception._proxy_config, proxy_config)
407+
self.assertIn("YouTube is blocking your requests", str(cm.exception))
408+
344409
def test_fetch__with_cookies(self):
345410
cookie_path = get_asset_path("example_cookies.txt")
346411
transcript = YouTubeTranscriptApi(cookie_path=cookie_path).fetch("GJLlxj_dtq8")

0 commit comments

Comments
 (0)