Skip to content

Commit cbe4cd5

Browse files
committed
added exception for transcript URLs that require PO tokens
1 parent 5a91758 commit cbe4cd5

File tree

9 files changed

+2722
-23
lines changed

9 files changed

+2722
-23
lines changed

youtube_transcript_api/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,5 @@
2525
InvalidVideoId,
2626
AgeRestricted,
2727
YouTubeDataUnparsable,
28+
PoTokenRequired,
2829
)

youtube_transcript_api/_api.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import warnings
2-
from pathlib import Path
32
from typing import Optional, Iterable
43

54
from requests import Session

youtube_transcript_api/_cli.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ def run(self) -> str:
3333
proxy_password=parsed_args.webshare_proxy_password,
3434
)
3535

36-
3736
transcripts = []
3837
exceptions = []
3938

youtube_transcript_api/_errors.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,16 @@ class CookieError(YouTubeTranscriptApiException):
1616

1717

1818
class CookiePathInvalid(CookieError):
19-
def __init__(self, cookie_path: Path): # pragma: no cover until cookie authentication is re-implemented
19+
def __init__(
20+
self, cookie_path: Path
21+
): # pragma: no cover until cookie authentication is re-implemented
2022
super().__init__(f"Can't load the provided cookie file: {cookie_path}")
2123

2224

2325
class CookieInvalid(CookieError):
24-
def __init__(self, cookie_path: Path): # pragma: no cover until cookie authentication is re-implemented
26+
def __init__(
27+
self, cookie_path: Path
28+
): # pragma: no cover until cookie authentication is re-implemented
2529
super().__init__(
2630
f"The cookies provided are not valid (may have expired): {cookie_path}"
2731
)
@@ -258,3 +262,10 @@ def cause(self) -> str:
258262
requested_language_codes=self._requested_language_codes,
259263
transcript_data=str(self._transcript_data),
260264
)
265+
266+
267+
class PoTokenRequired(CouldNotRetrieveTranscript):
268+
CAUSE_MESSAGE = (
269+
"The requested video cannot be retrieved without a PO Token. If this happens, "
270+
"please open a GitHub issue!"
271+
)

youtube_transcript_api/_settings.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
11
WATCH_URL = "https://www.youtube.com/watch?v={video_id}"
22
INNERTUBE_API_URL = "https://www.youtube.com/youtubei/v1/player?key={api_key}"
3-
INNERTUBE_CONTEXT = {
4-
"client": {
5-
"clientName": "ANDROID",
6-
"clientVersion": "20.10.38"
7-
}}
3+
INNERTUBE_CONTEXT = {"client": {"clientName": "ANDROID", "clientVersion": "20.10.38"}}

youtube_transcript_api/_transcripts.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
AgeRestricted,
2828
VideoUnplayable,
2929
YouTubeDataUnparsable,
30+
PoTokenRequired,
3031
)
3132

3233

@@ -129,6 +130,8 @@ def fetch(self, preserve_formatting: bool = False) -> FetchedTranscript:
129130
Loads the actual transcript data.
130131
:param preserve_formatting: whether to keep select HTML text formatting
131132
"""
133+
if "&exp=xpe" in self._url:
134+
raise PoTokenRequired(self.video_id)
132135
response = self._http_client.get(self._url)
133136
snippets = _TranscriptParser(preserve_formatting=preserve_formatting).parse(
134137
_raise_http_errors(response, self.video_id).text,
@@ -367,15 +370,14 @@ def _fetch_captions_json(self, video_id: str, try_number: int = 0) -> Dict:
367370
return self._fetch_captions_json(video_id, try_number=try_number + 1)
368371
raise exception.with_proxy_config(self._proxy_config)
369372

370-
371373
def _extract_innertube_api_key(self, html: str, video_id: str) -> str:
372374
pattern = r'"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"'
373375
match = re.search(pattern, html)
374376
if match and len(match.groups()) == 1:
375377
return match.group(1)
376378
if 'class="g-recaptcha"' in html:
377379
raise IpBlocked(video_id)
378-
raise YouTubeDataUnparsable(video_id) # pragma: no cover
380+
raise YouTubeDataUnparsable(video_id) # pragma: no cover
379381

380382
def _extract_captions_json(self, innertube_data: Dict, video_id: str) -> Dict:
381383
self._assert_playability(innertube_data.get("playabilityStatus"), video_id)

youtube_transcript_api/test/assets/youtube_po_token_required.innertube.json.static

Lines changed: 2681 additions & 0 deletions
Large diffs are not rendered by default.

youtube_transcript_api/test/test_api.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
AgeRestricted,
2727
RequestBlocked,
2828
VideoUnplayable,
29+
PoTokenRequired,
2930
)
3031
from youtube_transcript_api.proxies import GenericProxyConfig, WebshareProxyConfig
3132

@@ -157,7 +158,9 @@ def test_list__url_as_video_id(self):
157158
)
158159

159160
with self.assertRaises(InvalidVideoId):
160-
YouTubeTranscriptApi().list("https://www.youtube.com/youtubei/v1/player?v=GJLlxj_dtq8")
161+
YouTubeTranscriptApi().list(
162+
"https://www.youtube.com/youtubei/v1/player?v=GJLlxj_dtq8"
163+
)
161164

162165
def test_translate_transcript(self):
163166
transcript = YouTubeTranscriptApi().list("GJLlxj_dtq8").find_transcript(["en"])
@@ -257,7 +260,6 @@ def test_fetch__exception_if_youtube_request_fails(self):
257260

258261
self.assertIn("Request to YouTube failed: ", str(cm.exception))
259262

260-
261263
def test_get_transcript__exception_if_youtube_request_limit_reached(
262264
self,
263265
):
@@ -290,6 +292,16 @@ def test_fetch__exception_if_ip_blocked(self):
290292
with self.assertRaises(IpBlocked):
291293
YouTubeTranscriptApi().fetch("abc")
292294

295+
def test_fetch__exception_if_po_token_required(self):
296+
httpretty.register_uri(
297+
httpretty.POST,
298+
"https://www.youtube.com/youtubei/v1/player",
299+
body=load_asset("youtube_po_token_required.innertube.json.static"),
300+
)
301+
302+
with self.assertRaises(PoTokenRequired):
303+
YouTubeTranscriptApi().fetch("GJLlxj_dtq8")
304+
293305
def test_fetch__exception_request_blocked(self):
294306
httpretty.register_uri(
295307
httpretty.POST,
@@ -426,7 +438,7 @@ def test_fetch__with_generic_proxy_reraise_when_blocked(self, to_requests_dict):
426438

427439
@pytest.mark.skip(
428440
reason="This test is temporarily disabled because cookie auth is currently not "
429-
"working due to YouTube changes."
441+
"working due to YouTube changes."
430442
)
431443
def test_fetch__with_cookies(self):
432444
cookie_path = get_asset_path("example_cookies.txt")
@@ -439,7 +451,7 @@ def test_fetch__with_cookies(self):
439451

440452
@pytest.mark.skip(
441453
reason="This test is temporarily disabled because cookie auth is currently not "
442-
"working due to YouTube changes."
454+
"working due to YouTube changes."
443455
)
444456
def test_load_cookies(self):
445457
cookie_path = get_asset_path("example_cookies.txt")
@@ -454,7 +466,7 @@ def test_load_cookies(self):
454466

455467
@pytest.mark.skip(
456468
reason="This test is temporarily disabled because cookie auth is currently not "
457-
"working due to YouTube changes."
469+
"working due to YouTube changes."
458470
)
459471
def test_load_cookies__bad_file_path(self):
460472
cookie_path = get_asset_path("nonexistent_cookies.txt")
@@ -463,7 +475,7 @@ def test_load_cookies__bad_file_path(self):
463475

464476
@pytest.mark.skip(
465477
reason="This test is temporarily disabled because cookie auth is currently not "
466-
"working due to YouTube changes."
478+
"working due to YouTube changes."
467479
)
468480
def test_load_cookies__no_valid_cookies(self):
469481
cookie_path = get_asset_path("expired_example_cookies.txt")
@@ -743,7 +755,7 @@ def test_get_transcript__with_proxy_config__deprecated(self, to_requests_dict):
743755

744756
@pytest.mark.skip(
745757
reason="This test is temporarily disabled because cookie auth is currently not "
746-
"working due to YouTube changes."
758+
"working due to YouTube changes."
747759
)
748760
def test_get_transcript__with_cookies__deprecated(self):
749761
cookies_path = get_asset_path("example_cookies.txt")
@@ -815,7 +827,7 @@ def test_get_transcripts__continue_on_error__deprecated(self, mock_get_transcrip
815827

816828
@pytest.mark.skip(
817829
reason="This test is temporarily disabled because cookie auth is currently not "
818-
"working due to YouTube changes."
830+
"working due to YouTube changes."
819831
)
820832
@patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript")
821833
def test_get_transcripts__with_cookies__deprecated(self, mock_get_transcript):
@@ -832,6 +844,4 @@ def test_get_transcripts__with_proxies__deprecated(self, mock_get_transcript):
832844
"https": "http://localhost:8080",
833845
}
834846
YouTubeTranscriptApi.get_transcripts(["GJLlxj_dtq8"], proxies=proxies)
835-
mock_get_transcript.assert_any_call(
836-
"GJLlxj_dtq8", ("en",), proxies, False
837-
)
847+
mock_get_transcript.assert_any_call("GJLlxj_dtq8", ("en",), proxies, False)

youtube_transcript_api/test/test_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ def test_run__generic_proxy_config(self):
329329

330330
@pytest.mark.skip(
331331
reason="This test is temporarily disabled because cookie auth is currently not "
332-
"working due to YouTube changes."
332+
"working due to YouTube changes."
333333
)
334334
def test_run__cookies(self):
335335
YouTubeTranscriptCli(

0 commit comments

Comments
 (0)