Skip to content

Commit db46346

Browse files
authored
Merge pull request #434 from jdepoix/bugfix/refactor-to-use-innertube
Refactor transcript scraping to use Android innertube API
2 parents d1de035 + e9c3db6 commit db46346

33 files changed

+14236
-1924
lines changed

README.md

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -375,17 +375,8 @@ ytt_api_2.fetch(video_id)
375375
## Cookie Authentication
376376

377377
Some videos are age-restricted, so this module won't be able to access those videos without some sort of
378-
authentication. To do this, you will need to have access to the desired video in a browser. Then, you will need to
379-
download that page's cookies into a text file. You can use the Chrome extension
380-
[Cookie-Editor](https://chromewebstore.google.com/detail/cookie-editor/hlkenndednhfkekhgcdicdfddnkalmdm?hl=en) and
381-
select "Netscape" during export, or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/).
382-
383-
Once you have that, you can use the following to access age-restricted videos' captions like so.
384-
385-
```python
386-
ytt_api = YouTubeTranscriptApi(cookie_path='/path/to/your/cookies.txt')
387-
ytt_api.fetch(video_id)
388-
```
378+
authentication. Unfortunately, some recent changes to the YouTube API have broken the current implementation of
379+
cookie-based authentication, so this feature is currently not available.
389380

390381
## Using Formatters
391382
Formatters are meant to be an additional layer of processing of the transcript you pass it. The goal is to convert a

poetry.lock

Lines changed: 26 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ optional = true
5757
[tool.poetry.group.test.dependencies]
5858
pytest = "^8.3.3"
5959
coverage = "^7.6.1"
60-
httpretty = "^1.1.4"
60+
httpretty = "<1.1"
6161

6262
[tool.poetry.group.dev]
6363
optional = true

youtube_transcript_api/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,5 @@
2525
InvalidVideoId,
2626
AgeRestricted,
2727
YouTubeDataUnparsable,
28+
PoTokenRequired,
2829
)

youtube_transcript_api/_api.py

Lines changed: 8 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,16 @@
11
import warnings
2-
from pathlib import Path
3-
from typing import Optional, Iterable, Union
4-
5-
from http.cookiejar import MozillaCookieJar, LoadError
2+
from typing import Optional, Iterable
63

74
from requests import Session
85

96
from .proxies import ProxyConfig, GenericProxyConfig
107

118
from ._transcripts import TranscriptListFetcher, FetchedTranscript, TranscriptList
129

13-
from ._errors import CookiePathInvalid, CookieInvalid
14-
15-
16-
def _load_cookie_jar(cookies: Union[Path, str]) -> MozillaCookieJar:
17-
try:
18-
cookie_jar = MozillaCookieJar()
19-
cookie_jar.load(str(cookies))
20-
if not cookie_jar:
21-
raise CookieInvalid(cookies)
22-
return cookie_jar
23-
except (FileNotFoundError, LoadError):
24-
raise CookiePathInvalid(cookies)
25-
2610

2711
class YouTubeTranscriptApi:
2812
def __init__(
2913
self,
30-
cookie_path: Optional[Union[Path, str]] = None,
3114
proxy_config: Optional[ProxyConfig] = None,
3215
http_client: Optional[Session] = None,
3316
):
@@ -36,7 +19,6 @@ def __init__(
3619
object, it is not thread-safe. Make sure to initialize an instance of
3720
`YouTubeTranscriptApi` per thread, if used in a multi-threading scenario!
3821
39-
:param cookie_path: Path to a text file containing YouTube authorization cookies
4022
:param proxy_config: an optional ProxyConfig object, defining proxies used for
4123
all network requests. This can be used to work around your IP being blocked
4224
by YouTube, as described in the "Working around IP bans" section of the
@@ -48,8 +30,10 @@ def __init__(
4830
"""
4931
http_client = Session() if http_client is None else http_client
5032
http_client.headers.update({"Accept-Language": "en-US"})
51-
if cookie_path is not None:
52-
http_client.cookies = _load_cookie_jar(cookie_path)
33+
# Cookie auth has been temporarily disabled, as it is not working properly with
34+
# YouTube's most recent changes.
35+
# if cookie_path is not None:
36+
# http_client.cookies = _load_cookie_jar(cookie_path)
5337
if proxy_config is not None:
5438
http_client.proxies = proxy_config.to_requests_dict()
5539
if proxy_config.prevent_keeping_connections_alive:
@@ -135,7 +119,7 @@ def list(
135119
return self._fetcher.fetch(video_id)
136120

137121
@classmethod
138-
def list_transcripts(cls, video_id, proxies=None, cookies=None):
122+
def list_transcripts(cls, video_id, proxies=None):
139123
"""
140124
DEPRECATED: use the `list` method instead!
141125
@@ -180,8 +164,6 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
180164
:type video_id: str
181165
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
182166
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
183-
:param cookies: a string of the path to a text file containing youtube authorization cookies
184-
:type cookies: str
185167
:return: the list of available transcripts
186168
:rtype TranscriptList:
187169
"""
@@ -202,7 +184,6 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
202184

203185
ytt_api = YouTubeTranscriptApi(
204186
proxy_config=proxy_config,
205-
cookie_path=Path(cookies) if cookies else None,
206187
)
207188
return ytt_api.list(video_id)
208189

@@ -213,7 +194,6 @@ def get_transcripts(
213194
languages=("en",),
214195
continue_after_error=False,
215196
proxies=None,
216-
cookies=None,
217197
preserve_formatting=False,
218198
):
219199
"""
@@ -232,8 +212,6 @@ def get_transcripts(
232212
:type continue_after_error: bool
233213
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
234214
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
235-
:param cookies: a string of the path to a text file containing youtube authorization cookies
236-
:type cookies: str
237215
:param preserve_formatting: whether to keep select HTML text formatting
238216
:type preserve_formatting: bool
239217
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
@@ -254,7 +232,7 @@ def get_transcripts(
254232
for video_id in video_ids:
255233
try:
256234
data[video_id] = cls.get_transcript(
257-
video_id, languages, proxies, cookies, preserve_formatting
235+
video_id, languages, proxies, preserve_formatting
258236
)
259237
except Exception as exception:
260238
if not continue_after_error:
@@ -270,7 +248,6 @@ def get_transcript(
270248
video_id,
271249
languages=("en",),
272250
proxies=None,
273-
cookies=None,
274251
preserve_formatting=False,
275252
):
276253
"""
@@ -288,8 +265,6 @@ def get_transcript(
288265
:type languages: list[str]
289266
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
290267
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
291-
:param cookies: a string of the path to a text file containing youtube authorization cookies
292-
:type cookies: str
293268
:param preserve_formatting: whether to keep select HTML text formatting
294269
:type preserve_formatting: bool
295270
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
@@ -303,7 +278,7 @@ def get_transcript(
303278

304279
assert isinstance(video_id, str), "`video_id` must be a string"
305280
return (
306-
cls.list_transcripts(video_id, proxies, cookies)
281+
cls.list_transcripts(video_id, proxies)
307282
.find_transcript(languages)
308283
.fetch(preserve_formatting=preserve_formatting)
309284
.to_raw_data()

youtube_transcript_api/_cli.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,11 @@ def run(self) -> str:
3333
proxy_password=parsed_args.webshare_proxy_password,
3434
)
3535

36-
cookie_path = parsed_args.cookies
37-
3836
transcripts = []
3937
exceptions = []
4038

4139
ytt_api = YouTubeTranscriptApi(
4240
proxy_config=proxy_config,
43-
cookie_path=cookie_path,
4441
)
4542

4643
for video_id in parsed_args.video_ids:
@@ -179,11 +176,13 @@ def _parse_args(self):
179176
metavar="URL",
180177
help="Use the specified HTTPS proxy.",
181178
)
182-
parser.add_argument(
183-
"--cookies",
184-
default=None,
185-
help="The cookie file that will be used for authorization with youtube.",
186-
)
179+
# Cookie auth has been temporarily disabled, as it is not working properly with
180+
# YouTube's most recent changes.
181+
# parser.add_argument(
182+
# "--cookies",
183+
# default=None,
184+
# help="The cookie file that will be used for authorization with youtube.",
185+
# )
187186

188187
return self._sanitize_video_ids(parser.parse_args(self._args))
189188

0 commit comments

Comments
 (0)