Replication of original PR #434: Refactor transcript scraping to use Android innertube API #1

Status: Open. This pull request wants to merge 4 commits into the base branch replicated-pr-434-base.
13 changes: 2 additions & 11 deletions README.md
@@ -375,17 +375,8 @@ ytt_api_2.fetch(video_id)
## Cookie Authentication

Some videos are age restricted, so this module won't be able to access those videos without some sort of
authentication. To do this, you will need to have access to the desired video in a browser. Then, you will need to
download that pages cookies into a text file. You can use the Chrome extension
[Cookie-Editor](https://chromewebstore.google.com/detail/cookie-editor/hlkenndednhfkekhgcdicdfddnkalmdm?hl=en) and
select "Netscape" during export, or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/).

Once you have that, you can use the following to access age-restricted videos' captions like so.

```python
ytt_api = YouTubeTranscriptApi(cookie_path='/path/to/your/cookies.txt')
ytt_api.fetch(video_id)
```
authentication. Unfortunately, some recent changes to the YouTube API have broken the current implementation of cookie
based authentication, so this feature is currently not available.

## Using Formatters
Formatters are meant to be an additional layer of processing of the transcript you pass it. The goal is to convert a
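
Since the README section above drops the cookie-based example and only states that the feature is unavailable, the remaining cookie-free usage is a plain fetch. The snippet below is a minimal sketch based on the `ytt_api_2.fetch(video_id)` call visible in the hunk header; the video ID is a placeholder, and the snippet attributes (`text`, `start`) are assumed from the library's `FetchedTranscript` type.

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Cookie authentication is disabled in this PR, so the API is constructed
# without any cookie_path argument and works only for videos that need no auth.
ytt_api = YouTubeTranscriptApi()
fetched = ytt_api.fetch("dQw4w9WgXcQ")  # placeholder video ID

for snippet in fetched:  # assumes FetchedTranscript is iterable over snippets
    print(f"{snippet.start:7.2f}s  {snippet.text}")
```
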
33 changes: 26 additions & 7 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -57,7 +57,7 @@ optional = true
[tool.poetry.group.test.dependencies]
pytest = "^8.3.3"
coverage = "^7.6.1"
httpretty = "^1.1.4"
httpretty = "<1.1"

[tool.poetry.group.dev]
optional = true
1 change: 1 addition & 0 deletions youtube_transcript_api/__init__.py
@@ -25,4 +25,5 @@
InvalidVideoId,
AgeRestricted,
YouTubeDataUnparsable,
PoTokenRequired,
)
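
The newly exported `PoTokenRequired` error suggests callers can now catch it explicitly. A hedged sketch, assuming the exception is raised by `fetch()` when YouTube's innertube API demands a proof-of-origin token (the diff only shows the export, not the raising code):

```python
from youtube_transcript_api import YouTubeTranscriptApi, PoTokenRequired

ytt_api = YouTubeTranscriptApi()
try:
    transcript = ytt_api.fetch("dQw4w9WgXcQ")  # placeholder video ID
except PoTokenRequired as error:
    # Assumption: raised when a proof-of-origin (PO) token is required;
    # this PR does not document how such a token would be supplied.
    print(f"PO token required, request rejected: {error}")
```
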
41 changes: 8 additions & 33 deletions youtube_transcript_api/_api.py
@@ -1,33 +1,16 @@
import warnings
from pathlib import Path
from typing import Optional, Iterable, Union

from http.cookiejar import MozillaCookieJar, LoadError
from typing import Optional, Iterable

from requests import Session

from .proxies import ProxyConfig, GenericProxyConfig

from ._transcripts import TranscriptListFetcher, FetchedTranscript, TranscriptList

from ._errors import CookiePathInvalid, CookieInvalid


def _load_cookie_jar(cookies: Union[Path, str]) -> MozillaCookieJar:
try:
cookie_jar = MozillaCookieJar()
cookie_jar.load(str(cookies))
if not cookie_jar:
raise CookieInvalid(cookies)
return cookie_jar
except (FileNotFoundError, LoadError):
raise CookiePathInvalid(cookies)


class YouTubeTranscriptApi:
def __init__(
self,
cookie_path: Optional[Union[Path, str]] = None,
proxy_config: Optional[ProxyConfig] = None,
http_client: Optional[Session] = None,
):
@@ -36,7 +19,6 @@ def __init__(
object, it is not thread-safe. Make sure to initialize an instance of
`YouTubeTranscriptApi` per thread, if used in a multi-threading scenario!

:param cookie_path: Path to a text file containing YouTube authorization cookies
:param proxy_config: an optional ProxyConfig object, defining proxies used for
all network requests. This can be used to work around your IP being blocked
by YouTube, as described in the "Working around IP bans" section of the
@@ -48,8 +30,10 @@
"""
http_client = Session() if http_client is None else http_client
http_client.headers.update({"Accept-Language": "en-US"})
if cookie_path is not None:
http_client.cookies = _load_cookie_jar(cookie_path)
# Cookie auth has been temporarily disabled, as it is not working properly with
# YouTube's most recent changes.
# if cookie_path is not None:
# http_client.cookies = _load_cookie_jar(cookie_path)
if proxy_config is not None:
http_client.proxies = proxy_config.to_requests_dict()
if proxy_config.prevent_keeping_connections_alive:
@@ -135,7 +119,7 @@ def list(
return self._fetcher.fetch(video_id)

@classmethod
def list_transcripts(cls, video_id, proxies=None, cookies=None):
def list_transcripts(cls, video_id, proxies=None):
"""
DEPRECATED: use the `list` method instead!

@@ -180,8 +164,6 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
:type video_id: str
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:return: the list of available transcripts
:rtype TranscriptList:
"""
@@ -202,7 +184,6 @@

ytt_api = YouTubeTranscriptApi(
proxy_config=proxy_config,
cookie_path=Path(cookies) if cookies else None,
)
return ytt_api.list(video_id)

@@ -213,7 +194,6 @@ def get_transcripts(
languages=("en",),
continue_after_error=False,
proxies=None,
cookies=None,
preserve_formatting=False,
):
"""
@@ -232,8 +212,6 @@
:type continue_after_error: bool
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
@@ -254,7 +232,7 @@ def get_transcripts(
for video_id in video_ids:
try:
data[video_id] = cls.get_transcript(
video_id, languages, proxies, cookies, preserve_formatting
video_id, languages, proxies, preserve_formatting
)
except Exception as exception:
if not continue_after_error:
@@ -270,7 +248,6 @@ def get_transcript(
video_id,
languages=("en",),
proxies=None,
cookies=None,
preserve_formatting=False,
):
"""
@@ -288,8 +265,6 @@ def get_transcript(
:type languages: list[str]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
@@ -303,7 +278,7 @@

assert isinstance(video_id, str), "`video_id` must be a string"
return (
cls.list_transcripts(video_id, proxies, cookies)
cls.list_transcripts(video_id, proxies)
.find_transcript(languages)
.fetch(preserve_formatting=preserve_formatting)
.to_raw_data()
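
With `cookie_path` removed, the constructor is driven by `proxy_config` and `http_client` alone. A sketch of the post-change call, assuming `GenericProxyConfig` (imported in the diff above) accepts `http_url`/`https_url` keyword arguments; the proxy URLs and video ID are placeholders:

```python
from requests import Session

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import GenericProxyConfig

ytt_api = YouTubeTranscriptApi(
    proxy_config=GenericProxyConfig(
        http_url="http://user:pass@proxy.example.com:8080",    # placeholder
        https_url="https://user:pass@proxy.example.com:8080",  # placeholder
    ),
    http_client=Session(),  # optional; omit to let the API create its own Session
)
transcript_list = ytt_api.list("dQw4w9WgXcQ")  # placeholder video ID
```
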
15 changes: 7 additions & 8 deletions youtube_transcript_api/_cli.py
@@ -33,14 +33,11 @@ def run(self) -> str:
proxy_password=parsed_args.webshare_proxy_password,
)

cookie_path = parsed_args.cookies

transcripts = []
exceptions = []

ytt_api = YouTubeTranscriptApi(
proxy_config=proxy_config,
cookie_path=cookie_path,
)

for video_id in parsed_args.video_ids:
@@ -179,11 +176,13 @@ def _parse_args(self):
metavar="URL",
help="Use the specified HTTPS proxy.",
)
parser.add_argument(
"--cookies",
default=None,
help="The cookie file that will be used for authorization with youtube.",
)
# Cookie auth has been temporarily disabled, as it is not working properly with
# YouTube's most recent changes.
# parser.add_argument(
# "--cookies",
# default=None,
# help="The cookie file that will be used for authorization with youtube.",
# )

return self._sanitize_video_ids(parser.parse_args(self._args))

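
With the `--cookies` option commented out, a CLI invocation is reduced to video IDs plus the remaining flags such as `--languages` and the proxy options. A sketch via `subprocess`, assuming the console script is installed under the name `youtube_transcript_api`:

```python
import subprocess

# Hypothetical invocation after this change: no --cookies flag is passed.
subprocess.run(
    ["youtube_transcript_api", "dQw4w9WgXcQ", "--languages", "en"],
    check=True,
)
```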