diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index d30faf94ceaeb809220c4c4c2953884f225860dc..7f514d35a6527b24f99073388681176e320101ce 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2,6 +2,7 @@
 
 from __future__ import unicode_literals
 
+import hashlib
 import itertools
 import json
 import os.path
@@ -274,7 +275,7 @@ def _real_initialize(self):
         'context': {
             'client': {
                 'clientName': 'WEB',
-                'clientVersion': '2.20201021.03.00',
+                'clientVersion': '2.20210301.08.00',
             }
         },
     }
@@ -283,15 +284,28 @@ def _real_initialize(self):
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
     _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
-    def _call_api(self, ep, query, video_id, fatal=True):
+    def _generate_sapisidhash_header(self):
+        sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
+        if sapisid_cookie is None:
+            return
+        time_now = round(time.time())
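+        # SAPISIDHASH auth: SHA-1 over "<unix time> <SAPISID cookie> <origin>",
+        # sent as "SAPISIDHASH <time>_<digest>"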
+        sapisidhash = hashlib.sha1((str(time_now) + " " + sapisid_cookie.value + " " + "https://www.youtube.com").encode("utf-8")).hexdigest()
+        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
+
+    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
+                  note='Downloading API JSON', errnote='Unable to download API page'):
         data = self._DEFAULT_API_DATA.copy()
         data.update(query)
+        headers = headers or {}
+        headers.update({'content-type': 'application/json'})
+        auth = self._generate_sapisidhash_header()
+        if auth is not None:
+            headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
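+        # request goes to the /youtubei/v1/ ("innertube") endpoint; the static
+        # 'key' in the query below is the web client's public API key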
 
         return self._download_json(
-            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
-            note='Downloading API JSON', errnote='Unable to download API page',
-            data=json.dumps(data).encode('utf8'), fatal=fatal,
-            headers={'content-type': 'application/json'},
+            'https://www.youtube.com/youtubei/v1/%s' % ep,
+            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
+            data=json.dumps(data).encode('utf8'), headers=headers,
             query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
 
     def _extract_yt_initial_data(self, video_id, webpage):
@@ -1452,8 +1466,10 @@ def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
         base_url = self.http_scheme() + '//www.youtube.com/'
-        webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1&bpctr=9999999999'
-        webpage = self._download_webpage(webpage_url, video_id, fatal=False)
+        webpage_url = base_url + 'watch?v=' + video_id
+        webpage = self._download_webpage(
+            webpage_url + '&has_verified=1&bpctr=9999999999',
+            video_id, fatal=False)
 
         player_response = None
         if webpage:
@@ -2010,9 +2026,10 @@ def chapter_time(mmlir):
 
         # Get comments
         # TODO: Refactor and move to seperate function
-        if get_comments:
+        def extract_comments():
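+            # invoked lazily via info['__post_extractor'] (see below)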
             expected_video_comment_count = 0
             video_comments = []
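+            # local copy: the nested closures below cannot rebind the outer xsrf_token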
+            comment_xsrf = xsrf_token
 
             def find_value(html, key, num_chars=2, separator='"'):
                 pos_begin = html.find(key) + len(key) + num_chars
@@ -2081,7 +2098,7 @@ def get_continuation(continuation, session_token, replies=False):
             self.to_screen('Downloading comments')
             while continuations:
                 continuation = continuations.pop()
-                comment_response = get_continuation(continuation, xsrf_token)
+                comment_response = get_continuation(continuation, comment_xsrf)
                 if not comment_response:
                     continue
                 if list(search_dict(comment_response, 'externalErrorMessage')):
@@ -2092,7 +2109,7 @@ def get_continuation(continuation, session_token, replies=False):
                     continue
                 # not sure if this actually helps
                 if 'xsrf_token' in comment_response:
-                    xsrf_token = comment_response['xsrf_token']
+                    comment_xsrf = comment_response['xsrf_token']
 
                 item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
                 if first_continuation:
@@ -2121,7 +2138,7 @@ def get_continuation(continuation, session_token, replies=False):
                     while reply_continuations:
                         time.sleep(1)
                         continuation = reply_continuations.pop()
-                        replies_data = get_continuation(continuation, xsrf_token, True)
+                        replies_data = get_continuation(continuation, comment_xsrf, True)
                         if not replies_data or 'continuationContents' not in replies_data[1]['response']:
                             continue
 
@@ -2150,10 +2167,13 @@ def get_continuation(continuation, session_token, replies=False):
                 time.sleep(1)
 
             self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
-            info.update({
+            return {
                 'comments': video_comments,
                 'comment_count': expected_video_comment_count
-            })
+            }
+
+        if get_comments:
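+            # deferred: the core invokes this callable during processing, so
+            # comments are only downloaded when the video is actually handled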
+            info['__post_extractor'] = extract_comments
 
         self.mark_watched(video_id, player_response)
 
@@ -2693,7 +2713,7 @@ def _extract_continuation(cls, renderer):
             ctp = continuation_ep.get('clickTrackingParams')
             return YoutubeTabIE._build_continuation_query(continuation, ctp)
 
-    def _entries(self, tab, identity_token):
+    def _entries(self, tab, identity_token, item_id):
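+        # item_id is only used to label the per-page API requests in the logs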
 
         def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
             contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
@@ -2756,27 +2776,39 @@ def extract_entries(parent_renderer):  # this needs to called again for continua
         for page_num in itertools.count(1):
             if not continuation:
                 break
-            count = 0
-            retries = 3
-            while count <= retries:
+            retries = self._downloader.params.get('extractor_retries', 3)
+            count = -1
+            last_error = None
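+            # count == 0 is the initial attempt; up to 'retries' retries follow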
+            while count < retries:
+                count += 1
+                if last_error:
+                    self.report_warning('%s. Retrying ...' % last_error)
                 try:
-                    # Downloading page may result in intermittent 5xx HTTP error
-                    # that is usually worked around with a retry
-                    browse = self._download_json(
-                        'https://www.youtube.com/browse_ajax', None,
-                        'Downloading page %d%s'
-                        % (page_num, ' (retry #%d)' % count if count else ''),
-                        headers=headers, query=continuation)
-                    break
+                    response = self._call_api(
+                        ep="browse", fatal=True, headers=headers,
+                        video_id='%s page %s' % (item_id, page_num),
+                        query={
+                            'continuation': continuation['continuation'],
+                            'clickTracking': {'clickTrackingParams': continuation['itct']},
+                        },
+                        note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
                 except ExtractorError as e:
-                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
-                        count += 1
-                        if count <= retries:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
+                        # Downloading page may result in intermittent 5xx HTTP error
+                        # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
+                        last_error = 'HTTP Error %s' % e.cause.code
+                        if count < retries:
                             continue
                     raise
-            if not browse:
-                break
-            response = try_get(browse, lambda x: x[1]['response'], dict)
+                else:
+                    # Youtube sometimes sends incomplete data
+                    # See: https://github.com/ytdl-org/youtube-dl/issues/28194
+                    if response.get('continuationContents') or response.get('onResponseReceivedActions'):
+                        break
+                    last_error = 'Incomplete data received'
+                    if count >= retries:
+                        self._downloader.report_error(last_error)
+
             if not response:
                 break
 
@@ -2920,7 +2952,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
             'channel_id': metadata['uploader_id'],
             'channel_url': metadata['uploader_url']})
         return self.playlist_result(
-            self._entries(selected_tab, identity_token),
+            self._entries(selected_tab, identity_token, playlist_id),
             **metadata)
 
     def _extract_from_playlist(self, item_id, url, data, playlist):
@@ -2998,11 +3030,18 @@ def _real_extract(self, url):
                 return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
             self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
 
-        count = 0
-        retries = 3
+        retries = self._downloader.params.get('extractor_retries', 3)
+        count = -1
+        last_error = 'Incomplete yt initial data received'
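+        # same retry pattern as in _entries: count == 0 is the initial attempt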
         while count < retries:
+            count += 1
             # Sometimes youtube returns a webpage with incomplete ytInitialData
-            webpage = self._download_webpage(url, item_id)
+            # See: https://github.com/yt-dlp/yt-dlp/issues/116
+            if count:
+                self.report_warning('%s. Retrying ...' % last_error)
+            webpage = self._download_webpage(
+                url, item_id,
+                'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
             identity_token = self._extract_identity_token(webpage, item_id)
             data = self._extract_yt_initial_data(item_id, webpage)
             err_msg = None
@@ -3017,9 +3056,8 @@ def _real_extract(self, url):
                 raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
             if data.get('contents') or data.get('currentVideoEndpoint'):
                 break
-            count += 1
-            self.to_screen(
-                'Incomplete yt initial data recieved. Retrying (attempt %d of %d)...' % (count, retries))
+            if count >= retries:
+                self._downloader.report_error(last_error)
 
         tabs = try_get(
             data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
@@ -3201,26 +3239,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
     _TESTS = []
 
     def _entries(self, query, n):
-        data = {
-            'context': {
-                'client': {
-                    'clientName': 'WEB',
-                    'clientVersion': '2.20201021.03.00',
-                }
-            },
-            'query': query,
-        }
+        data = {'query': query}
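+        # the client context is now supplied by _call_api via _DEFAULT_API_DATA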
         if self._SEARCH_PARAMS:
             data['params'] = self._SEARCH_PARAMS
         total = 0
         for page_num in itertools.count(1):
-            search = self._download_json(
-                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
-                video_id='query "%s"' % query,
-                note='Downloading page %s' % page_num,
-                errnote='Unable to download API page', fatal=False,
-                data=json.dumps(data).encode('utf8'),
-                headers={'content-type': 'application/json'})
+            search = self._call_api(
+                ep='search', video_id='query "%s"' % query, fatal=False,
+                note='Downloading page %s' % page_num, query=data)
             if not search:
                 break
             slr_contents = try_get(
@@ -3372,8 +3398,8 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
 
 
 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = r':ythistory'
+    IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+    _VALID_URL = r':ythis(?:tory)?'
     _FEED_NAME = 'history'
     _TESTS = [{
         'url': ':ythistory',