from __future__ import unicode_literals
+import hashlib
import itertools
import json
import os.path
'context': {
'client': {
'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
+ 'clientVersion': '2.20210301.08.00',
}
},
}
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
- def _call_api(self, ep, query, video_id, fatal=True):
+ def _generate_sapisidhash_header(self):
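+ # SAPISIDHASH authorization: SHA-1 over "<unix time> <SAPISID cookie> <origin>",
+ # sent as "Authorization: SAPISIDHASH <time>_<hexdigest>"; undocumented by
+ # Google but widely reverse-engineered for their web APIs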
+ sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
+ if sapisid_cookie is None:
+ return
+ time_now = round(time.time())
+ sapisidhash = hashlib.sha1((str(time_now) + ' ' + sapisid_cookie.value + ' https://www.youtube.com').encode('utf-8')).hexdigest()
+ return 'SAPISIDHASH %s_%s' % (time_now, sapisidhash)
+
+ def _call_api(self, ep, query, video_id, fatal=True, headers=None,
+ note='Downloading API JSON', errnote='Unable to download API page'):
data = self._DEFAULT_API_DATA.copy()
data.update(query)
+ headers = headers or {}
+ headers.update({'content-type': 'application/json'})
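+ # attach SAPISIDHASH auth only when a SAPISID cookie is present,
+ # i.e. when the user is logged in; anonymous requests omit the header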
+ auth = self._generate_sapisidhash_header()
+ if auth is not None:
+ headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
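+ # the key below is the public innertube API key embedded in the YouTube
+ # web client, not a per-user secret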
return self._download_json(
- 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
- note='Downloading API JSON', errnote='Unable to download API page',
- data=json.dumps(data).encode('utf8'), fatal=fatal,
- headers={'content-type': 'application/json'},
+ 'https://www.youtube.com/youtubei/v1/%s' % ep,
+ video_id=video_id, fatal=fatal, note=note, errnote=errnote,
+ data=json.dumps(data).encode('utf8'), headers=headers,
query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
def _extract_yt_initial_data(self, video_id, webpage):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
base_url = self.http_scheme() + '//www.youtube.com/'
- webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1&bpctr=9999999999'
- webpage = self._download_webpage(webpage_url, video_id, fatal=False)
+ webpage_url = base_url + 'watch?v=' + video_id
+ webpage = self._download_webpage(
+ webpage_url + '&has_verified=1&bpctr=9999999999',
+ video_id, fatal=False)
player_response = None
if webpage:
# Get comments
# TODO: Refactor and move to separate function
- if get_comments:
+ def extract_comments():
expected_video_comment_count = 0
video_comments = []
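+ # work on a local copy of the XSRF token: assigning to the outer name
+ # inside this closure would make it local and raise UnboundLocalError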
+ comment_xsrf = xsrf_token
def find_value(html, key, num_chars=2, separator='"'):
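+ # crude HTML scraping helper: skip past `key` plus `num_chars` of punctuation
+ # to the start of the value (which presumably runs up to `separator`)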
pos_begin = html.find(key) + len(key) + num_chars
self.to_screen('Downloading comments')
while continuations:
continuation = continuations.pop()
- comment_response = get_continuation(continuation, xsrf_token)
+ comment_response = get_continuation(continuation, comment_xsrf)
if not comment_response:
continue
if list(search_dict(comment_response, 'externalErrorMessage')):
continue
# not sure if this actually helps
if 'xsrf_token' in comment_response:
- xsrf_token = comment_response['xsrf_token']
+ comment_xsrf = comment_response['xsrf_token']
item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
if first_continuation:
while reply_continuations:
time.sleep(1)
continuation = reply_continuations.pop()
- replies_data = get_continuation(continuation, xsrf_token, True)
+ replies_data = get_continuation(continuation, comment_xsrf, True)
if not replies_data or 'continuationContents' not in replies_data[1]['response']:
continue
time.sleep(1)
self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
- info.update({
+ return {
'comments': video_comments,
'comment_count': expected_video_comment_count
- })
+ }
+
+ if get_comments:
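+ # defer the (slow) comment download: the downloader calls this after the
+ # main extraction finishes and merges the returned dict into the result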
+ info['__post_extractor'] = extract_comments
self.mark_watched(video_id, player_response)
ctp = continuation_ep.get('clickTrackingParams')
return YoutubeTabIE._build_continuation_query(continuation, ctp)
- def _entries(self, tab, identity_token):
+ def _entries(self, tab, identity_token, item_id):
def extract_entries(parent_renderer): # this needs to be called again for continuation to work with feeds
contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
for page_num in itertools.count(1):
if not continuation:
break
- count = 0
- retries = 3
- while count <= retries:
+ retries = self._downloader.params.get('extractor_retries', 3)
+ count = -1
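+ # count starts at -1 so the first pass (count == 0) is the initial
+ # attempt and only later passes are reported as retries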
+ last_error = None
+ while count < retries:
+ count += 1
+ if last_error:
+ self.report_warning('%s. Retrying ...' % last_error)
try:
- # Downloading page may result in intermittent 5xx HTTP error
- # that is usually worked around with a retry
- browse = self._download_json(
- 'https://www.youtube.com/browse_ajax', None,
- 'Downloading page %d%s'
- % (page_num, ' (retry #%d)' % count if count else ''),
- headers=headers, query=continuation)
- break
+ response = self._call_api(
+ ep="browse", fatal=True, headers=headers,
+ video_id='%s page %s' % (item_id, page_num),
+ query={
+ 'continuation': continuation['continuation'],
+ 'clickTracking': {'clickTrackingParams': continuation['itct']},
+ },
+ note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
- count += 1
- if count <= retries:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
+ # Downloading a page may result in intermittent 5xx HTTP errors
+ # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
+ last_error = 'HTTP Error %s' % e.cause.code
+ if count < retries:
continue
raise
- if not browse:
- break
- response = try_get(browse, lambda x: x[1]['response'], dict)
+ else:
+ # Youtube sometimes sends incomplete data
+ # See: https://github.com/ytdl-org/youtube-dl/issues/28194
+ if response.get('continuationContents') or response.get('onResponseReceivedActions'):
+ break
+ last_error = 'Incomplete data received'
+ if count >= retries:
+ self._downloader.report_error(last_error)
+
if not response:
break
'channel_id': metadata['uploader_id'],
'channel_url': metadata['uploader_url']})
return self.playlist_result(
- self._entries(selected_tab, identity_token),
+ self._entries(selected_tab, identity_token, playlist_id),
**metadata)
def _extract_from_playlist(self, item_id, url, data, playlist):
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- count = 0
- retries = 3
+ retries = self._downloader.params.get('extractor_retries', 3)
+ count = -1
+ last_error = 'Incomplete yt initial data received'
while count < retries:
+ count += 1
# Sometimes youtube returns a webpage with incomplete ytInitialData
- webpage = self._download_webpage(url, item_id)
+ # See: https://github.com/yt-dlp/yt-dlp/issues/116
+ if count:
+ self.report_warning('%s. Retrying ...' % last_error)
+ webpage = self._download_webpage(
+ url, item_id,
+ 'Downloading webpage%s' % (' (retry #%d)' % count if count else ''))
identity_token = self._extract_identity_token(webpage, item_id)
data = self._extract_yt_initial_data(item_id, webpage)
err_msg = None
raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
if data.get('contents') or data.get('currentVideoEndpoint'):
break
- count += 1
- self.to_screen(
- 'Incomplete yt initial data recieved. Retrying (attempt %d of %d)...' % (count, retries))
+ if count >= retries:
+ self._downloader.report_error(last_error)
tabs = try_get(
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
_TESTS = []
def _entries(self, query, n):
- data = {
- 'context': {
- 'client': {
- 'clientName': 'WEB',
- 'clientVersion': '2.20201021.03.00',
- }
- },
- 'query': query,
- }
+ data = {'query': query}
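+ # the client context now comes from _DEFAULT_API_DATA inside _call_api,
+ # so only the search-specific fields are built here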
if self._SEARCH_PARAMS:
data['params'] = self._SEARCH_PARAMS
total = 0
for page_num in itertools.count(1):
- search = self._download_json(
- 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
- video_id='query "%s"' % query,
- note='Downloading page %s' % page_num,
- errnote='Unable to download API page', fatal=False,
- data=json.dumps(data).encode('utf8'),
- headers={'content-type': 'application/json'})
+ search = self._call_api(
+ ep='search', video_id='query "%s"' % query, fatal=False,
+ note='Downloading page %s' % page_num, query=data)
if not search:
break
slr_contents = try_get(
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = r':ythistory'
+ IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+ _VALID_URL = r':ythis(?:tory)?'
_FEED_NAME = 'history'
_TESTS = [{
'url': ':ythistory',