jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/youtube.py
[cleanup] Revert unnecessary changes in 51d9739f8031fb37d8e25b0e9f1abea561e3d2e3
[yt-dlp.git] / yt_dlp / extractor / youtube.py
index 1d50264b6d3a3c2d273d4ccec1516cdaef6fbe49..375fc19096715361b6610ddcaca0c5c12752d066 100644 (file)
@@ -67,8 +67,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
 
     _RESERVED_NAMES = (
-        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|'
-        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|'
+        r'channel|c|user|browse|playlist|watch|w|v|embed|e|watch_popup|shorts|'
+        r'movies|results|shared|hashtag|trending|feed|feeds|oembed|get_video_info|'
         r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout')
 
     _NETRC_MACHINE = 'youtube'
@@ -85,7 +85,20 @@ def _login(self):
 
         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
         """
+
+        def warn(message):
+            self.report_warning(message)
+
+        # username+password login is broken
+        if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
+            self.raise_login_required(
+                'Login details are needed to download this content', method='cookies')
         username, password = self._get_login_info()
+        if username:
+            warn('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
+        return
+        # Everything below this is broken!
+
         # No authentication to be performed
         if username is None:
             if self._LOGIN_REQUIRED and self.get_param('cookiefile') is None:
@@ -126,9 +139,6 @@ def req(url, f_req, note, errnote):
                     'Google-Accounts-XSRF': 1,
                 })
 
-        def warn(message):
-            self.report_warning(message)
-
         lookup_req = [
             username,
             None, [], None, 'US', None, None, 2, False, True,
@@ -291,12 +301,22 @@ def _real_initialize(self):
     _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
     def _generate_sapisidhash_header(self):
-        sapisid_cookie = self._get_cookies('https://www.youtube.com').get('SAPISID')
+        # Sometimes SAPISID cookie isn't present but __Secure-3PAPISID is.
+        # See: https://github.com/yt-dlp/yt-dlp/issues/393
+        yt_cookies = self._get_cookies('https://www.youtube.com')
+        sapisid_cookie = dict_get(
+            yt_cookies, ('__Secure-3PAPISID', 'SAPISID'))
         if sapisid_cookie is None:
             return
         time_now = round(time.time())
-        sapisidhash = hashlib.sha1((str(time_now) + " " + sapisid_cookie.value + " " + "https://www.youtube.com").encode("utf-8")).hexdigest()
-        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
+        # SAPISID cookie is required if not already present
+        if not yt_cookies.get('SAPISID'):
+            self._set_cookie(
+                '.youtube.com', 'SAPISID', sapisid_cookie.value, secure=True, expire_time=time_now + 3600)
+        # SAPISIDHASH algorithm from https://stackoverflow.com/a/32065323
+        sapisidhash = hashlib.sha1(
+            f'{time_now} {sapisid_cookie.value} https://www.youtube.com'.encode('utf-8')).hexdigest()
+        return f'SAPISIDHASH {time_now}_{sapisidhash}'
 
     def _call_api(self, ep, query, video_id, fatal=True, headers=None,
                   note='Downloading API JSON', errnote='Unable to download API page',
@@ -1856,6 +1876,16 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
             'comment_count': len(comments),
         }
 
+    @staticmethod
+    def _get_video_info_params(video_id):
+        return {
+            'video_id': video_id,
+            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+            'html5': '1',
+            'c': 'TVHTML5',
+            'cver': '6.20180913',
+        }
+
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
@@ -1888,12 +1918,11 @@ def get_text(x):
                     base_url + 'get_video_info', video_id,
                     'Fetching youtube music info webpage',
                     'unable to download youtube music info webpage', query={
-                        'video_id': video_id,
-                        'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                        **self._get_video_info_params(video_id),
                         'el': 'detailpage',
                         'c': 'WEB_REMIX',
                         'cver': '0.1',
-                        'cplayer': 'UNIPLAYER'
+                        'cplayer': 'UNIPLAYER',
                     }, fatal=False)),
                 lambda x: x['player_response'][0],
                 compat_str) or '{}', video_id)
@@ -1915,11 +1944,8 @@ def get_text(x):
             pr = self._parse_json(try_get(compat_parse_qs(
                 self._download_webpage(
                     base_url + 'get_video_info', video_id,
-                    'Refetching age-gated info webpage',
-                    'unable to download video info webpage', query={
-                        'video_id': video_id,
-                        'eurl': 'https://youtube.googleapis.com/v/' + video_id,
-                    }, fatal=False)),
+                    'Refetching age-gated info webpage', 'unable to download video info webpage',
+                    query=self._get_video_info_params(video_id), fatal=False)),
                 lambda x: x['player_response'][0],
                 compat_str) or '{}', video_id)
             if pr:
@@ -1994,7 +2020,10 @@ def feed_entry(name):
         formats, itags, stream_ids = [], [], []
         itag_qualities = {}
         player_url = None
-        q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
+        q = qualities([
+            'tiny', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high',  # Audio only formats
+            'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
+        ])
 
         streaming_data = player_response.get('streamingData') or {}
         streaming_formats = streaming_data.get('formats') or []
@@ -2013,6 +2042,8 @@ def feed_entry(name):
                 continue
 
             quality = fmt.get('quality')
+            if quality == 'tiny' or not quality:
+                quality = fmt.get('audioQuality', '').lower() or quality
             if itag and quality:
                 itag_qualities[itag] = quality
             # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
@@ -2090,7 +2121,7 @@ def feed_entry(name):
                         r'/itag/(\d+)', f['url'], 'itag', default=None)
                     if itag:
                         f['format_id'] = itag
-                formats.append(f)
+                    formats.append(f)
 
         if self.get_param('youtube_include_dash_manifest', True):
             for sd in (streaming_data, ytm_streaming_data):
@@ -2102,9 +2133,6 @@ def feed_entry(name):
                         if itag in itags:
                             continue
                         if itag in itag_qualities:
-                            # Not actually usefull since the sorting is already done with "quality,res,fps,codec"
-                            # but kept to maintain feature parity (and code similarity) with youtube-dl
-                            # Remove if this causes any issues with sorting in future
                             f['quality'] = q(itag_qualities[itag])
                         filesize = int_or_none(self._search_regex(
                             r'/clen/(\d+)', f.get('fragment_base_url')
@@ -2169,16 +2197,24 @@ def feed_entry(name):
                 if 'maxresdefault' in thumbnail_url:
                     thumbnail_url = thumbnail_url.split('?')[0]
                 thumbnails.append({
-                    'height': int_or_none(thumbnail.get('height')),
                     'url': thumbnail_url,
+                    'height': int_or_none(thumbnail.get('height')),
                     'width': int_or_none(thumbnail.get('width')),
+                    'preference': 1 if 'maxresdefault' in thumbnail_url else -1
                 })
-            if thumbnails:
-                break
-        else:
-            thumbnail = search_meta(['og:image', 'twitter:image'])
-            if thumbnail:
-                thumbnails = [{'url': thumbnail}]
+        thumbnail_url = search_meta(['og:image', 'twitter:image'])
+        if thumbnail_url:
+            thumbnails.append({
+                'url': thumbnail_url,
+                'preference': 1 if 'maxresdefault' in thumbnail_url else -1
+            })
+        # All videos have a maxresdefault thumbnail, but sometimes it does not appear in the webpage
+        # See: https://github.com/ytdl-org/youtube-dl/issues/29049
+        thumbnails.append({
+            'url': 'https://i.ytimg.com/vi/%s/maxresdefault.jpg' % video_id,
+            'preference': 1,
+        })
+        self._remove_duplicate_formats(thumbnails)
 
         category = microformat.get('category') or search_meta('genre')
         channel_id = video_details.get('channelId') \
@@ -2832,6 +2868,10 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
     }, {
         'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
         'only_matching': True,
+    }, {
+        'note': 'A channel that is not live. Should raise error',
+        'url': 'https://www.youtube.com/user/numberphile/live',
+        'only_matching': True,
     }, {
         'url': 'https://www.youtube.com/feed/trending',
         'only_matching': True,
@@ -3566,7 +3606,13 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
 
             else:
                 # Youtube may send alerts if there was an issue with the continuation page
-                self._extract_and_report_alerts(response, expected=False)
+                try:
+                    self._extract_and_report_alerts(response, expected=False)
+                except ExtractorError as e:
+                    if fatal:
+                        raise
+                    self.report_warning(error_to_compat_str(e))
+                    return
                 if not check_get_keys or dict_get(response, check_get_keys):
                     break
                 # Youtube sometimes sends incomplete data
@@ -3628,7 +3674,7 @@ def __real_extract(self, url, smuggled_data):
 
         def get_mobj(url):
             mobj = self._url_re.match(url).groupdict()
-            mobj.update((k, '') for k,v in mobj.items() if v is None)
+            mobj.update((k, '') for k, v in mobj.items() if v is None)
             return mobj
 
         mobj = get_mobj(url)
@@ -3688,23 +3734,26 @@ def get_mobj(url):
         if tabs:
             selected_tab = self._extract_selected_tab(tabs)
             tab_name = selected_tab.get('title', '')
-            if (mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]
-                    and 'no-youtube-channel-redirect' not in compat_opts):
-                if not mobj['not_channel'] and item_id[:2] == 'UC':
-                    # Topic channels don't have /videos. Use the equivalent playlist instead
-                    self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
-                    pl_id = 'UU%s' % item_id[2:]
-                    pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
-                    try:
-                        pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
-                        for alert_type, alert_message in self._extract_alerts(pl_data):
-                            if alert_type == 'error':
-                                raise ExtractorError('Youtube said: %s' % alert_message)
-                        item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
-                    except ExtractorError:
-                        self.report_warning('The playlist gave error. Falling back to channel URL')
-                else:
-                    self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
+            if 'no-youtube-channel-redirect' not in compat_opts:
+                if mobj['tab'] == '/live':
+                    # Live tab should have redirected to the video
+                    raise ExtractorError('The channel is not currently live', expected=True)
+                if mobj['tab'] == '/videos' and tab_name.lower() != mobj['tab'][1:]:
+                    if not mobj['not_channel'] and item_id[:2] == 'UC':
+                        # Topic channels don't have /videos. Use the equivalent playlist instead
+                        self.report_warning('The URL does not have a %s tab. Trying to redirect to playlist UU%s instead' % (mobj['tab'][1:], item_id[2:]))
+                        pl_id = 'UU%s' % item_id[2:]
+                        pl_url = 'https://www.youtube.com/playlist?list=%s%s' % (pl_id, mobj['post'])
+                        try:
+                            pl_webpage, pl_data = self._extract_webpage(pl_url, pl_id)
+                            for alert_type, alert_message in self._extract_alerts(pl_data):
+                                if alert_type == 'error':
+                                    raise ExtractorError('Youtube said: %s' % alert_message)
+                            item_id, url, webpage, data = pl_id, pl_url, pl_webpage, pl_data
+                        except ExtractorError:
+                            self.report_warning('The playlist gave error. Falling back to channel URL')
+                    else:
+                        self.report_warning('The URL does not have a %s tab. %s is being downloaded instead' % (mobj['tab'][1:], tab_name))
 
         self.write_debug('Final URL: %s' % url)
 
@@ -3727,7 +3776,8 @@ def get_mobj(url):
             data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
             compat_str) or video_id
         if video_id:
-            self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
+            if mobj['tab'] != '/live':  # live tab is expected to redirect to video
+                self.report_warning('Unable to recognize playlist. Downloading just video %s' % video_id)
             return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
 
         raise ExtractorError('Unable to recognize tab page')
@@ -3808,7 +3858,7 @@ def suitable(cls, url):
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
-        is_music_url = self.is_music_url(url)
+        is_music_url = YoutubeBaseInfoExtractor.is_music_url(url)
         url = update_url_query(
             'https://www.youtube.com/playlist',
             parse_qs(url) or {'list': playlist_id})
@@ -4008,9 +4058,6 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
     def IE_NAME(self):
         return 'youtube:%s' % self._FEED_NAME
 
-    def _real_initialize(self):
-        self._login()
-
     def _real_extract(self, url):
         return self.url_result(
             'https://www.youtube.com/feed/%s' % self._FEED_NAME,
@@ -4035,6 +4082,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
     IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
     _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
     _FEED_NAME = 'recommended'
+    _LOGIN_REQUIRED = False
     _TESTS = [{
         'url': ':ytrec',
         'only_matching': True,