[extractor, test] Basic framework for embed tests (#4307)

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 1a9c88f35e3c39071fdc8542e9aca42bbc7b0ad3..4dc8e79ac1ba22576cb46293052ba88dad835778 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -22,6 +22,7 @@
  from ..utils import (
      NO_DEFAULT,
      ExtractorError,
+    UserNotLive,
      bug_reports_message,
      classproperty,
      clean_html,
@@ -1074,6 +1075,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'age_limit': 0,
                  'start_time': 1,
                  'end_time': 9,
+                'comment_count': int,
                  'channel_follower_count': int
              }
          },
@@ -1118,6 +1120,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg',
                  'live_status': 'not_live',
                  'age_limit': 0,
+                'comment_count': int,
                  'channel_follower_count': int
              },
              'params': {
@@ -1260,6 +1263,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'categories': ['Entertainment'],
                  'duration': 106,
                  'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ',
+                'comment_count': int,
                  'channel_follower_count': int
              },
          },
@@ -1347,7 +1351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'upload_date': '20150827',
                  'uploader_id': 'olympic',
                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
-                'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
+                'description': 'md5:04bbbf3ccceb6795947572ca36f45904',
                  'uploader': 'Olympics',
                  'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
                  'like_count': int,
@@ -1396,6 +1400,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'like_count': int,
                  'live_status': 'not_live',
                  'availability': 'unlisted',
+                'comment_count': int,
                  'channel_follower_count': int
              },
          },
@@ -1624,6 +1629,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp',
                  'live_status': 'not_live',
                  'playable_in_embed': True,
+                'comment_count': int,
                  'channel_follower_count': int
              },
              'params': {
@@ -1656,6 +1662,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'view_count': int,
                  'live_status': 'not_live',
                  'channel_url': 'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+                'comment_count': int,
                  'channel_follower_count': int
              },
              'params': {
@@ -1920,6 +1927,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'view_count': int,
                  'duration': 522,
                  'channel': 'kudvenkat',
+                'comment_count': int,
                  'channel_follower_count': int
              },
              'params': {
@@ -2141,6 +2149,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'availability': 'public',
                  'channel': 'Leon Nguyen',
                  'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp',
+                'comment_count': int,
                  'channel_follower_count': int
              }
          }, {
@@ -2204,7 +2213,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              'params': {'skip_download': True}
          }, {
              # Story. Requires specific player params to work.
-            # Note: stories get removed after some period of time
              'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI',
              'info_dict': {
                  'id': 'vv8qTUWmulI',
@@ -2227,7 +2235,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp',
                  'uploader_url': 'http://www.youtube.com/user/BlastfromthePast',
                  'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA',
-            }
+            },
+            'skip': 'stories get removed after some period of time',
          }, {
              'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA',
              'info_dict': {
@@ -2257,6 +2266,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          }
      ]
  
+    _WEBPAGE_TESTS = [
+        # YouTube <object> embed
+        {
+            'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/',
+            'md5': '873c81d308b979f0e23ee7e620b312a3',
+            'info_dict': {
+                'id': 'msN87y-iEx0',
+                'ext': 'mp4',
+                'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
+                'upload_date': '20080526',
+                'description': 'md5:873c81d308b979f0e23ee7e620b312a3',
+                'uploader': 'Christopher Sykes',
+                'uploader_id': 'ChristopherJSykes',
+                'age_limit': 0,
+                'tags': ['feynman', 'mirror', 'science', 'physics', 'imagination', 'fun', 'cool', 'puzzle'],
+                'channel_id': 'UCCeo--lls1vna5YJABWAcVA',
+                'playable_in_embed': True,
+                'thumbnail': 'https://i.ytimg.com/vi/msN87y-iEx0/hqdefault.jpg',
+                'like_count': int,
+                'comment_count': int,
+                'channel': 'Christopher Sykes',
+                'live_status': 'not_live',
+                'channel_url': 'https://www.youtube.com/channel/UCCeo--lls1vna5YJABWAcVA',
+                'availability': 'public',
+                'duration': 195,
+                'view_count': int,
+                'categories': ['Science & Technology'],
+                'channel_follower_count': int,
+                'uploader_url': 'http://www.youtube.com/user/ChristopherJSykes',
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
+    ]
+
      @classmethod
      def suitable(cls, url):
          from ..utils import parse_qs
@@ -2289,7 +2334,7 @@ def refetch_manifest(format_id, delay):
              microformats = traverse_obj(
                  prs, (..., 'microformat', 'playerMicroformatRenderer'),
                  expected_type=dict, default=[])
-            _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
+            _, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
              start_time = time.time()
  
          def mpd_feed(format_id, delay):
@@ -2467,6 +2512,7 @@ def _extract_signature_function(self, video_id, player_url, example_sig):
          func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}'
          assert os.path.basename(func_id) == func_id
  
+        self.write_debug(f'Extracting signature function {func_id}')
          cache_spec = self.cache.load('youtube-sigfuncs', func_id)
          if cache_spec is not None:
              return lambda s: ''.join(s[i] for i in cache_spec)
@@ -2714,10 +2760,10 @@ def _extract_url(webpage):
  
      @classmethod
      def extract_id(cls, url):
-        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-        return mobj.group('id')
+        video_id = cls.get_temp_id(url)
+        if not video_id:
+            raise ExtractorError(f'Invalid URL: {url}')
+        return video_id
  
      def _extract_chapters_from_json(self, data, duration):
          chapter_list = traverse_obj(
@@ -2763,17 +2809,15 @@ def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration,
          if not strict:
              chapter_list.sort(key=lambda c: c['start_time'] or 0)
  
-        chapters = [{'start_time': 0, 'title': '<Untitled>'}]
+        chapters = [{'start_time': 0}]
          for idx, chapter in enumerate(chapter_list):
-            if chapter['start_time'] is None or not chapter['title']:
+            if chapter['start_time'] is None:
                  self.report_warning(f'Incomplete chapter {idx}')
              elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
-                chapters[-1]['end_time'] = chapter['start_time']
                  chapters.append(chapter)
              else:
                  self.report_warning(f'Invalid start time for chapter "{chapter["title"]}"')
-        chapters[-1]['end_time'] = duration
-        return chapters if len(chapters) > 1 and chapters[1]['start_time'] else chapters[1:]
+        return chapters[1:]
  
      def _extract_comment(self, comment_renderer, parent=None):
          comment_id = comment_renderer.get('commentId')
@@ -3128,7 +3172,7 @@ def append_client(*client_names):
              self.report_warning(last_error)
          return prs, player_url
  
-    def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
+    def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration):
          itags, stream_ids = {}, []
          itag_qualities, res_qualities = {}, {}
          q = qualities([
@@ -3285,17 +3329,22 @@ def process_manifest_format(f, proto, itag):
                  if val in qdict), -1)
              return True
  
+        subtitles = {}
          for sd in streaming_data:
              hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
              if hls_manifest_url:
-                for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live)
+                subtitles = self._merge_subtitles(subs, subtitles)
+                for f in fmts:
                      if process_manifest_format(f, 'hls', self._search_regex(
                              r'/itag/(\d+)', f['url'], 'itag', default=None)):
                          yield f
  
              dash_manifest_url = get_dash and sd.get('dashManifestUrl')
              if dash_manifest_url:
-                for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
+                formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
+                subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
+                for f in formats:
                      if process_manifest_format(f, 'dash', f['format_id']):
                          f['filesize'] = int_or_none(self._search_regex(
                              r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
@@ -3303,6 +3352,7 @@ def process_manifest_format(f, proto, itag):
                              f['is_from_start'] = True
  
                          yield f
+        yield subtitles
  
      def _extract_storyboard(self, player_responses, duration):
          spec = get_first(
@@ -3333,6 +3383,9 @@ def _extract_storyboard(self, player_responses, duration):
                  'url': url,
                  'width': width,
                  'height': height,
+                'fps': frame_count / duration,
+                'rows': rows,
+                'columns': cols,
                  'fragments': [{
                      'url': url.replace('$M', str(j)),
                      'duration': min(fragment_duration, duration - (j * fragment_duration)),
@@ -3360,9 +3413,9 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
              is_live = get_first(live_broadcast_details, 'isLiveNow')
  
          streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
-        formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
+        *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live, duration)
  
-        return live_broadcast_details, is_live, streaming_data, formats
+        return live_broadcast_details, is_live, streaming_data, formats, subtitles
  
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url, {})
@@ -3446,15 +3499,8 @@ def feed_entry(name):
              or get_first(microformats, 'lengthSeconds')
              or parse_duration(search_meta('duration'))) or None
  
-        if get_first(video_details, 'isPostLiveDvr'):
-            self.write_debug('Video is in Post-Live Manifestless mode')
-            if duration or 0 > 4 * 3600:
-                self.report_warning(
-                    'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
-                    'This is a known issue and patches are welcome')
-
-        live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
-            video_id, microformats, video_details, player_responses, player_url, duration)
+        live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \
+            self._list_formats(video_id, microformats, video_details, player_responses, player_url, duration)
  
          if not formats:
              if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -3545,8 +3591,7 @@ def feed_entry(name):
  
          formats.extend(self._extract_storyboard(player_responses, duration))
  
-        # Source is given priority since formats that throttle are given lower source_preference
-        # When throttling issue is fully fixed, remove this
+        # source_preference is lower for throttled/potentially damaged formats
          self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto'))
  
          info = {
@@ -3584,6 +3629,15 @@ def feed_entry(name):
              'release_timestamp': live_start_time,
          }
  
+        if get_first(video_details, 'isPostLiveDvr'):
+            self.write_debug('Video is in Post-Live Manifestless mode')
+            info['live_status'] = 'post_live'
+            if (duration or 0) > 4 * 3600:
+                self.report_warning(
+                    'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
+                    'This is a known issue and patches are welcome')
+
+        subtitles = {}
          pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
          if pctr:
              def get_lang_code(track):
@@ -3610,7 +3664,9 @@ def process_language(container, base_url, lang_code, sub_name, query):
                          'name': sub_name,
                      })
  
-            subtitles, automatic_captions = {}, {}
+            # NB: Constructing the full subtitle dictionary is slow
+            get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and (
+                self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles'))
              for lang_code, caption_track in captions.items():
                  base_url = caption_track.get('baseUrl')
                  orig_lang = parse_qs(base_url).get('lang', [None])[-1]
@@ -3629,7 +3685,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
                          continue
                      orig_trans_code = trans_code
                      if caption_track.get('kind') != 'asr':
-                        if 'translated_subs' in self._configuration_arg('skip'):
+                        if not get_translated_subs:
                              continue
                          trans_code += f'-{lang_code}'
                          trans_name += format_field(lang_name, None, ' from %s')
@@ -3641,8 +3697,9 @@ def process_language(container, base_url, lang_code, sub_name, query):
                      # Setting tlang=lang returns damaged subtitles.
                      process_language(automatic_captions, base_url, trans_code, trans_name,
                                       {} if orig_lang == orig_trans_code else {'tlang': trans_code})
-            info['automatic_captions'] = automatic_captions
-            info['subtitles'] = subtitles
+
+        info['automatic_captions'] = automatic_captions
+        info['subtitles'] = subtitles
  
          parsed_url = urllib.parse.urlparse(url)
          for component in [parsed_url.fragment, parsed_url.query]:
@@ -5003,7 +5060,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
      }, {
          'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
          'info_dict': {
-            'id': 'GgL890LIznQ',  # This will keep changing
+            'id': 'Wq15eF5vCbI',  # This will keep changing
              'ext': 'mp4',
              'title': str,
              'uploader': 'Sky News',
@@ -5123,7 +5180,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
              'uploader': 'NoCopyrightSounds',
              'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!',
              'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg',
-            'title': 'NCS Releases',
+            'title': 'NCS : All Releases 💿',
              'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds',
              'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds',
              'modified_date': r're:\d{8}',
@@ -5192,7 +5249,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
              'title': 'yt-dlp unlisted playlist test',
              'availability': 'unlisted',
              'tags': [],
-            'modified_date': '20211208',
+            'modified_date': '20220418',
              'channel': 'colethedj',
              'view_count': int,
              'description': '',
@@ -5280,6 +5337,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
              'channel': 'pukkandan',
              'description': 'Test for collaborative playlist',
              'title': 'yt-dlp test - collaborative playlist',
+            'view_count': int,
              'uploader_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q',
          },
          'playlist_mincount': 2
@@ -5372,9 +5430,8 @@ def get_mobj(url):
                  selected_tab_name = 'featured'
              requested_tab_name = mobj['tab'][1:]
              if 'no-youtube-channel-redirect' not in compat_opts:
-                if requested_tab_name == 'live':
-                    # Live tab should have redirected to the video
-                    raise ExtractorError('The channel is not currently live', expected=True)
+                if requested_tab_name == 'live':  # Live tab should have redirected to the video
+                    raise UserNotLive(video_id=mobj['id'])
                  if requested_tab_name not in ('', selected_tab_name):
                      redirect_warning = f'The channel does not have a {requested_tab_name} tab'
                      if not original_tab_name:
@@ -5487,7 +5544,7 @@ class YoutubePlaylistIE(InfoExtractor):
          'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'],
      }, {
          'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
-        'playlist_mincount': 654,
+        'playlist_mincount': 455,
          'info_dict': {
              'title': '2018 Chinese New Singles (11/6 updated)',
              'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
@@ -5560,6 +5617,8 @@ class YoutubeYtBeIE(InfoExtractor):
              'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw',
              'availability': 'public',
              'duration': 59,
+            'comment_count': int,
+            'channel_follower_count': int
          },
          'params': {
              'noplaylist': True,
@@ -5777,10 +5836,11 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor):
          'info_dict': {
              'id': '#cats',
              'title': '#cats',
-            'entries': [{
-                'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
-                'title': '#cats',
-            }],
+            # The test suite does not have support for nested playlists
+            # 'entries': [{
+            #     'url': r're:https://(www\.)?youtube\.com/hashtag/cats',
+            #     'title': '#cats',
+            # }],
          },
      }, {
          'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
@@ -5997,6 +6057,25 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor):
              'section_start': 29.0,
              'section_end': 39.7,
              'duration': 10.7,
+            'age_limit': 0,
+            'availability': 'public',
+            'categories': ['Gaming'],
+            'channel': 'Scott The Woz',
+            'channel_id': 'UC4rqhyiTs7XyuODcECvuiiQ',
+            'channel_url': 'https://www.youtube.com/channel/UC4rqhyiTs7XyuODcECvuiiQ',
+            'description': 'md5:7a4517a17ea9b4bd98996399d8bb36e7',
+            'like_count': int,
+            'playable_in_embed': True,
+            'tags': 'count:17',
+            'thumbnail': 'https://i.ytimg.com/vi_webp/ScPX26pdQik/maxresdefault.webp',
+            'title': 'Mobile Games on Console - Scott The Woz',
+            'upload_date': '20210920',
+            'uploader': 'Scott The Woz',
+            'uploader_id': 'scottthewoz',
+            'uploader_url': 'http://www.youtube.com/user/scottthewoz',
+            'view_count': int,
+            'live_status': 'not_live',
+            'channel_follower_count': int
          }
      }]