[webvtt, extractor/youtube] Extract auto-subs from livestream VODs

author pukkandan <redacted>

Tue, 21 Jun 2022 22:16:54 +0000 (03:46 +0530)

committer pukkandan <redacted>

Sat, 30 Jul 2022 20:50:11 +0000 (02:20 +0530)
author pukkandan <redacted>
Tue, 21 Jun 2022 22:16:54 +0000 (03:46 +0530)
committer pukkandan <redacted>
Sat, 30 Jul 2022 20:50:11 +0000 (02:20 +0530)
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 2a9d113a5c1b6cbfb39781b01d7fe8ea1e8aff55..33c0e0b582516605fb9793d8289220b8d24523e8 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -2298,7 +2298,7 @@ def refetch_manifest(format_id, delay):
              microformats = traverse_obj(
                  prs, (..., 'microformat', 'playerMicroformatRenderer'),
                  expected_type=dict, default=[])
-            _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url)
+            _, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url)
              start_time = time.time()
  
          def mpd_feed(format_id, delay):
@@ -3136,7 +3136,7 @@ def append_client(*client_names):
              self.report_warning(last_error)
          return prs, player_url
  
-    def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
+    def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration):
          itags, stream_ids = {}, []
          itag_qualities, res_qualities = {}, {}
          q = qualities([
@@ -3293,17 +3293,22 @@ def process_manifest_format(f, proto, itag):
                  if val in qdict), -1)
              return True
  
+        subtitles = {}
          for sd in streaming_data:
              hls_manifest_url = get_hls and sd.get('hlsManifestUrl')
              if hls_manifest_url:
-                for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False):
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live)
+                subtitles = self._merge_subtitles(subs, subtitles)
+                for f in fmts:
                      if process_manifest_format(f, 'hls', self._search_regex(
                              r'/itag/(\d+)', f['url'], 'itag', default=None)):
                          yield f
  
              dash_manifest_url = get_dash and sd.get('dashManifestUrl')
              if dash_manifest_url:
-                for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False):
+                formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False)
+                subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH
+                for f in formats:
                      if process_manifest_format(f, 'dash', f['format_id']):
                          f['filesize'] = int_or_none(self._search_regex(
                              r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None))
@@ -3311,6 +3316,7 @@ def process_manifest_format(f, proto, itag):
                              f['is_from_start'] = True
  
                          yield f
+        yield subtitles
  
      def _extract_storyboard(self, player_responses, duration):
          spec = get_first(
@@ -3371,9 +3377,9 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
              is_live = get_first(live_broadcast_details, 'isLiveNow')
  
          streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
-        formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
+        *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live, duration)
  
-        return live_broadcast_details, is_live, streaming_data, formats
+        return live_broadcast_details, is_live, streaming_data, formats, subtitles
  
      def _real_extract(self, url):
          url, smuggled_data = unsmuggle_url(url, {})
@@ -3464,8 +3470,8 @@ def feed_entry(name):
                      'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. '
                      'This is a known issue and patches are welcome')
  
-        live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
-            video_id, microformats, video_details, player_responses, player_url, duration)
+        live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \
+            self._list_formats(video_id, microformats, video_details, player_responses, player_url)
  
          if not formats:
              if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -3595,6 +3601,7 @@ def feed_entry(name):
              'release_timestamp': live_start_time,
          }
  
+        subtitles = {}
          pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict)
          if pctr:
              def get_lang_code(track):
@@ -3624,7 +3631,6 @@ def process_language(container, base_url, lang_code, sub_name, query):
              # NB: Constructing the full subtitle dictionary is slow
              get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and (
                  self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles'))
-            subtitles, automatic_captions = {}, {}
              for lang_code, caption_track in captions.items():
                  base_url = caption_track.get('baseUrl')
                  orig_lang = parse_qs(base_url).get('lang', [None])[-1]
@@ -3655,8 +3661,9 @@ def process_language(container, base_url, lang_code, sub_name, query):
                      # Setting tlang=lang returns damaged subtitles.
                      process_language(automatic_captions, base_url, trans_code, trans_name,
                                       {} if orig_lang == orig_trans_code else {'tlang': trans_code})
-            info['automatic_captions'] = automatic_captions
-            info['subtitles'] = subtitles
+
+        info['automatic_captions'] = automatic_captions
+        info['subtitles'] = subtitles
  
          parsed_url = urllib.parse.urlparse(url)
          for component in [parsed_url.fragment, parsed_url.query]:
diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py

index cc2353436a09a0bc2859886a2e179bf6e542e3b6..23d67a8971d78ad3cec493c516fcc078c60fe313 100644 (file)
--- a/yt_dlp/webvtt.py
+++ b/yt_dlp/webvtt.py
@@ -161,6 +161,12 @@ class Magic(HeaderBlock):
      _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
      _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')
  
+    # Metadata headers were removed from the spec in the 2017 revision;
+    # the last spec draft to describe this syntax element is
+    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
+    # Nevertheless, YouTube keeps serving files that contain them
+    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
+
      @classmethod
      def __parse_tsmap(cls, parser):
          parser = parser.child()
@@ -200,13 +206,18 @@ def parse(cls, parser):
              raise ParseError(parser)
  
          extra = m.group(1)
-        local, mpegts = None, None
-        if parser.consume(cls._REGEX_TSMAP):
-            local, mpegts = cls.__parse_tsmap(parser)
-        if not parser.consume(_REGEX_NL):
+        local, mpegts, meta = None, None, ''
+        while not parser.consume(_REGEX_NL):
+            if parser.consume(cls._REGEX_TSMAP):
+                local, mpegts = cls.__parse_tsmap(parser)
+                continue
+            m = parser.consume(cls._REGEX_META)
+            if m:
+                meta += m.group(0)
+                continue
              raise ParseError(parser)
          parser.commit()
-        return cls(extra=extra, mpegts=mpegts, local=local)
+        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)
  
      def write_into(self, stream):
          stream.write('WEBVTT')
@@ -219,6 +230,8 @@ def write_into(self, stream):
              stream.write(',MPEGTS:')
              stream.write(str(self.mpegts if self.mpegts is not None else 0))
              stream.write('\n')
+        if self.meta:
+            stream.write(self.meta)
          stream.write('\n')
author	pukkandan <redacted>
	Tue, 21 Jun 2022 22:16:54 +0000 (03:46 +0530)
committer	pukkandan <redacted>
	Sat, 30 Jul 2022 20:50:11 +0000 (02:20 +0530)
yt_dlp/extractor/youtube.py		patch \| blob \| blame \| history
yt_dlp/webvtt.py		patch \| blob \| blame \| history