jfr.im git - yt-dlp.git/commitdiff
Merge branch 'master' of https://github.com/zubearc/youtube-dl into zubearc
author Tom-Oliver Heidel <redacted>
Sat, 5 Sep 2020 23:52:20 +0000 (01:52 +0200)
committer Tom-Oliver Heidel <redacted>
Sat, 5 Sep 2020 23:52:20 +0000 (01:52 +0200)
1  2 
test/test_subtitles.py
youtube_dlc/extractor/youtube.py

diff --combined test/test_subtitles.py
index 3ca03fb6fb217c7af862b4556f4c57a95c5e9191,d9727c579d8ecd20cd9dc5b64dda412763ba5a7b..86e20cb4be444c4ab24924eb31e3fa7a266041fd
@@@ -10,7 -10,7 +10,7 @@@ sys.path.insert(0, os.path.dirname(os.p
  from test.helper import FakeYDL, md5
  
  
 -from youtube_dl.extractor import (
 +from youtube_dlc.extractor import (
      YoutubeIE,
      DailymotionIE,
      TEDIE,
@@@ -64,8 -64,8 +64,8 @@@ class TestYoutubeSubtitles(BaseTestSubt
          self.DL.params['allsubtitles'] = True
          subtitles = self.getSubtitles()
          self.assertEqual(len(subtitles.keys()), 13)
-         self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
-         self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5')
+         self.assertEqual(md5(subtitles['en']), '688dd1ce0981683867e7fe6fde2a224b')
+         self.assertEqual(md5(subtitles['it']), '31324d30b8430b309f7f5979a504a769')
          for lang in ['fr', 'de']:
              self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
  
          self.DL.params['writesubtitles'] = True
          self.DL.params['subtitlesformat'] = 'ttml'
          subtitles = self.getSubtitles()
-         self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54')
+         self.assertEqual(md5(subtitles['en']), 'c97ddf1217390906fa9fbd34901f3da2')
  
      def test_youtube_subtitles_vtt_format(self):
          self.DL.params['writesubtitles'] = True
          self.DL.params['subtitlesformat'] = 'vtt'
          subtitles = self.getSubtitles()
-         self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
+         self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d')
  
      def test_youtube_automatic_captions(self):
          self.url = '8YoUxe5ncPo'
          subtitles = self.getSubtitles()
          self.assertTrue(subtitles['it'] is not None)
  
+     def test_youtube_no_automatic_captions(self):
+         self.url = 'QRS8MkLhQmM'
+         self.DL.params['writeautomaticsub'] = True
+         subtitles = self.getSubtitles()
+         self.assertTrue(not subtitles)
      def test_youtube_translated_subtitles(self):
          # This video has a subtitles track, which can be translated
-         self.url = 'Ky9eprVWzlI'
+         self.url = 'i0ZabxXmH4Y'
          self.DL.params['writeautomaticsub'] = True
          self.DL.params['subtitleslangs'] = ['it']
          subtitles = self.getSubtitles()
diff --combined youtube_dlc/extractor/youtube.py
index 70a5bd3b0f77e0e6c08e946e219cd7544beefa8f,bb382849f08e955a48ad47ff1bf930b8bca878c0..30a3e5c3c02dff7c28b166e3bafa2853ff2f3827
@@@ -549,7 -549,7 +549,7 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
          '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
          '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
      }
-     _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+     _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt', 'json3')
  
      _GEO_BYPASS = False
  
              raise ExtractorError(
                  'Signature extraction failed: ' + tb, cause=e)
  
 -    def _get_subtitles(self, video_id, webpage):
 +    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
          try:
              subs_doc = self._download_xml(
                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
                      'ext': ext,
                  })
              sub_lang_list[lang] = sub_formats
 +        if has_live_chat_replay:
 +            sub_lang_list['live_chat'] = [
 +                {
 +                    'video_id': video_id,
 +                    'ext': 'json',
 +                    'protocol': 'youtube_live_chat_replay',
 +                },
 +            ]
          if not sub_lang_list:
              self._downloader.report_warning('video doesn\'t have subtitles')
              return {}
              return self._parse_json(
                  uppercase_escape(config), video_id, fatal=False)
  
 +    def _get_yt_initial_data(self, video_id, webpage):
 +        config = self._search_regex(
 +            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
 +             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
 +            webpage, 'ytInitialData', default=None)
 +        if config:
 +            return self._parse_json(
 +                uppercase_escape(config), video_id, fatal=False)
 +
      def _get_automatic_captions(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
                      player_response, video_id, fatal=False)
                  if player_response:
                      renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                     base_url = renderer['captionTracks'][0]['baseUrl']
-                     sub_lang_list = []
-                     for lang in renderer['translationLanguages']:
-                         lang_code = lang.get('languageCode')
-                         if lang_code:
-                             sub_lang_list.append(lang_code)
-                     return make_captions(base_url, sub_lang_list)
+                     caption_tracks = renderer['captionTracks']
+                     for caption_track in caption_tracks:
+                         if 'kind' not in caption_track:
+                             # not an automatic transcription
+                             continue
+                         base_url = caption_track['baseUrl']
+                         sub_lang_list = []
+                         for lang in renderer['translationLanguages']:
+                             lang_code = lang.get('languageCode')
+                             if lang_code:
+                                 sub_lang_list.append(lang_code)
+                         return make_captions(base_url, sub_lang_list)
+                     
+                     self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
+                     return {}
              # Some videos don't provide ttsurl but rather caption_tracks and
              # caption_translation_languages (e.g. 20LmZk1hakA)
              # Does not used anymore as of 22.06.2017
      def _extract_chapters_from_json(self, webpage, video_id, duration):
          if not webpage:
              return
 -        player = self._parse_json(
 +        initial_data = self._parse_json(
              self._search_regex(
 -                r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
 +                r'window\["ytInitialData"\] = (.+);\n', webpage,
                  'player args', default='{}'),
              video_id, fatal=False)
 -        if not player or not isinstance(player, dict):
 -            return
 -        watch_next_response = player.get('watch_next_response')
 -        if not isinstance(watch_next_response, compat_str):
 -            return
 -        response = self._parse_json(watch_next_response, video_id, fatal=False)
 -        if not response or not isinstance(response, dict):
 +        if not initial_data or not isinstance(initial_data, dict):
              return
          chapters_list = try_get(
 -            response,
 +            initial_data,
              lambda x: x['playerOverlays']
                         ['playerOverlayRenderer']
                         ['decoratedPlayerBarRenderer']
          # Get video info
          video_info = {}
          embed_webpage = None
 -        if re.search(r'player-age-gate-content">', video_webpage) is not None:
 +        if self._html_search_meta('og:restrictions:age', video_webpage, default=None) == "18+":
              age_gate = True
              # We simulate the access to the video from www.youtube.com/v/{video_id}
              # this can be viewed without login into Youtube
          if is_live is None:
              is_live = bool_or_none(video_details.get('isLive'))
  
 +        has_live_chat_replay = False
 +        if not is_live:
 +            yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
 +            try:
 +                yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
 +                has_live_chat_replay = True
 +            except (KeyError, IndexError, TypeError):
 +                pass
 +
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
              raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
              or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
  
          # subtitles
 -        video_subtitles = self.extract_subtitles(video_id, video_webpage)
 +        video_subtitles = self.extract_subtitles(
 +            video_id, video_webpage, has_live_chat_replay)
          automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
  
          video_duration = try_get(