Merge branch 'master' of https://github.com/zubearc/youtube-dl into zubearc

author Tom-Oliver Heidel <redacted>
Sat, 5 Sep 2020 23:52:20 +0000 (01:52 +0200)

committer Tom-Oliver Heidel <redacted>
Sat, 5 Sep 2020 23:52:20 +0000 (01:52 +0200)

diff --combined test/test_subtitles.py

index 3ca03fb6fb217c7af862b4556f4c57a95c5e9191,d9727c579d8ecd20cd9dc5b64dda412763ba5a7b..86e20cb4be444c4ab24924eb31e3fa7a266041fd
--- 1/test/test_subtitles.py
--- 2/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@@ -10,7 -10,7 +10,7 @@@ sys.path.insert(0, os.path.dirname(os.p
   from test.helper import FakeYDL, md5
   
   
- -from youtube_dl.extractor import (
+ +from youtube_dlc.extractor import (
       YoutubeIE,
       DailymotionIE,
       TEDIE,
@@@ -64,8 -64,8 +64,8 @@@ class TestYoutubeSubtitles(BaseTestSubt
           self.DL.params['allsubtitles'] = True
           subtitles = self.getSubtitles()
           self.assertEqual(len(subtitles.keys()), 13)
-         self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
-         self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5')
+         self.assertEqual(md5(subtitles['en']), '688dd1ce0981683867e7fe6fde2a224b')
+         self.assertEqual(md5(subtitles['it']), '31324d30b8430b309f7f5979a504a769')
           for lang in ['fr', 'de']:
               self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
   
@@@ -73,13 -73,13 +73,13 @@@
           self.DL.params['writesubtitles'] = True
           self.DL.params['subtitlesformat'] = 'ttml'
           subtitles = self.getSubtitles()
-         self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54')
+         self.assertEqual(md5(subtitles['en']), 'c97ddf1217390906fa9fbd34901f3da2')
   
       def test_youtube_subtitles_vtt_format(self):
           self.DL.params['writesubtitles'] = True
           self.DL.params['subtitlesformat'] = 'vtt'
           subtitles = self.getSubtitles()
-         self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
+         self.assertEqual(md5(subtitles['en']), 'ae1bd34126571a77aabd4d276b28044d')
   
       def test_youtube_automatic_captions(self):
           self.url = '8YoUxe5ncPo'
@@@ -88,9 -88,15 +88,15 @@@
           subtitles = self.getSubtitles()
           self.assertTrue(subtitles['it'] is not None)
   
+     def test_youtube_no_automatic_captions(self):
+         self.url = 'QRS8MkLhQmM'
+         self.DL.params['writeautomaticsub'] = True
+         subtitles = self.getSubtitles()
+         self.assertTrue(not subtitles)
+ 
       def test_youtube_translated_subtitles(self):
           # This video has a subtitles track, which can be translated
-         self.url = 'Ky9eprVWzlI'
+         self.url = 'i0ZabxXmH4Y'
           self.DL.params['writeautomaticsub'] = True
           self.DL.params['subtitleslangs'] = ['it']
           subtitles = self.getSubtitles()
diff --combined youtube_dlc/extractor/youtube.py

index 70a5bd3b0f77e0e6c08e946e219cd7544beefa8f,bb382849f08e955a48ad47ff1bf930b8bca878c0..30a3e5c3c02dff7c28b166e3bafa2853ff2f3827
--- 1/youtube_dlc/extractor/youtube.py
--- 2/youtube_dl/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@@ -549,7 -549,7 +549,7 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
           '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
           '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
       }
-     _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+     _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt', 'json3')
   
       _GEO_BYPASS = False
   
@@@ -1435,7 -1435,7 +1435,7 @@@
               raise ExtractorError(
                   'Signature extraction failed: ' + tb, cause=e)
   
- -    def _get_subtitles(self, video_id, webpage):
+ +    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
           try:
               subs_doc = self._download_xml(
                   'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@@ -1462,14 -1462,6 +1462,14 @@@
                       'ext': ext,
                   })
               sub_lang_list[lang] = sub_formats
+ +        if has_live_chat_replay:
+ +            sub_lang_list['live_chat'] = [
+ +                {
+ +                    'video_id': video_id,
+ +                    'ext': 'json',
+ +                    'protocol': 'youtube_live_chat_replay',
+ +                },
+ +            ]
           if not sub_lang_list:
               self._downloader.report_warning('video doesn\'t have subtitles')
               return {}
@@@ -1493,15 -1485,6 +1493,15 @@@
               return self._parse_json(
                   uppercase_escape(config), video_id, fatal=False)
   
+ +    def _get_yt_initial_data(self, video_id, webpage):
+ +        config = self._search_regex(
+ +            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
+ +             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
+ +            webpage, 'ytInitialData', default=None)
+ +        if config:
+ +            return self._parse_json(
+ +                uppercase_escape(config), video_id, fatal=False)
+ +
       def _get_automatic_captions(self, video_id, webpage):
           """We need the webpage for getting the captions url, pass it as an
              argument to speed up the process."""
@@@ -1577,14 -1560,21 +1577,21 @@@
                       player_response, video_id, fatal=False)
                   if player_response:
                       renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                     base_url = renderer['captionTracks'][0]['baseUrl']
-                     sub_lang_list = []
-                     for lang in renderer['translationLanguages']:
-                         lang_code = lang.get('languageCode')
-                         if lang_code:
-                             sub_lang_list.append(lang_code)
-                     return make_captions(base_url, sub_lang_list)
- 
+                     caption_tracks = renderer['captionTracks']
+                     for caption_track in caption_tracks:
+                         if 'kind' not in caption_track:
+                             # not an automatic transcription
+                             continue
+                         base_url = caption_track['baseUrl']
+                         sub_lang_list = []
+                         for lang in renderer['translationLanguages']:
+                             lang_code = lang.get('languageCode')
+                             if lang_code:
+                                 sub_lang_list.append(lang_code)
+                         return make_captions(base_url, sub_lang_list)
+                     
+                     self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
+                     return {}
               # Some videos don't provide ttsurl but rather caption_tracks and
               # caption_translation_languages (e.g. 20LmZk1hakA)
               # Does not used anymore as of 22.06.2017
@@@ -1678,15 -1668,21 +1685,15 @@@
       def _extract_chapters_from_json(self, webpage, video_id, duration):
           if not webpage:
               return
- -        player = self._parse_json(
+ +        initial_data = self._parse_json(
               self._search_regex(
- -                r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
+ +                r'window\["ytInitialData"\] = (.+);\n', webpage,
                   'player args', default='{}'),
               video_id, fatal=False)
- -        if not player or not isinstance(player, dict):
- -            return
- -        watch_next_response = player.get('watch_next_response')
- -        if not isinstance(watch_next_response, compat_str):
- -            return
- -        response = self._parse_json(watch_next_response, video_id, fatal=False)
- -        if not response or not isinstance(response, dict):
+ +        if not initial_data or not isinstance(initial_data, dict):
               return
           chapters_list = try_get(
- -            response,
+ +            initial_data,
               lambda x: x['playerOverlays']
                          ['playerOverlayRenderer']
                          ['decoratedPlayerBarRenderer']
@@@ -1836,7 -1832,7 +1843,7 @@@
           # Get video info
           video_info = {}
           embed_webpage = None
- -        if re.search(r'player-age-gate-content">', video_webpage) is not None:
+ +        if self._html_search_meta('og:restrictions:age', video_webpage, default=None) == "18+":
               age_gate = True
               # We simulate the access to the video from www.youtube.com/v/{video_id}
               # this can be viewed without login into Youtube
@@@ -1995,15 -1991,6 +2002,15 @@@
           if is_live is None:
               is_live = bool_or_none(video_details.get('isLive'))
   
+ +        has_live_chat_replay = False
+ +        if not is_live:
+ +            yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
+ +            try:
+ +                yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
+ +                has_live_chat_replay = True
+ +            except (KeyError, IndexError, TypeError):
+ +                pass
+ +
           # Check for "rental" videos
           if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
               raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
@@@ -2411,8 -2398,7 +2418,8 @@@
               or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
   
           # subtitles
- -        video_subtitles = self.extract_subtitles(video_id, video_webpage)
+ +        video_subtitles = self.extract_subtitles(
+ +            video_id, video_webpage, has_live_chat_replay)
           automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
   
           video_duration = try_get(
author	Tom-Oliver Heidel <redacted>
	Sat, 5 Sep 2020 23:52:20 +0000 (01:52 +0200)
committer	Tom-Oliver Heidel <redacted>
	Sat, 5 Sep 2020 23:52:20 +0000 (01:52 +0200)
		1	2
test/test_subtitles.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dlc/extractor/youtube.py	patch \|	diff1 \|	diff2 \|	blob \| history