[ie/generic] Improve direct video link ext detection (#8340)

[yt-dlp.git] / yt_dlp / extractor / youtube.py
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py

index 2a8106b45cf0c6ab991ac0f2ded1c459809e669c..ac28ed7d282ea0c91a0c6427a6d3c4a65ee2dbe7 100644 (file)
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -15,13 +15,13 @@
  import threading
  import time
  import traceback
-import urllib.error
  import urllib.parse
  
  from .common import InfoExtractor, SearchInfoExtractor
  from .openload import PhantomJSwrapper
  from ..compat import functools
  from ..jsinterp import JSInterpreter
+from ..networking.exceptions import HTTPError, network_exceptions
  from ..utils import (
      NO_DEFAULT,
      ExtractorError,
@@ -41,7 +41,6 @@
      join_nonempty,
      js_to_json,
      mimetype2ext,
-    network_exceptions,
      orderedSet,
      parse_codecs,
      parse_count,
@@ -497,16 +496,10 @@ def _initialize_consent(self):
          cookies = self._get_cookies('https://www.youtube.com/')
          if cookies.get('__Secure-3PSID'):
              return
-        consent_id = None
-        consent = cookies.get('CONSENT')
-        if consent:
-            if 'YES' in consent.value:
-                return
-            consent_id = self._search_regex(
-                r'PENDING\+(\d+)', consent.value, 'consent', default=None)
-        if not consent_id:
-            consent_id = random.randint(100, 999)
-        self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
+        socs = cookies.get('SOCS')
+        if socs and not socs.value.startswith('CAA'):  # not consented
+            return
+        self._set_cookie('.youtube.com', 'SOCS', 'CAI', secure=True)  # accept all (required for mixes)
  
      def _initialize_pref(self):
          cookies = self._get_cookies('https://www.youtube.com/')
@@ -909,7 +902,7 @@ def extract_relative_time(relative_time_text):
          e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago'
          """
  
-        # XXX: this could be moved to a general function in utils.py
+        # XXX: this could be moved to a general function in utils/_utils.py
          # The relative time text strings are roughly the same as what
          # Javascript's Intl.RelativeTimeFormat function generates.
          # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat
@@ -948,7 +941,16 @@ def _parse_time_text(self, text):
      def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
                            ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None,
                            default_client='web'):
-        for retry in self.RetryManager():
+        raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=YoutubeIE))
+        # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal.
+        icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete))
+        icd_rm = next(icd_retries)
+        main_retries = iter(self.RetryManager())
+        main_rm = next(main_retries)
+        # Manual retry loop for multiple RetryManagers
+        # The proper RetryManager MUST be advanced after an error
+        # and its result MUST be checked if the manager is non fatal
+        while True:
              try:
                  response = self._call_api(
                      ep=ep, fatal=True, headers=headers,
@@ -959,40 +961,46 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
              except ExtractorError as e:
                  if not isinstance(e.cause, network_exceptions):
                      return self._error_or_warning(e, fatal=fatal)
-                elif not isinstance(e.cause, urllib.error.HTTPError):
-                    retry.error = e
+                elif not isinstance(e.cause, HTTPError):
+                    main_rm.error = e
+                    next(main_retries)
                      continue
  
-                first_bytes = e.cause.read(512)
+                first_bytes = e.cause.response.read(512)
                  if not is_html(first_bytes):
                      yt_error = try_get(
                          self._parse_json(
-                            self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
+                            self._webpage_read_content(e.cause.response, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False),
                          lambda x: x['error']['message'], str)
                      if yt_error:
                          self._report_alerts([('ERROR', yt_error)], fatal=False)
                  # Downloading page may result in intermittent 5xx HTTP error
-                # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
+                # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                  # We also want to catch all other network exceptions since errors in later pages can be troublesome
                  # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210
-                if e.cause.code not in (403, 429):
-                    retry.error = e
+                if e.cause.status not in (403, 429):
+                    main_rm.error = e
+                    next(main_retries)
                      continue
                  return self._error_or_warning(e, fatal=fatal)
  
              try:
                  self._extract_and_report_alerts(response, only_once=True)
              except ExtractorError as e:
-                # YouTube servers may return errors we want to retry on in a 200 OK response
+                # YouTube's servers may return errors we want to retry on in a 200 OK response
                  # See: https://github.com/yt-dlp/yt-dlp/issues/839
                  if 'unknown error' in e.msg.lower():
-                    retry.error = e
+                    main_rm.error = e
+                    next(main_retries)
                      continue
                  return self._error_or_warning(e, fatal=fatal)
              # Youtube sometimes sends incomplete data
              # See: https://github.com/ytdl-org/youtube-dl/issues/28194
              if not traverse_obj(response, *variadic(check_get_keys)):
-                retry.error = ExtractorError('Incomplete data received', expected=True)
+                icd_rm.error = ExtractorError('Incomplete data received', expected=True)
+                should_retry = next(icd_retries, None)
+                if not should_retry:
+                    return None
                  continue
  
              return response
@@ -2837,7 +2845,7 @@ def _extract_sequence_from_mpd(refresh_sequence, immediate):
              # Obtain from MPD's maximum seq value
              old_mpd_url = mpd_url
              last_error = ctx.pop('last_error', None)
-            expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403
+            expire_fast = immediate or last_error and isinstance(last_error, HTTPError) and last_error.status == 403
              mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000)
                                                 or (mpd_url, stream_number, False))
              if not refresh_sequence:
@@ -3287,16 +3295,15 @@ def _extract_chapters_from_engagement_panel(self, data, duration):
                                            chapter_time, chapter_title, duration)
              for contents in content_list)), [])
  
-    def _extract_heatmap_from_player_overlay(self, data):
-        content_list = traverse_obj(data, (
-            'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar',
-            'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list}))
-        return next(filter(None, (
-            traverse_obj(contents, (..., 'heatMarkerRenderer', {
-                'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}),
-                'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000},
-                'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}),
-            })) for contents in content_list)), None)
+    def _extract_heatmap(self, data):  # Pull heatmap markers out of `data`; yields None when the traversal result is falsy
+        return traverse_obj(data, (
+            'frameworkUpdates', 'entityBatchUpdate', 'mutations',
+            lambda _, v: v['payload']['macroMarkersListEntity']['markersList']['markerType'] == 'MARKER_TYPE_HEATMAP',  # keep only mutations whose marker list is of the heatmap type
+            'payload', 'macroMarkersListEntity', 'markersList', 'markers', ..., {
+                'start_time': ('startMillis', {functools.partial(float_or_none, scale=1000)}),  # scale=1000: presumably milliseconds -> seconds (confirm against float_or_none)
+                'end_time': {lambda x: (int(x['startMillis']) + int(x['durationMillis'])) / 1000},  # start + duration, both coerced with int() before summing, then divided by 1000
+                'value': ('intensityScoreNormalized', {float_or_none}),
+            })) or None  # `or None` collapses any falsy result to None
  
      def _extract_comment(self, comment_renderer, parent=None):
          comment_id = comment_renderer.get('commentId')
@@ -3584,8 +3591,6 @@ def _is_agegated(player_response):
      def _is_unplayable(player_response):
          return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE'
  
-    _PLAYER_PARAMS = 'CgIQBg=='
-
      def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data):
  
          session_index = self._extract_session_index(player_ytcfg, master_ytcfg)
@@ -3598,7 +3603,11 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg,
              'videoId': video_id,
          }
          if _split_innertube_client(client)[0] == 'android':
-            yt_query['params'] = self._PLAYER_PARAMS
+            yt_query['params'] = 'CgIQBg=='
+
+        pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0]
+        if pp_arg:
+            yt_query['params'] = pp_arg
  
          yt_query.update(self._generate_player_context(sts))
          return self._extract_response(
@@ -3927,9 +3936,12 @@ def process_manifest_format(f, proto, client_name, itag):
              elif itag:
                  f['format_id'] = itag
  
+            if f.get('source_preference') is None:
+                f['source_preference'] = -1
+
              if itag in ('616', '235'):
                  f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ')
-                f['source_preference'] = (f.get('source_preference') or -1) + 100
+                f['source_preference'] += 100
  
              f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1))
              if f['quality'] == -1 and f.get('height'):
@@ -3938,6 +3950,10 @@ def process_manifest_format(f, proto, client_name, itag):
                  f['format_note'] = join_nonempty(f.get('format_note'), client_name, delim=', ')
              if f.get('fps') and f['fps'] <= 1:
                  del f['fps']
+
+            if proto == 'hls' and f.get('has_drm'):
+                f['has_drm'] = 'maybe'
+                f['source_preference'] -= 5
              return True
  
          subtitles = {}
@@ -4010,6 +4026,9 @@ def _download_player_responses(self, url, smuggled_data, video_id, webpage_url):
          webpage = None
          if 'webpage' not in self._configuration_arg('player_skip'):
              query = {'bpctr': '9999999999', 'has_verified': '1'}
+            pp = self._configuration_arg('player_params', [None], casesense=True)[0]
+            if pp:
+                query['pp'] = pp
              webpage = self._download_webpage(
                  webpage_url, video_id, fatal=False, query=query)
  
@@ -4037,6 +4056,10 @@ def _list_formats(self, video_id, microformats, video_details, player_responses,
                         else None)
          streaming_data = traverse_obj(player_responses, (..., 'streamingData'))
          *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration)
+        if all(f.get('has_drm') for f in formats):
+            # If there are no formats that definitely don't have DRM, all have DRM
+            for f in formats:
+                f['has_drm'] = True
  
          return live_broadcast_details, live_status, streaming_data, formats, subtitles
  
@@ -4414,7 +4437,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
                  or self._extract_chapters_from_description(video_description, duration)
                  or None)
  
-            info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data)
+            info['heatmap'] = self._extract_heatmap(initial_data)
  
          contents = traverse_obj(
              initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),
@@ -4910,10 +4933,15 @@ def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data):
              or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
          yield from extract_entries(parent_renderer)
          continuation = continuation_list[0]
-
+        seen_continuations = set()
          for page_num in itertools.count(1):
              if not continuation:
                  break
+            continuation_token = continuation.get('continuation')
+            if continuation_token is not None and continuation_token in seen_continuations:
+                self.write_debug('Detected YouTube feed looping - assuming end of feed.')
+                break
+            seen_continuations.add(continuation_token)
              headers = self.generate_api_headers(
                  ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data)
              response = self._extract_response(
@@ -5247,7 +5275,7 @@ def _extract_webpage(self, url, item_id, fatal=True):
                  data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {}
              except ExtractorError as e:
                  if isinstance(e.cause, network_exceptions):
-                    if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429):
+                    if not isinstance(e.cause, HTTPError) or e.cause.status not in (403, 429):
                          retry.error = e
                          continue
                  self._error_or_warning(e, fatal=fatal)