Update to ytdl-commit-4fb25ff
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 7a158b011eac8390019c5ef51712fe1b08093807..1130d10fa52174ef748717d9b46fc9ec475c35ec 100644
@@ -2,6 +2,7 @@
 
 from __future__ import unicode_literals
 
+import calendar
 import hashlib
 import itertools
 import json
@@ -15,7 +16,6 @@
 from ..compat import (
     compat_chr,
     compat_HTTPError,
-    compat_kwargs,
     compat_parse_qs,
     compat_str,
     compat_urllib_parse_unquote_plus,
 )
 from ..jsinterp import JSInterpreter
 from ..utils import (
+    bool_or_none,
     clean_html,
     dict_get,
+    datetime_from_str,
     ExtractorError,
     format_field,
     float_or_none,
@@ -46,7 +48,7 @@
     update_url_query,
     url_or_none,
     urlencode_postdata,
-    urljoin,
+    urljoin
 )
 
 
@@ -260,13 +262,23 @@ def warn(message):
 
         return True
 
-    def _download_webpage_handle(self, *args, **kwargs):
-        query = kwargs.get('query', {}).copy()
-        kwargs['query'] = query
-        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
-            *args, **compat_kwargs(kwargs))
+    def _initialize_consent(self):
+        cookies = self._get_cookies('https://www.youtube.com/')
+        if cookies.get('__Secure-3PSID'):
+            return
+        consent_id = None
+        consent = cookies.get('CONSENT')
+        if consent:
+            if 'YES' in consent.value:
+                return
+            consent_id = self._search_regex(
+                r'PENDING\+(\d+)', consent.value, 'consent', default=None)
+        if not consent_id:
+            consent_id = random.randint(100, 999)
+        self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
 
     def _real_initialize(self):
+        self._initialize_consent()
         if self._downloader is None:
             return
         if not self._login():
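
The consent bootstrap above can be exercised in isolation. A minimal sketch of the cookie-value logic, using only what the hunk shows (the PENDING+<id> regex and the YES+cb.20210328-17-p0.en+FX+<id> format; build_consent_value is a hypothetical helper name):

    import random
    import re

    def build_consent_value(existing=None):
        # A cookie that already records acceptance is kept as-is.
        if existing and 'YES' in existing:
            return existing
        # Reuse a pending consent ID if one is present; otherwise pick a random 3-digit one.
        match = re.search(r'PENDING\+(\d+)', existing or '')
        consent_id = match.group(1) if match else random.randint(100, 999)
        return 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id

    build_consent_value('PENDING+234')  # -> 'YES+cb.20210328-17-p0.en+FX+234'
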
@@ -369,7 +381,7 @@ def _extract_video(self, renderer):
             (lambda x: x['ownerText']['runs'][0]['text'],
              lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
         return {
-            '_type': 'url_transparent',
+            '_type': 'url',
             'ie_key': YoutubeIE.ie_key(),
             'id': video_id,
             'url': video_id,
@@ -1236,6 +1248,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
             'only_matching': True,
         },
+        {
+            # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
+            'url': 'cBvYw8_A0vQ',
+            'info_dict': {
+                'id': 'cBvYw8_A0vQ',
+                'ext': 'mp4',
+                'title': '4K Ueno Okachimachi  Street  Scenes  上野御徒町歩き',
+                'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
+                'upload_date': '20201120',
+                'uploader': 'Walk around Japan',
+                'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -1489,6 +1518,16 @@ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
             (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
              regex), webpage, name, default='{}'), video_id, fatal=False)
 
+    @staticmethod
+    def parse_time_text(time_text):
+        """
+        Parse the comment time text
+        time_text is in the format 'X units ago (edited)'
+        """
+        time_text_split = time_text.split(' ')
+        if len(time_text_split) >= 3:
+            return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
+
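
For input like '2 weeks ago (edited)', parse_time_text builds the relative-date string 'now-2weeks' and delegates to yt-dlp's datetime_from_str. A quick illustration, assuming that helper's relative-date syntax:

    from yt_dlp.utils import datetime_from_str

    # '2 weeks ago' -> 'now-2weeks': the current time minus two weeks,
    # rounded to the unit's precision ('auto').
    dt = datetime_from_str('now-2weeks', precision='auto')
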
     @staticmethod
     def _join_text_entries(runs):
         text = None
@@ -1511,7 +1550,7 @@ def _extract_comment(self, comment_renderer, parent=None):
         text = self._join_text_entries(comment_text_runs) or ''
         comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
         time_text = self._join_text_entries(comment_time_text)
-
+        timestamp = calendar.timegm(self.parse_time_text(time_text).timetuple())
         author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
         author_id = try_get(comment_renderer,
                             lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
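
calendar.timegm interprets the parsed datetime as UTC and converts it to epoch seconds, which is the format the 'timestamp' field expects:

    import calendar
    from datetime import datetime

    calendar.timegm(datetime(1970, 1, 2).timetuple())  # -> 86400, one day after the epoch
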
@@ -1522,11 +1561,10 @@ def _extract_comment(self, comment_renderer, parent=None):
 
         author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
         is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
-
         return {
             'id': comment_id,
             'text': text,
-            # TODO: This should be parsed to timestamp
+            'timestamp': timestamp,
             'time_text': time_text,
             'like_count': votes,
             'is_favorited': is_liked,
@@ -1614,12 +1652,12 @@ def extract_thread(parent_renderer):
                     comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
                     if page_num == 0:
                         if first_continuation:
-                            note_prefix = "Downloading initial comment continuation page"
+                            note_prefix = 'Downloading initial comment continuation page'
                         else:
-                            note_prefix = "    Downloading comment reply thread %d %s" % (comment_counts[2], comment_prog_str)
+                            note_prefix = '    Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str)
                     else:
-                        note_prefix = "%sDownloading comment%s page %d %s" % (
-                            "       " if parent else "",
+                        note_prefix = '%sDownloading comment%s page %d %s' % (
+                            '       ' if parent else '',
                             ' replies' if parent else '',
                             page_num,
                             comment_prog_str)
@@ -1634,13 +1672,13 @@ def extract_thread(parent_renderer):
                 except ExtractorError as e:
                     if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
                         if e.cause.code == 413:
-                            self.report_warning("Assumed end of comments (received HTTP Error 413)")
+                            self.report_warning('Assumed end of comments (received HTTP Error 413)')
                             return
                         # Downloading page may result in intermittent 5xx HTTP error
                         # Sometimes a 404 is also received. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                         last_error = 'HTTP Error %s' % e.cause.code
                         if e.cause.code == 404:
-                            last_error = last_error + " (this API is probably deprecated)"
+                            last_error = last_error + ' (this API is probably deprecated)'
                         if count < retries:
                             continue
                     raise
@@ -1658,7 +1696,7 @@ def extract_thread(parent_renderer):
 
                     # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
                     if browse.get('reload'):
-                        raise ExtractorError("Invalid or missing params in continuation request", expected=False)
+                        raise ExtractorError('Invalid or missing params in continuation request', expected=False)
 
                     # TODO: not tested, merged from old extractor
                     err_msg = browse.get('externalErrorMessage')
@@ -1698,7 +1736,7 @@ def extract_thread(parent_renderer):
 
                     if expected_comment_count:
                         comment_counts[1] = str_to_int(expected_comment_count)
-                        self.to_screen("Downloading ~%d comments" % str_to_int(expected_comment_count))
+                        self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
                         yield comment_counts[1]
 
                     # TODO: cli arg.
@@ -1714,7 +1752,7 @@ def extract_thread(parent_renderer):
                         continuation = YoutubeTabIE._build_continuation_query(
                             continuation=sort_continuation_renderer.get('continuation'),
                             ctp=sort_continuation_renderer.get('clickTrackingParams'))
-                        self.to_screen("Sorting comments by %s" % ('popular' if comment_sort_index == 0 else 'newest'))
+                        self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest'))
                         break
 
                 for entry in known_continuation_renderers[key](continuation_renderer):
@@ -1747,7 +1785,7 @@ def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
                         continue
                     comments.append(comment)
                 break
-        self.to_screen("Downloaded %d/%d comments" % (len(comments), estimated_total))
+        self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
         return {
             'comments': comments,
             'comment_count': len(comments),
@@ -1759,8 +1797,7 @@ def _real_extract(self, url):
         base_url = self.http_scheme() + '//www.youtube.com/'
         webpage_url = base_url + 'watch?v=' + video_id
         webpage = self._download_webpage(
-            webpage_url + '&has_verified=1&bpctr=9999999999',
-            video_id, fatal=False)
+            webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
 
         player_response = None
         if webpage:
@@ -1797,7 +1834,13 @@ def _real_extract(self, url):
         def get_text(x):
             if not x:
                 return
-            return x.get('simpleText') or ''.join([r['text'] for r in x['runs']])
+            text = x.get('simpleText')
+            if text and isinstance(text, compat_str):
+                return text
+            runs = x.get('runs')
+            if not isinstance(runs, list):
+                return
+            return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
 
         search_meta = (
             lambda x: self._html_search_meta(x, webpage, default=None)) \
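
The hardened get_text above handles the two shapes YouTube uses for text objects and rejects malformed ones. Hand-written sample inputs (not captured API output):

    get_text({'simpleText': 'Video title'})                      # -> 'Video title'
    get_text({'runs': [{'text': 'Video '}, {'text': 'title'}]})  # -> 'Video title'
    get_text({'runs': 'not-a-list'})                             # -> None
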
@@ -1946,7 +1989,7 @@ def feed_entry(name):
                     f['format_id'] = itag
                 formats.append(f)
 
-        if self._downloader.params.get('youtube_include_dash_manifest'):
+        if self._downloader.params.get('youtube_include_dash_manifest', True):
             dash_manifest_url = streaming_data.get('dashManifestUrl')
             if dash_manifest_url:
                 for f in self._extract_mpd_formats(
@@ -2066,7 +2109,7 @@ def feed_entry(name):
             'tags': keywords,
             'is_live': is_live,
             'playable_in_embed': playability_status.get('playableInEmbed'),
-            'was_live': video_details.get('isLiveContent')
+            'was_live': video_details.get('isLiveContent'),
         }
 
         pctr = try_get(
@@ -2149,6 +2192,7 @@ def process_language(container, base_url, lang_code, query):
                 # This will error if there is no livechat
                 initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
                 info['subtitles']['live_chat'] = [{
+                    'url': 'https://www.youtube.com/watch?v=%s' % video_id,  # url is needed to set cookies
                     'video_id': video_id,
                     'ext': 'json',
                     'protocol': 'youtube_live_chat_replay',
@@ -2242,7 +2286,7 @@ def chapter_time(mmlir):
                     info['channel'] = get_text(try_get(
                         vsir,
                         lambda x: x['owner']['videoOwnerRenderer']['title'],
-                        compat_str))
+                        dict))
                     rows = try_get(
                         vsir,
                         lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
@@ -2283,6 +2327,35 @@ def chapter_time(mmlir):
             if v:
                 info[d_k] = v
 
+        is_private = bool_or_none(video_details.get('isPrivate'))
+        is_unlisted = bool_or_none(microformat.get('isUnlisted'))
+        is_membersonly = None
+        is_premium = None
+        if initial_data and is_private is not None:
+            is_membersonly = False
+            is_premium = False
+            contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list)
+            for content in contents or []:
+                badges = try_get(content, lambda x: x['videoPrimaryInfoRenderer']['badges'], list)
+                for badge in badges or []:
+                    label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label']) or ''
+                    if label.lower() == 'members only':
+                        is_membersonly = True
+                        break
+                    elif label.lower() == 'premium':
+                        is_premium = True
+                        break
+                if is_membersonly or is_premium:
+                    break
+
+        # TODO: Add this for playlists
+        info['availability'] = self._availability(
+            is_private=is_private,
+            needs_premium=is_premium,
+            needs_subscription=is_membersonly,
+            needs_auth=info['age_limit'] >= 18,
+            is_unlisted=None if is_private is None else is_unlisted)
+
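
self._availability lives in the shared base extractor, not in this diff; roughly, it picks the most restrictive applicable label and returns None when the inputs are inconclusive. A standalone sketch under that assumption:

    def availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
        # Most restrictive state wins; None means "not enough information".
        flags = (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)
        return (
            'private' if is_private
            else 'premium_only' if needs_premium
            else 'subscriber_only' if needs_subscription
            else 'needs_auth' if needs_auth
            else 'unlisted' if is_unlisted
            else 'public' if all(f is not None for f in flags)
            else None)
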
         # get xsrf for annotations or comments
         get_annotations = self._downloader.params.get('writeannotations', False)
         get_comments = self._downloader.params.get('getcomments', False)
@@ -2642,6 +2715,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
     }, {
         'url': 'https://www.youtube.com/TheYoungTurks/live',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/hashtag/cctv9',
+        'info_dict': {
+            'id': 'cctv9',
+            'title': '#cctv9',
+        },
+        'playlist_mincount': 350,
     }]
 
     @classmethod
@@ -2666,7 +2746,7 @@ def _extract_channel_id(self, webpage):
     def _extract_basic_item_renderer(item):
         # Modified from _extract_grid_item_renderer
         known_renderers = (
-            'playlistRenderer', 'videoRenderer', 'channelRenderer'
+            'playlistRenderer', 'videoRenderer', 'channelRenderer',
             'gridPlaylistRenderer', 'gridVideoRenderer', 'gridChannelRenderer'
         )
         for key, renderer in item.items():
@@ -2802,6 +2882,16 @@ def _post_thread_continuation_entries(self, post_thread_continuation):
             for entry in self._post_thread_entries(renderer):
                 yield entry
 
+    r''' # unused
+    def _rich_grid_entries(self, contents):
+        for content in contents:
+            video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
+            if video_renderer:
+                entry = self._video_entry(video_renderer)
+                if entry:
+                    yield entry
+    '''
+
     @staticmethod
     def _build_continuation_query(continuation, ctp=None):
         query = {
@@ -2923,7 +3013,7 @@ def extract_entries(parent_renderer):  # this needs to be called again for continua
                     self.report_warning('%s. Retrying ...' % last_error)
                 try:
                     response = self._call_api(
-                        ep="browse", fatal=True, headers=headers,
+                        ep='browse', fatal=True, headers=headers,
                         video_id='%s page %s' % (item_id, page_num),
                         query={
                             'continuation': continuation['continuation'],
@@ -2984,9 +3074,9 @@ def extract_entries(parent_renderer):  # this needs to be called again for continua
                 'richItemRenderer': (extract_entries, 'contents'),  # for hashtag
                 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents')
             }
+            on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
             continuation_items = try_get(
-                response,
-                lambda x: dict_get(x, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))[0]['appendContinuationItemsAction']['continuationItems'], list)
+                on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
             continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
             video_items_renderer = None
             for key, value in continuation_item.items():
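
dict_get returns the value of the first listed key that is present with a truthy value, so the lookup above tolerates either field name YouTube sends:

    from yt_dlp.utils import dict_get

    response = {'onResponseReceivedEndpoints': [{'appendContinuationItemsAction': {}}]}
    dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
    # -> [{'appendContinuationItemsAction': {}}]
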
@@ -3044,10 +3134,10 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs):
             channel_name = renderer.get('title')
             channel_url = renderer.get('channelUrl')
             channel_id = renderer.get('externalId')
-
-        if not renderer:
+        else:
             renderer = try_get(
                 data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
+
         if renderer:
             title = renderer.get('title')
             description = renderer.get('description', '')
@@ -3073,11 +3163,12 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs):
                 'width': int_or_none(t.get('width')),
                 'height': int_or_none(t.get('height')),
             })
-
         if playlist_id is None:
             playlist_id = item_id
         if title is None:
-            title = playlist_id
+            title = (
+                try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
+                or playlist_id)
         title += format_field(selected_tab, 'title', ' - %s')
 
         metadata = {
@@ -3154,25 +3245,26 @@ def _real_extract_alerts():
                     alert_type = alert.get('type')
                     if not alert_type:
                         continue
-                    message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
+                    message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) or ''
                     if message:
                         yield alert_type, message
                     for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
-                        message = try_get(run, lambda x: x['text'], compat_str)
-                        if message:
-                            yield alert_type, message
+                        message += try_get(run, lambda x: x['text'], compat_str)
+                    if message:
+                        yield alert_type, message
 
-        err_msg = None
+        errors = []
+        warnings = []
         for alert_type, alert_message in _real_extract_alerts():
             if alert_type.lower() == 'error':
-                if err_msg:
-                    self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
-                err_msg = alert_message
+                errors.append([alert_type, alert_message])
             else:
-                self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
+                warnings.append([alert_type, alert_message])
 
-        if err_msg:
-            raise ExtractorError('YouTube said: %s' % err_msg, expected=expected)
+        for alert_type, alert_message in (warnings + errors[:-1]):
+            self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
+        if errors:
+            raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected)
 
     def _extract_webpage(self, url, item_id):
         retries = self._downloader.params.get('extractor_retries', 3)
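
With the alert refactor above, warnings and all but the last error are surfaced via report_warning, and only the final error aborts extraction. A hypothetical alert stream:

    # alerts: [('ERROR', 'e1'), ('WARNING', 'w1'), ('ERROR', 'e2')]
    #   report_warning('YouTube said: WARNING - w1')
    #   report_warning('YouTube said: ERROR - e1')
    #   raise ExtractorError('YouTube said: e2')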