]> jfr.im git - yt-dlp.git/commitdiff
[extractor/youtube] Update playlist metadata extraction for new layout (#5376)
authorMatthew <redacted>
Sun, 6 Nov 2022 05:25:31 +0000 (18:25 +1300)
committerGitHub <redacted>
Sun, 6 Nov 2022 05:25:31 +0000 (05:25 +0000)
Fixes https://github.com/yt-dlp/yt-dlp/issues/5373

Authored by: coletdjnz

yt_dlp/extractor/youtube.py

index 555c94f97359660b363969a687228045e1e8f6d6..c387481cdaaaaf2edce2e5a12b5fce4a03e94b1a 100644 (file)
@@ -904,20 +904,24 @@ def _extract_video(self, renderer):
         video_id = renderer.get('videoId')
         title = self._get_text(renderer, 'title')
         description = self._get_text(renderer, 'descriptionSnippet')
-        duration = parse_duration(self._get_text(
-            renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
+
+        duration = int_or_none(renderer.get('lengthSeconds'))
+        if duration is None:
+            duration = parse_duration(self._get_text(
+                renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
         if duration is None:
             duration = parse_duration(self._search_regex(
                 r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
                 traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
                 video_id, default=None, group='duration'))
 
-        view_count = self._get_count(renderer, 'viewCountText', 'shortViewCountText')
+        # videoInfo is a string like '50K views • 10 years ago'.
+        view_count = self._get_count(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo')
         uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
         channel_id = traverse_obj(
             renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
             expected_type=str, get_all=False)
-        time_text = self._get_text(renderer, 'publishedTimeText') or ''
+        time_text = self._get_text(renderer, 'publishedTimeText', 'videoInfo') or ''
         scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
         overlay_style = traverse_obj(
             renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
@@ -4583,50 +4587,36 @@ def _extract_selected_tab(tabs, fatal=True):
             if fatal:
                 raise ExtractorError('Unable to find selected tab')
 
-    def _extract_uploader(self, data):
-        uploader = {}
-        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
-        owner = try_get(
-            renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
-        if owner:
-            owner_text = owner.get('text')
-            uploader['uploader'] = self._search_regex(
-                r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text)
-            uploader['uploader_id'] = try_get(
-                owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], str)
-            uploader['uploader_url'] = urljoin(
-                'https://www.youtube.com/',
-                try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], str))
-        return filter_dict(uploader)
-
     def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
         playlist_id = title = description = channel_url = channel_name = channel_id = None
         tags = []
 
         selected_tab = self._extract_selected_tab(tabs)
+        # Deprecated - remove when layout discontinued
         primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
-        renderer = try_get(
+        playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer'), expected_type=dict)
+        metadata_renderer = try_get(
             data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
-        if renderer:
-            channel_name = renderer.get('title')
-            channel_url = renderer.get('channelUrl')
-            channel_id = renderer.get('externalId')
+        if metadata_renderer:
+            channel_name = metadata_renderer.get('title')
+            channel_url = metadata_renderer.get('channelUrl')
+            channel_id = metadata_renderer.get('externalId')
         else:
-            renderer = try_get(
+            metadata_renderer = try_get(
                 data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
 
-        if renderer:
-            title = renderer.get('title')
-            description = renderer.get('description', '')
+        if metadata_renderer:
+            title = metadata_renderer.get('title')
+            description = metadata_renderer.get('description', '')
             playlist_id = channel_id
-            tags = renderer.get('keywords', '').split()
+            tags = metadata_renderer.get('keywords', '').split()
 
         # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
         # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
         def _get_uncropped(url):
             return url_or_none((url or '').split('=')[0] + '=s0')
 
-        avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar')
+        avatar_thumbnails = self._extract_thumbnails(metadata_renderer, 'avatar')
         if avatar_thumbnails:
             uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url'])
             if uncropped_avatar:
@@ -4650,14 +4640,33 @@ def _get_uncropped(url):
                     'preference': -5
                 })
 
+        # Deprecated - remove when old layout is discontinued
         primary_thumbnails = self._extract_thumbnails(
             primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail'))
 
+        playlist_thumbnails = self._extract_thumbnails(
+            playlist_header_renderer, ('playlistHeaderBanner', 'heroPlaylistThumbnailRenderer', 'thumbnail'))
+
         if playlist_id is None:
             playlist_id = item_id
 
-        playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
-        last_updated_unix = self._parse_time_text(self._get_text(playlist_stats, 2))
+        # Deprecated - remove primary_sidebar_renderer when old layout discontinued
+        # Playlist stats is a text runs array containing [video count, view count, last updated].
+        # last updated or (view count and last updated) may be missing.
+        playlist_stats = get_first(
+            (primary_sidebar_renderer, playlist_header_renderer), (('stats', 'briefStats', 'numVideosText'),))
+        last_updated_unix = self._parse_time_text(
+            self._get_text(playlist_stats, 2)  # deprecated, remove when old layout discontinued
+            or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text')))
+
+        view_count = self._get_count(playlist_stats, 1)
+        if view_count is None:
+            view_count = self._get_count(playlist_header_renderer, 'viewCountText')
+
+        playlist_count = self._get_count(playlist_stats, 0)
+        if playlist_count is None:
+            playlist_count = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text'))
+
         if title is None:
             title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
         title += format_field(selected_tab, 'title', ' - %s')
@@ -4670,16 +4679,29 @@ def _get_uncropped(url):
             'uploader': channel_name,
             'uploader_id': channel_id,
             'uploader_url': channel_url,
-            'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners,
+            'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners,
             'tags': tags,
-            'view_count': self._get_count(playlist_stats, 1),
+            'view_count': view_count,
             'availability': self._extract_availability(data),
             'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
-            'playlist_count': self._get_count(playlist_stats, 0),
+            'playlist_count': playlist_count,
             'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
         }
         if not channel_id:
-            metadata.update(self._extract_uploader(data))
+            owner = traverse_obj(playlist_header_renderer, 'ownerText')
+            if not owner:
+                # Deprecated
+                owner = traverse_obj(
+                    self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer'),
+                    ('videoOwner', 'videoOwnerRenderer', 'title'))
+            owner_text = self._get_text(owner)
+            browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {}
+            metadata.update(filter_dict({
+                'uploader': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text),
+                'uploader_id': browse_ep.get('browseId'),
+                'uploader_url': urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl'))
+            }))
+
         metadata.update({
             'channel': metadata['uploader'],
             'channel_id': metadata['uploader_id'],
@@ -4751,19 +4773,21 @@ def _extract_availability(self, data):
         Note: Unless YouTube tells us explicitly, we do not assume it is public
         @param data: response
         """
-        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
-
-        player_header_privacy = traverse_obj(
-            data, ('header', 'playlistHeaderRenderer', 'privacy'), expected_type=str)
+        sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
+        playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {}
+        player_header_privacy = playlist_header_renderer.get('privacy')
 
-        badges = self._extract_badges(renderer)
+        badges = self._extract_badges(sidebar_renderer)
 
         # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
-        privacy_setting_icon = traverse_obj(
-            renderer, (
-                'privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries',
-                lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'),
-            get_all=False, expected_type=str)
+        privacy_setting_icon = get_first(
+            (playlist_header_renderer, sidebar_renderer),
+            ('privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries',
+             lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'),
+            expected_type=str)
+
+        microformats_is_unlisted = traverse_obj(
+            data, ('microformat', 'microformatDataRenderer', 'unlisted'), expected_type=bool)
 
         return (
             'public' if (
@@ -4778,7 +4802,8 @@ def _extract_availability(self, data):
                 is_unlisted=(
                     self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED)
                     or player_header_privacy == 'UNLISTED' if player_header_privacy is not None
-                    else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None else None),
+                    else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None
+                    else microformats_is_unlisted if microformats_is_unlisted is not None else None),
                 needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
                 needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
                 needs_auth=False))
@@ -4794,39 +4819,23 @@ def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
 
     def _reload_with_unavailable_videos(self, item_id, data, ytcfg):
         """
-        Get playlist with unavailable videos if the 'show unavailable videos' button exists.
+        Reload playlists with unavailable videos (e.g. private videos, region blocked, etc.)
         """
-        browse_id = params = None
-        renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
-        if not renderer:
+        is_playlist = bool(traverse_obj(
+            data, ('metadata', 'playlistMetadataRenderer'), ('header', 'playlistHeaderRenderer')))
+        if not is_playlist:
             return
-        menu_renderer = try_get(
-            renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
-        for menu_item in menu_renderer:
-            if not isinstance(menu_item, dict):
-                continue
-            nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
-            text = try_get(
-                nav_item_renderer, lambda x: x['text']['simpleText'], str)
-            if not text or text.lower() != 'show unavailable videos':
-                continue
-            browse_endpoint = try_get(
-                nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
-            browse_id = browse_endpoint.get('browseId')
-            params = browse_endpoint.get('params')
-            break
-
         headers = self.generate_api_headers(
             ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
             visitor_data=self._extract_visitor_data(data, ytcfg))
         query = {
-            'params': params or 'wgYCCAA=',
-            'browseId': browse_id or 'VL%s' % item_id
+            'params': 'wgYCCAA=',
+            'browseId': f'VL{item_id}'
         }
         return self._extract_response(
             item_id=item_id, headers=headers, query=query,
             check_get_keys='contents', fatal=False, ytcfg=ytcfg,
-            note='Downloading API JSON with unavailable videos')
+            note='Redownloading playlist API JSON with unavailable videos')
 
     @functools.cached_property
     def skip_webpage(self):
@@ -5324,6 +5333,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
             'channel_url': 'https://www.youtube.com/user/Computerphile',
             'channel': 'Computerphile',
             'availability': 'public',
+            'modified_date': '20190712',
         },
         'playlist_mincount': 11,
     }, {
@@ -5659,6 +5669,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
             'uploader': 'cole-dlp-test-acc',
             'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
             'channel': 'cole-dlp-test-acc',
+            'channel_follower_count': int,
         },
         'playlist_mincount': 1,
         'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},