jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/youtube.py
[extractor/youtube] Bypass throttling for `-f17`
[yt-dlp.git] / yt_dlp / extractor / youtube.py
index 01e2e37931f5df565c4b5e3e1e600f5289309315..6e6abd65b1c295d7518864f8a002822f486d195f 100644 (file)
@@ -430,6 +430,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         r'(?:www\.)?pd\.vern\.cc',
         r'(?:www\.)?piped\.hostux\.net',
         r'(?:www\.)?piped\.lunar\.icu',
+        # Hyperpipe instances from https://hyperpipe.codeberg.page/
+        r'(?:www\.)?hyperpipe\.surge\.sh',
+        r'(?:www\.)?hyperpipe\.esmailelbob\.xyz',
+        r'(?:www\.)?listen\.whatever\.social',
+        r'(?:www\.)?music\.adminforge\.de',
     )
 
     # extracted from account/account_menu ep
@@ -951,7 +956,7 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers
 
     @staticmethod
     def is_music_url(url):
-        return re.match(r'https?://music\.youtube\.com/', url) is not None
+        return re.match(r'(https?://)?music\.youtube\.com/', url) is not None
 
     def _extract_video(self, renderer):
         video_id = renderer.get('videoId')
@@ -3200,11 +3205,11 @@ def _extract_chapters_from_json(self, data, duration):
                 'decoratedPlayerBarRenderer', 'playerBar', 'chapteredPlayerBarRenderer', 'chapters'
             ), expected_type=list)
 
-        return self._extract_chapters(
+        return self._extract_chapters_helper(
             chapter_list,
-            chapter_time=lambda chapter: float_or_none(
+            start_function=lambda chapter: float_or_none(
                 traverse_obj(chapter, ('chapterRenderer', 'timeRangeStartMillis')), scale=1000),
-            chapter_title=lambda chapter: traverse_obj(
+            title_function=lambda chapter: traverse_obj(
                 chapter, ('chapterRenderer', 'title', 'simpleText'), expected_type=str),
             duration=duration)
 
@@ -3217,42 +3222,10 @@ def _extract_chapters_from_engagement_panel(self, data, duration):
         chapter_title = lambda chapter: self._get_text(chapter, 'title')
 
         return next(filter(None, (
-            self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
-                                   chapter_time, chapter_title, duration)
+            self._extract_chapters_helper(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')),
+                                          chapter_time, chapter_title, duration)
             for contents in content_list)), [])
 
-    def _extract_chapters_from_description(self, description, duration):
-        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
-        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
-        return self._extract_chapters(
-            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
-            chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1],
-            duration=duration, strict=False) or self._extract_chapters(
-            re.findall(sep_re % (r'.+?', duration_re), description or ''),
-            chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0],
-            duration=duration, strict=False)
-
-    def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True):
-        if not duration:
-            return
-        chapter_list = [{
-            'start_time': chapter_time(chapter),
-            'title': chapter_title(chapter),
-        } for chapter in chapter_list or []]
-        if not strict:
-            chapter_list.sort(key=lambda c: c['start_time'] or 0)
-
-        chapters = [{'start_time': 0}]
-        for idx, chapter in enumerate(chapter_list):
-            if chapter['start_time'] is None:
-                self.report_warning(f'Incomplete chapter {idx}')
-            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
-                chapters.append(chapter)
-            elif chapter not in chapters:
-                self.report_warning(
-                    f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
-        return chapters[1:]
-
     def _extract_comment(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
         if not comment_id:
@@ -3336,6 +3309,13 @@ def extract_thread(contents):
                 comment = self._extract_comment(comment_renderer, parent)
                 if not comment:
                     continue
+                # Sometimes YouTube may break and give us infinite looping comments.
+                # See: https://github.com/yt-dlp/yt-dlp/issues/6290
+                if comment['id'] in tracker['seen_comment_ids']:
+                    self.report_warning('Detected YouTube comments looping. Stopping comment extraction as we probably cannot get any more.')
+                    yield
+                else:
+                    tracker['seen_comment_ids'].add(comment['id'])
 
                 tracker['running_total'] += 1
                 tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1
@@ -3360,7 +3340,8 @@ def extract_thread(contents):
                 est_total=0,
                 current_page_thread=0,
                 total_parent_comments=0,
-                total_reply_comments=0)
+                total_reply_comments=0,
+                seen_comment_ids=set())
 
         # TODO: Deprecated
         # YouTube comments have a max depth of 2
@@ -3736,10 +3717,10 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l
                 'filesize': int_or_none(fmt.get('contentLength')),
                 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}',
                 'format_note': join_nonempty(
-                    '%s%s' % (audio_track.get('displayName') or '',
-                              ' (default)' if language_preference > 0 else ''),
+                    join_nonempty(audio_track.get('displayName'),
+                                  language_preference > 0 and ' (default)', delim=''),
                     fmt.get('qualityLabel') or quality.replace('audio_quality_', ''),
-                    'DRC' if fmt.get('isDrc') else None,
+                    fmt.get('isDrc') and 'DRC',
                     try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()),
                     try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()),
                     throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '),
@@ -3764,19 +3745,22 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l
             if mime_mobj:
                 dct['ext'] = mimetype2ext(mime_mobj.group(1))
                 dct.update(parse_codecs(mime_mobj.group(2)))
-            no_audio = dct.get('acodec') == 'none'
-            no_video = dct.get('vcodec') == 'none'
-            if no_audio:
-                dct['vbr'] = tbr
-            if no_video:
-                dct['abr'] = tbr
-            if no_audio or no_video:
-                dct['downloader_options'] = {
-                    # Youtube throttles chunks >~10M
-                    'http_chunk_size': 10485760,
-                }
-                if dct.get('ext'):
-                    dct['container'] = dct['ext'] + '_dash'
+
+            single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec'))
+            if single_stream and dct.get('ext'):
+                dct['container'] = dct['ext'] + '_dash'
+            if single_stream or itag == '17':
+                CHUNK_SIZE = 10 << 20
+                dct.update({
+                    'protocol': 'http_dash_segments',
+                    'fragments': [{
+                        'url': update_url_query(dct['url'], {
+                            'range': f'{range_start}-{min(range_start + CHUNK_SIZE - 1, dct["filesize"])}'
+                        })
+                    } for range_start in range(0, dct['filesize'], CHUNK_SIZE)]
+                } if itag != '17' and dct['filesize'] else {
+                    'downloader_options': {'http_chunk_size': CHUNK_SIZE}
+                })
 
             if itag:
                 itags[itag].add(('https', dct.get('language')))
@@ -4115,7 +4099,7 @@ def is_bad_format(fmt):
             'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
             'description': video_description,
             'uploader': get_first(video_details, 'author'),
-            'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
+            'uploader_id': self._search_regex(r'/(?:channel/|user/|(?=@))([^/?&#]+)', owner_profile_url, 'uploader id', default=None),
             'uploader_url': owner_profile_url,
             'channel_id': channel_id,
             'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'),
@@ -4454,19 +4438,6 @@ def wrapper(self, url):
             return info_dict
         return wrapper
 
-    def _extract_channel_id(self, webpage):
-        channel_id = self._html_search_meta(
-            'channelId', webpage, 'channel id', default=None)
-        if channel_id:
-            return channel_id
-        channel_url = self._html_search_meta(
-            ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
-             'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
-             'twitter:app:url:googleplay'), webpage, 'channel url')
-        return self._search_regex(
-            r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
-            channel_url, 'channel id')
-
     @staticmethod
     def _extract_basic_item_renderer(item):
         # Modified from _extract_grid_item_renderer
@@ -4905,6 +4876,10 @@ def _get_uncropped(url):
         info['view_count'] = self._get_count(playlist_stats, 1)
         if info['view_count'] is None:  # 0 is allowed
             info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText')
+        if info['view_count'] is None:
+            info['view_count'] = self._get_count(data, (
+                'contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., 'tabRenderer', 'content', 'sectionListRenderer',
+                'contents', ..., 'itemSectionRenderer', 'contents', ..., 'channelAboutFullMetadataRenderer', 'viewCountText'))
 
         info['playlist_count'] = self._get_count(playlist_stats, 0)
         if info['playlist_count'] is None:  # 0 is allowed
@@ -6124,6 +6099,23 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
             }
         }],
         'params': {'extract_flat': True},
+    }, {
+        'url': 'https://www.youtube.com/@3blue1brown/about',
+        'info_dict': {
+            'id': 'UCYO_jab_esuFRV4b17AJtAw',
+            'tags': ['Mathematics'],
+            'title': '3Blue1Brown - About',
+            'uploader_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+            'channel_follower_count': int,
+            'channel_id': 'UCYO_jab_esuFRV4b17AJtAw',
+            'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw',
+            'channel': '3Blue1Brown',
+            'uploader': '3Blue1Brown',
+            'view_count': int,
+            'channel_url': 'https://www.youtube.com/channel/UCYO_jab_esuFRV4b17AJtAw',
+            'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+        },
+        'playlist_count': 0,
     }]
 
     @classmethod
@@ -6190,6 +6182,8 @@ def _real_extract(self, url, smuggled_data):
         original_tab_id, display_id = tab[1:], f'{item_id}{tab}'
         if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts:
             url = f'{pre}/videos{post}'
+        if smuggled_data.get('is_music_url'):
+            self.report_warning(f'YouTube Music is not directly supported. Redirecting to {url}')
 
         # Handle both video/playlist URLs
         qs = parse_qs(url)