X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/1151c4079a15939c981f3e30129b82dea1b1bc6d..a2160aa45f4019e02ced01c9030aa9519b40b24f:/yt_dlp/extractor/common.py

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 4e6deee1c..5b7b8891a 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -4,6 +4,7 @@
 import base64
 import datetime
 import hashlib
+import itertools
 import json
 import netrc
 import os
@@ -18,6 +19,7 @@
     compat_cookies_SimpleCookie,
     compat_etree_Element,
     compat_etree_fromstring,
+    compat_expanduser,
     compat_getpass,
     compat_http_client,
     compat_os_name,
@@ -405,6 +407,10 @@ class InfoExtractor(object):
     _real_extract() methods and define a _VALID_URL regexp.
     Probably, they should also be added to the list of extractors.
 
+    Subclasses may also override suitable() if necessary, but ensure the function
+    signature is preserved and that this function imports everything it needs
+    (except other extractors), so that lazy_extractors works correctly
+
     _GEO_BYPASS attribute may be set to False in order to disable
     geo restriction bypass mechanisms for a particular extractor.
     Though it won't disable explicit geo restriction bypass based on
@@ -420,7 +426,7 @@ class InfoExtractor(object):
     will be used by geo restriction bypass mechanism similarly
     to _GEO_COUNTRIES.
 
-    Finally, the _WORKING attribute should be set to False for broken IEs
+    The _WORKING attribute should be set to False for broken IEs
     in order to warn the users and skip the tests.
     """
 
@@ -788,9 +794,10 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
             self._downloader.to_screen(dump)
         if self.get_param('write_pages', False):
             basen = '%s_%s' % (video_id, urlh.geturl())
-            if len(basen) > 240:
+            trim_length = self.get_param('trim_file_name') or 240
+            if len(basen) > trim_length:
                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
-                basen = basen[:240 - len(h)] + h
+                basen = basen[:trim_length - len(h)] + h
             raw_filename = basen + '.dump'
             filename = sanitize_filename(raw_filename, restricted=True)
             self.to_screen('Saving request to ' + filename)
@@ -1128,10 +1135,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f
                 if mobj:
                     break
 
-        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
-            _name = '\033[0;34m%s\033[0m' % name
-        else:
-            _name = name
+        _name = self._downloader._color_text(name, 'blue')
 
         if mobj:
             if group is None:
@@ -1166,7 +1170,10 @@ def _get_netrc_login_info(self, netrc_machine=None):
 
         if self.get_param('usenetrc', False):
             try:
-                info = netrc.netrc().authenticators(netrc_machine)
+                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+                if os.path.isdir(netrc_file):
+                    netrc_file = os.path.join(netrc_file, '.netrc')
+                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                 if info is not None:
                     username = info[0]
                     password = info[2]
@@ -1502,7 +1509,7 @@ class FormatSort:
         default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                    'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                    'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
-        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
+        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                         'height', 'width', 'proto', 'vext', 'abr', 'aext',
                         'fps', 'fs_approx', 'source', 'format_id')
 
@@ -1526,7 +1533,7 @@ class FormatSort:
             'ie_pref': {'priority': True, 'type': 'extractor'},
             'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
             'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
-            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
+            'lang': {'convert': 'ignore', 'field': 'language_preference'},
             'quality': {'convert': 'float_none', 'default': -1},
             'filesize': {'convert': 'bytes'},
             'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
@@ -1672,7 +1679,7 @@ def add_item(field, reverse, closest, limit_text):
                 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
 
                 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
-                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
+                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                 limit_count = len(limits)
                 for (i, f) in enumerate(fields):
                     add_item(f, reverse, closest,
@@ -1756,9 +1763,9 @@ def calculate_preference(self, format):
                 if format.get('vbr') is not None and format.get('abr') is not None:
                     format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
             else:
-                if format.get('vcodec') != "none" and format.get('vbr') is None:
+                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                     format['vbr'] = format.get('tbr') - format.get('abr', 0)
-                if format.get('acodec') != "none" and format.get('abr') is None:
+                if format.get('acodec') != 'none' and format.get('abr') is None:
                     format['abr'] = format.get('tbr') - format.get('vbr', 0)
 
             return tuple(self._calculate_field_preference(format, field) for field in self._order)
@@ -1960,13 +1967,16 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m
             'format_note': 'Quality selection URL',
         }
 
+    def _report_ignoring_subs(self, name):
+        self.report_warning(bug_reports_message(
+            f'Ignoring subtitle tracks found in the {name} manifest; '
+            'if any subtitle tracks are missing,'
+        ), only_once=True)
+
     def _extract_m3u8_formats(self, *args, **kwargs):
         fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
         if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the HLS manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('HLS')
         return fmts
 
     def _extract_m3u8_formats_and_subtitles(
@@ -2003,7 +2013,7 @@ def _parse_m3u8_formats_and_subtitles(
         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
             return formats, subtitles
 
-        has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
+        has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
 
         def format_url(url):
             return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
@@ -2214,6 +2224,25 @@ def build_stream_name():
                 last_stream_inf = {}
         return formats, subtitles
 
+    def _extract_m3u8_vod_duration(
+            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+        m3u8_vod = self._download_webpage(
+            m3u8_vod_url, video_id,
+            note='Downloading m3u8 VOD manifest' if note is None else note,
+            errnote='Failed to download VOD manifest' if errnote is None else errnote,
+            fatal=False, data=data, headers=headers, query=query)
+
+        return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
+
+    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
+        if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+            return None
+
+        return int(sum(
+            float(line[len('#EXTINF:'):].split(',')[0])
+            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
+
     @staticmethod
     def _xpath_ns(path, namespace=None):
         if not namespace:
@@ -2245,10 +2274,7 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4
     def _extract_smil_formats(self, *args, **kwargs):
         fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
         if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the SMIL manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('SMIL')
         return fmts
 
     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
@@ -2318,14 +2344,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
         rtmp_count = 0
         http_count = 0
         m3u8_count = 0
+        imgs_count = 0
 
-        srcs = []
+        srcs = set()
         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
         for medium in media:
             src = medium.get('src')
             if not src or src in srcs:
                 continue
-            srcs.append(src)
+            srcs.add(src)
 
             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
@@ -2399,6 +2426,24 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para
                     'height': height,
                 })
 
+        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
+            src = medium.get('src')
+            if not src or src in srcs:
+                continue
+            srcs.add(src)
+
+            imgs_count += 1
+            formats.append({
+                'format_id': 'imagestream-%d' % (imgs_count),
+                'url': src,
+                'ext': mimetype2ext(medium.get('type')),
+                'acodec': 'none',
+                'vcodec': 'none',
+                'width': int_or_none(medium.get('width')),
+                'height': int_or_none(medium.get('height')),
+                'format_note': 'SMIL storyboards',
+            })
+
         return formats
 
     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
@@ -2471,10 +2516,7 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
     def _extract_mpd_formats(self, *args, **kwargs):
         fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
         if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the DASH manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('DASH')
         return fmts
 
     def _extract_mpd_formats_and_subtitles(
@@ -2498,10 +2540,7 @@ def _extract_mpd_formats_and_subtitles(
     def _parse_mpd_formats(self, *args, **kwargs):
         fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
         if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the DASH manifest; "
-                "if any subtitle tracks are missing,"
-            ), only_once=True)
+            self._report_ignoring_subs('DASH')
         return fmts
 
     def _parse_mpd_formats_and_subtitles(
@@ -2584,8 +2623,8 @@ def extract_Initialization(source):
             return ms_info
 
         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
-        formats = []
-        subtitles = {}
+        formats, subtitles = [], {}
+        stream_numbers = {'audio': 0, 'video': 0}
         for period in mpd_doc.findall(_add_ns('Period')):
             period_duration = parse_duration(period.get('duration')) or mpd_duration
             period_ms_info = extract_multisegment_info(period, {
@@ -2618,8 +2657,10 @@ def extract_Initialization(source):
                             base_url = base_url_e.text + base_url
                             if re.match(r'^https?://', base_url):
                                 break
-                    if mpd_base_url and not re.match(r'^https?://', base_url):
-                        if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+                    if mpd_base_url and base_url.startswith('/'):
+                        base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
+                    elif mpd_base_url and not re.match(r'^https?://', base_url):
+                        if not mpd_base_url.endswith('/'):
                             mpd_base_url += '/'
                         base_url = mpd_base_url + base_url
                     representation_id = representation_attrib.get('id')
@@ -2647,8 +2688,10 @@ def extract_Initialization(source):
                             'format_note': 'DASH %s' % content_type,
                             'filesize': filesize,
                             'container': mimetype2ext(mime_type) + '_dash',
+                            'manifest_stream_number': stream_numbers[content_type]
                         }
                         f.update(parse_codecs(codecs))
+                        stream_numbers[content_type] += 1
                     elif content_type == 'text':
                         f = {
                             'ext': mimetype2ext(mime_type),
@@ -2825,10 +2868,7 @@ def add_segment_url():
     def _extract_ism_formats(self, *args, **kwargs):
         fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
         if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the ISM manifest; "
-                "if any subtitle tracks are missing,"
-            ))
+            self._report_ignoring_subs('ISM')
         return fmts
 
     def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
@@ -3088,10 +3128,7 @@ def _media_formats(src, cur_media_type, type_info={}):
     def _extract_akamai_formats(self, *args, **kwargs):
         fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
         if subs:
-            self.report_warning(bug_reports_message(
-                "Ignoring subtitle tracks found in the manifests; "
-                "if any subtitle tracks are missing,"
-            ))
+            self._report_ignoring_subs('akamai')
         return fmts
 
     def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
@@ -3465,6 +3502,32 @@ def extract_subtitles(self, *args, **kwargs):
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def extract_comments(self, *args, **kwargs):
+        if not self.get_param('getcomments'):
+            return None
+        generator = self._get_comments(*args, **kwargs)
+
+        def extractor():
+            comments = []
+            try:
+                while True:
+                    comments.append(next(generator))
+            except KeyboardInterrupt:
+                interrupted = True
+                self.to_screen('Interrupted by user')
+            except StopIteration:
+                interrupted = False
+            comment_count = len(comments)
+            self.to_screen(f'Extracted {comment_count} comments')
+            return {
+                'comments': comments,
+                'comment_count': None if interrupted else comment_count
+            }
+        return extractor
+
+    def _get_comments(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
     @staticmethod
     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
         """ Merge subtitle items for one language. Items with duplicated URLs
@@ -3494,9 +3557,11 @@ def _get_automatic_captions(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
     def mark_watched(self, *args, **kwargs):
-        if (self.get_param('mark_watched', False)
-                and (self._get_login_info()[0] is not None
-                     or self.get_param('cookiefile') is not None)):
+        if not self.get_param('mark_watched', False):
+            return
+        if (self._get_login_info()[0] is not None
+                or self.get_param('cookiefile')
+                or self.get_param('cookiesfrombrowser')):
             self._mark_watched(*args, **kwargs)
 
     def _mark_watched(self, *args, **kwargs):
@@ -3579,7 +3644,14 @@ def _real_extract(self, query):
             return self._get_n_results(query, n)
 
     def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
+        """Get a specified number of results for a query.
+        Either this function or _search_results must be overridden by subclasses """
+        return self.playlist_result(
+            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
+            query, query)
+
+    def _search_results(self, query):
+        """Returns an iterator of search results"""
         raise NotImplementedError('This method must be implemented by subclasses')
 
     @property