X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/1151c4079a15939c981f3e30129b82dea1b1bc6d..a2160aa45f4019e02ced01c9030aa9519b40b24f:/yt_dlp/extractor/common.py diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 4e6deee1c..5b7b8891a 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -4,6 +4,7 @@ import base64 import datetime import hashlib +import itertools import json import netrc import os @@ -18,6 +19,7 @@ compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, + compat_expanduser, compat_getpass, compat_http_client, compat_os_name, @@ -405,6 +407,10 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + Subclasses may also override suitable() if necessary, but ensure the function + signature is preserved and that this function imports everything it needs + (except other extractors), so that lazy_extractors works correctly + _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. Though it won't disable explicit geo restriction bypass based on @@ -420,7 +426,7 @@ class InfoExtractor(object): will be used by geo restriction bypass mechanism similarly to _GEO_COUNTRIES. - Finally, the _WORKING attribute should be set to False for broken IEs + The _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -788,9 +794,10 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno self._downloader.to_screen(dump) if self.get_param('write_pages', False): basen = '%s_%s' % (video_id, urlh.geturl()) - if len(basen) > 240: + trim_length = self.get_param('trim_file_name') or 240 + if len(basen) > trim_length: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:240 - len(h)] + h + basen = basen[:trim_length - len(h)] + h raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) self.to_screen('Saving request to ' + filename) @@ -1128,10 +1135,7 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f if mobj: break - if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty(): - _name = '\033[0;34m%s\033[0m' % name - else: - _name = name + _name = self._downloader._color_text(name, 'blue') if mobj: if group is None: @@ -1166,7 +1170,10 @@ def _get_netrc_login_info(self, netrc_machine=None): if self.get_param('usenetrc', False): try: - info = netrc.netrc().authenticators(netrc_machine) + netrc_file = compat_expanduser(self.get_param('netrc_location') or '~') + if os.path.isdir(netrc_file): + netrc_file = os.path.join(netrc_file, '.netrc') + info = netrc.netrc(file=netrc_file).authenticators(netrc_machine) if info is not None: username = info[0] password = info[2] @@ -1502,7 +1509,7 @@ class FormatSort: default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases - ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr', + ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', 'fps', 'fs_approx', 'source', 'format_id') @@ -1526,7 +1533,7 @@ class FormatSort: 'ie_pref': {'priority': True, 'type': 'extractor'}, 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'}, + 'lang': {'convert': 'ignore', 'field': 'language_preference'}, 'quality': {'convert': 'float_none', 'default': -1}, 'filesize': {'convert': 'bytes'}, 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, @@ -1672,7 +1679,7 @@ def add_item(field, reverse, closest, limit_text): has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) - limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple() + limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() limit_count = len(limits) for (i, f) in enumerate(fields): add_item(f, reverse, closest, @@ -1756,9 +1763,9 @@ def calculate_preference(self, format): if format.get('vbr') is not None and format.get('abr') is not None: format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) else: - if format.get('vcodec') != "none" and format.get('vbr') is None: + if format.get('vcodec') != 'none' and format.get('vbr') is None: format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != "none" and format.get('abr') is None: + if format.get('acodec') != 'none' and format.get('abr') is None: format['abr'] = format.get('tbr') - format.get('vbr', 0) return tuple(self._calculate_field_preference(format, field) for field in self._order) @@ -1960,13 +1967,16 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m 'format_note': 'Quality selection URL', } + def _report_ignoring_subs(self, name): + self.report_warning(bug_reports_message( + f'Ignoring subtitle tracks found in the {name} manifest; ' + 'if any subtitle tracks are missing,' + ), only_once=True) + def _extract_m3u8_formats(self, *args, **kwargs): fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the HLS manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('HLS') return fmts def _extract_m3u8_formats_and_subtitles( @@ -2003,7 +2013,7 @@ def _parse_m3u8_formats_and_subtitles( if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return formats, subtitles - has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2214,6 +2224,25 @@ def build_stream_name(): last_stream_inf = {} return formats, subtitles + def _extract_m3u8_vod_duration( + self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}): + + m3u8_vod = self._download_webpage( + m3u8_vod_url, video_id, + note='Downloading m3u8 VOD manifest' if note is None else note, + errnote='Failed to download VOD manifest' if errnote is None else errnote, + fatal=False, data=data, headers=headers, query=query) + + return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id) + + def _parse_m3u8_vod_duration(self, m3u8_vod, video_id): + if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod: + return None + + return int(sum( + float(line[len('#EXTINF:'):].split(',')[0]) + for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None + @staticmethod def _xpath_ns(path, namespace=None): if not namespace: @@ -2245,10 +2274,7 @@ def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4 def _extract_smil_formats(self, *args, **kwargs): fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the SMIL manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('SMIL') return fmts def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): @@ -2318,14 +2344,15 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para rtmp_count = 0 http_count = 0 m3u8_count = 0 + imgs_count = 0 - srcs = [] + srcs = set() media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) for medium in media: src = medium.get('src') if not src or src in srcs: continue - srcs.append(src) + srcs.add(src) bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) filesize = int_or_none(medium.get('size') or medium.get('fileSize')) @@ -2399,6 +2426,24 @@ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_para 'height': height, }) + for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)): + src = medium.get('src') + if not src or src in srcs: + continue + srcs.add(src) + + imgs_count += 1 + formats.append({ + 'format_id': 'imagestream-%d' % (imgs_count), + 'url': src, + 'ext': mimetype2ext(medium.get('type')), + 'acodec': 'none', + 'vcodec': 'none', + 'width': int_or_none(medium.get('width')), + 'height': int_or_none(medium.get('height')), + 'format_note': 'SMIL storyboards', + }) + return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): @@ -2471,10 +2516,7 @@ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None): def _extract_mpd_formats(self, *args, **kwargs): fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the DASH manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('DASH') return fmts def _extract_mpd_formats_and_subtitles( @@ -2498,10 +2540,7 @@ def _extract_mpd_formats_and_subtitles( def _parse_mpd_formats(self, *args, **kwargs): fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the DASH manifest; " - "if any subtitle tracks are missing," - ), only_once=True) + self._report_ignoring_subs('DASH') return fmts def _parse_mpd_formats_and_subtitles( @@ -2584,8 +2623,8 @@ def extract_Initialization(source): return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) - formats = [] - subtitles = {} + formats, subtitles = [], {} + stream_numbers = {'audio': 0, 'video': 0} for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2618,8 +2657,10 @@ def extract_Initialization(source): base_url = base_url_e.text + base_url if re.match(r'^https?://', base_url): break - if mpd_base_url and not re.match(r'^https?://', base_url): - if not mpd_base_url.endswith('/') and not base_url.startswith('/'): + if mpd_base_url and base_url.startswith('/'): + base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + elif mpd_base_url and not re.match(r'^https?://', base_url): + if not mpd_base_url.endswith('/'): mpd_base_url += '/' base_url = mpd_base_url + base_url representation_id = representation_attrib.get('id') @@ -2647,8 +2688,10 @@ def extract_Initialization(source): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', + 'manifest_stream_number': stream_numbers[content_type] } f.update(parse_codecs(codecs)) + stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2825,10 +2868,7 @@ def add_segment_url(): def _extract_ism_formats(self, *args, **kwargs): fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the ISM manifest; " - "if any subtitle tracks are missing," - )) + self._report_ignoring_subs('ISM') return fmts def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): @@ -3088,10 +3128,7 @@ def _media_formats(src, cur_media_type, type_info={}): def _extract_akamai_formats(self, *args, **kwargs): fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs) if subs: - self.report_warning(bug_reports_message( - "Ignoring subtitle tracks found in the manifests; " - "if any subtitle tracks are missing," - )) + self._report_ignoring_subs('akamai') return fmts def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}): @@ -3465,6 +3502,32 @@ def extract_subtitles(self, *args, **kwargs): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def extract_comments(self, *args, **kwargs): + if not self.get_param('getcomments'): + return None + generator = self._get_comments(*args, **kwargs) + + def extractor(): + comments = [] + try: + while True: + comments.append(next(generator)) + except KeyboardInterrupt: + interrupted = True + self.to_screen('Interrupted by user') + except StopIteration: + interrupted = False + comment_count = len(comments) + self.to_screen(f'Extracted {comment_count} comments') + return { + 'comments': comments, + 'comment_count': None if interrupted else comment_count + } + return extractor + + def _get_comments(self, *args, **kwargs): + raise NotImplementedError('This method must be implemented by subclasses') + @staticmethod def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. Items with duplicated URLs @@ -3494,9 +3557,11 @@ def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') def mark_watched(self, *args, **kwargs): - if (self.get_param('mark_watched', False) - and (self._get_login_info()[0] is not None - or self.get_param('cookiefile') is not None)): + if not self.get_param('mark_watched', False): + return + if (self._get_login_info()[0] is not None + or self.get_param('cookiefile') + or self.get_param('cookiesfrombrowser')): self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3579,7 +3644,14 @@ def _real_extract(self, query): return self._get_n_results(query, n) def _get_n_results(self, query, n): - """Get a specified number of results for a query""" + """Get a specified number of results for a query. + Either this function or _search_results must be overridden by subclasses """ + return self.playlist_result( + itertools.islice(self._search_results(query), 0, None if n == float('inf') else n), + query, query) + + def _search_results(self, query): + """Returns an iterator of search results""" raise NotImplementedError('This method must be implemented by subclasses') @property