compat_urlparse,
compat_xml_parse_error,
)
+from ..downloader import FileDownloader
from ..downloader.f4m import (
get_base_url,
remove_encrypted_media,
There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification.
- Additionally, playlists can have "id", "title", "description", "uploader",
- "uploader_id", "uploader_url" attributes with the same semantics as videos
- (see above).
+ Additionally, playlists can have "id", "title", and any other relevant
+ attributes with the same semantics as videos (see above).
_type "multi_video" indicates that there are multiple videos that
urls, playlist_id=playlist_id, playlist_title=playlist_title)
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
"""Returns a playlist"""
video_info = {'_type': 'playlist',
'entries': entries}
+ video_info.update(kwargs)
if playlist_id:
video_info['id'] = playlist_id
if playlist_title:
'ViewAction': 'view',
}
+ def extract_interaction_type(e):
+ interaction_type = e.get('interactionType')
+ if isinstance(interaction_type, dict):
+ interaction_type = interaction_type.get('@type')
+ return str_or_none(interaction_type)
+
def extract_interaction_statistic(e):
interaction_statistic = e.get('interactionStatistic')
+ if isinstance(interaction_statistic, dict):
+ interaction_statistic = [interaction_statistic]
if not isinstance(interaction_statistic, list):
return
for is_e in interaction_statistic:
continue
if is_e.get('@type') != 'InteractionCounter':
continue
- interaction_type = is_e.get('interactionType')
- if not isinstance(interaction_type, compat_str):
+ interaction_type = extract_interaction_type(is_e)
+ if not interaction_type:
continue
# For interaction count some sites provide string instead of
# an integer (as per spec) with non digit characters (e.g. ",")
html, '%s form' % form_id, group='form')
return self._hidden_inputs(form)
- def _sort_formats(self, formats, field_preference=None):
- if not formats:
- raise ExtractorError('No video formats found')
+ class FormatSort:
+ regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
+
+ default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
+ 'res', 'fps', 'codec:vp9', 'size', 'br', 'asr',
+ 'proto', 'ext', 'has_audio', 'source', 'format_id') # These must not be aliases
+
+ settings = {
+ 'vcodec': {'type': 'ordered', 'regex': True,
+ 'order': ['av0?1', 'vp9', '(h265|he?vc?)', '(h264|avc)', 'vp8', '(mp4v|h263)', 'theora', '', None, 'none']},
+ 'acodec': {'type': 'ordered', 'regex': True,
+ 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
+ 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
+ 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
+ 'vext': {'type': 'ordered', 'field': 'video_ext',
+ 'order': ('mp4', 'webm', 'flv', '', 'none'),
+ 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
+ 'aext': {'type': 'ordered', 'field': 'audio_ext',
+ 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
+ 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
+ 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
+ 'ie_pref': {'priority': True, 'type': 'extractor', 'field': 'extractor_preference'},
+ 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
+ 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
+ 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
+ 'quality': {'priority': True, 'convert': 'float_none'},
+ 'filesize': {'convert': 'bytes'},
+ 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
+ 'id': {'convert': 'string', 'field': 'format_id'},
+ 'height': {'convert': 'float_none'},
+ 'width': {'convert': 'float_none'},
+ 'fps': {'convert': 'float_none'},
+ 'tbr': {'convert': 'float_none'},
+ 'vbr': {'convert': 'float_none'},
+ 'abr': {'convert': 'float_none'},
+ 'asr': {'convert': 'float_none'},
+ 'source': {'convert': 'ignore', 'field': 'source_preference'},
+
+ 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
+ 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
+ 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
+ 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
+ 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
+
+ # Most of these exist only for compatibility reasons
+ 'dimension': {'type': 'alias', 'field': 'res'},
+ 'resolution': {'type': 'alias', 'field': 'res'},
+ 'extension': {'type': 'alias', 'field': 'ext'},
+ 'bitrate': {'type': 'alias', 'field': 'br'},
+ 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
+ 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
+ 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
+ 'framerate': {'type': 'alias', 'field': 'fps'},
+ 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
+ 'protocol': {'type': 'alias', 'field': 'proto'},
+ 'source_preference': {'type': 'alias', 'field': 'source'},
+ 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
+ 'filesize_estimate': {'type': 'alias', 'field': 'size'},
+ 'samplerate': {'type': 'alias', 'field': 'asr'},
+ 'video_ext': {'type': 'alias', 'field': 'vext'},
+ 'audio_ext': {'type': 'alias', 'field': 'aext'},
+ 'video_codec': {'type': 'alias', 'field': 'vcodec'},
+ 'audio_codec': {'type': 'alias', 'field': 'acodec'},
+ 'video': {'type': 'alias', 'field': 'hasvid'},
+ 'has_video': {'type': 'alias', 'field': 'hasvid'},
+ 'audio': {'type': 'alias', 'field': 'hasaud'},
+ 'has_audio': {'type': 'alias', 'field': 'hasaud'},
+ 'extractor': {'type': 'alias', 'field': 'ie_pref'},
+ 'preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'format_id': {'type': 'alias', 'field': 'id'},
+ }
- for f in formats:
- # Automatically determine tbr when missing based on abr and vbr (improves
- # formats sorting in some cases)
- if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
- f['tbr'] = f['abr'] + f['vbr']
-
- def _formats_key(f):
- # TODO remove the following workaround
- from ..utils import determine_ext
- if not f.get('ext') and 'url' in f:
- f['ext'] = determine_ext(f['url'])
-
- if isinstance(field_preference, (list, tuple)):
- return tuple(
- f.get(field)
- if f.get(field) is not None
- else ('' if field == 'format_id' else -1)
- for field in field_preference)
-
- preference = f.get('preference')
- if preference is None:
- preference = 0
- if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
- preference -= 0.5
-
- protocol = f.get('protocol') or determine_protocol(f)
- proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
-
- if f.get('vcodec') == 'none': # audio only
- preference -= 50
- if self._downloader.params.get('prefer_free_formats'):
- ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
+ _order = []
+
+ def _get_field_setting(self, field, key):
+ if field not in self.settings:
+ self.settings[field] = {}
+ propObj = self.settings[field]
+ if key not in propObj:
+ type = propObj.get('type')
+ if key == 'field':
+ default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
+ elif key == 'convert':
+ default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
else:
- ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
- ext_preference = 0
- try:
- audio_ext_preference = ORDER.index(f['ext'])
- except ValueError:
- audio_ext_preference = -1
+ default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
+ propObj[key] = default
+ return propObj[key]
+
+ def _resolve_field_value(self, field, value, convertNone=False):
+ if value is None:
+ if not convertNone:
+ return None
+ else:
+ value = value.lower()
+ conversion = self._get_field_setting(field, 'convert')
+ if conversion == 'ignore':
+ return None
+ if conversion == 'string':
+ return value
+ elif conversion == 'float_none':
+ return float_or_none(value)
+ elif conversion == 'bytes':
+ return FileDownloader.parse_bytes(value)
+ elif conversion == 'order':
+ order_free = self._get_field_setting(field, 'order_free')
+ order_list = order_free if order_free and self._use_free_order else self._get_field_setting(field, 'order')
+ use_regex = self._get_field_setting(field, 'regex')
+ list_length = len(order_list)
+ empty_pos = order_list.index('') if '' in order_list else list_length + 1
+ if use_regex and value is not None:
+ for (i, regex) in enumerate(order_list):
+ if regex and re.match(regex, value):
+ return list_length - i
+ return list_length - empty_pos # not in list
+ else: # not regex or value = None
+ return list_length - (order_list.index(value) if value in order_list else empty_pos)
else:
- if f.get('acodec') == 'none': # video only
- preference -= 40
- if self._downloader.params.get('prefer_free_formats'):
- ORDER = ['flv', 'mp4', 'webm']
+ if value.isnumeric():
+ return float(value)
else:
- ORDER = ['webm', 'flv', 'mp4']
- try:
- ext_preference = ORDER.index(f['ext'])
- except ValueError:
- ext_preference = -1
- audio_ext_preference = 0
-
- return (
- preference,
- f.get('language_preference') if f.get('language_preference') is not None else -1,
- f.get('quality') if f.get('quality') is not None else -1,
- f.get('tbr') if f.get('tbr') is not None else -1,
- f.get('filesize') if f.get('filesize') is not None else -1,
- f.get('vbr') if f.get('vbr') is not None else -1,
- f.get('height') if f.get('height') is not None else -1,
- f.get('width') if f.get('width') is not None else -1,
- proto_preference,
- ext_preference,
- f.get('abr') if f.get('abr') is not None else -1,
- audio_ext_preference,
- f.get('fps') if f.get('fps') is not None else -1,
- f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
- f.get('source_preference') if f.get('source_preference') is not None else -1,
- f.get('format_id') if f.get('format_id') is not None else '',
- )
- formats.sort(key=_formats_key)
+ self.settings[field]['convert'] = 'string'
+ return value
+
+ def evaluate_params(self, params, sort_extractor):
+ self._use_free_order = params.get('prefer_free_formats', False)
+ self._sort_user = params.get('format_sort', [])
+ self._sort_extractor = sort_extractor
+
+ def add_item(field, reverse, closest, limit_text):
+ field = field.lower()
+ if field in self._order:
+ return
+ self._order.append(field)
+ limit = self._resolve_field_value(field, limit_text)
+ data = {
+ 'reverse': reverse,
+ 'closest': False if limit is None else closest,
+ 'limit_text': limit_text,
+ 'limit': limit}
+ if field in self.settings:
+ self.settings[field].update(data)
+ else:
+ self.settings[field] = data
+
+ sort_list = (
+ tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
+ + (tuple() if params.get('format_sort_force', False)
+ else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
+ + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
+
+ for item in sort_list:
+ match = re.match(self.regex, item)
+ if match is None:
+ raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
+ field = match.group('field')
+ if field is None:
+ continue
+ if self._get_field_setting(field, 'type') == 'alias':
+ field = self._get_field_setting(field, 'field')
+ reverse = match.group('reverse') is not None
+ closest = match.group('seperator') == '~'
+ limit_text = match.group('limit')
+
+ has_limit = limit_text is not None
+ has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
+ has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
+
+ fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
+ limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
+ limit_count = len(limits)
+ for (i, f) in enumerate(fields):
+ add_item(f, reverse, closest,
+ limits[i] if i < limit_count
+ else limits[0] if has_limit and not has_multiple_limits
+ else None)
+
+ def print_verbose_info(self, to_screen):
+ to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
+ if self._sort_extractor:
+ to_screen('[debug] Sort order given by extractor: %s' % ','.join(self._sort_extractor))
+ to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
+ '+' if self._get_field_setting(field, 'reverse') else '', field,
+ '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
+ self._get_field_setting(field, 'limit_text'),
+ self._get_field_setting(field, 'limit'))
+ if self._get_field_setting(field, 'limit_text') is not None else '')
+ for field in self._order if self._get_field_setting(field, 'visible')]))
+
+ def _calculate_field_preference_from_value(self, format, field, type, value):
+ reverse = self._get_field_setting(field, 'reverse')
+ closest = self._get_field_setting(field, 'closest')
+ limit = self._get_field_setting(field, 'limit')
+
+ if type == 'extractor':
+ maximum = self._get_field_setting(field, 'max')
+ if value is None or (maximum is not None and value >= maximum):
+ value = 0
+ elif type == 'boolean':
+ in_list = self._get_field_setting(field, 'in_list')
+ not_in_list = self._get_field_setting(field, 'not_in_list')
+ value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
+ elif type == 'ordered':
+ value = self._resolve_field_value(field, value, True)
+
+ # try to convert to number
+ val_num = float_or_none(value)
+ is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
+ if is_num:
+ value = val_num
+
+ return ((-10, 0) if value is None
+ else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
+ else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
+ else (0, value, 0) if not reverse and (limit is None or value <= limit)
+ else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
+ else (-1, value, 0))
+
+ def _calculate_field_preference(self, format, field):
+ type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
+ get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
+ if type == 'multiple':
+ type = 'field' # Only 'field' is allowed in multiple for now
+ actual_fields = self._get_field_setting(field, 'field')
+
+ def wrapped_function(values):
+ values = tuple(filter(lambda x: x is not None, values))
+ return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
+ else values[0] if values
+ else None)
+
+ value = wrapped_function((get_value(f) for f in actual_fields))
+ else:
+ value = get_value(field)
+ return self._calculate_field_preference_from_value(format, field, type, value)
+
+ def calculate_preference(self, format):
+ # Determine missing protocol
+ if not format.get('protocol'):
+ format['protocol'] = determine_protocol(format)
+
+ # Determine missing ext
+ if not format.get('ext') and 'url' in format:
+ format['ext'] = determine_ext(format['url'])
+ if format.get('vcodec') == 'none':
+ format['audio_ext'] = format['ext']
+ format['video_ext'] = 'none'
+ else:
+ format['video_ext'] = format['ext']
+ format['audio_ext'] = 'none'
+ # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
+ # format['preference'] = -1000
+
+ # Determine missing bitrates
+ if format.get('tbr') is None:
+ if format.get('vbr') is not None and format.get('abr') is not None:
+ format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
+ else:
+ if format.get('vcodec') != "none" and format.get('vbr') is None:
+ format['vbr'] = format.get('tbr') - format.get('abr', 0)
+ if format.get('acodec') != "none" and format.get('abr') is None:
+ format['abr'] = format.get('tbr') - format.get('vbr', 0)
+
+ return tuple(self._calculate_field_preference(format, field) for field in self._order)
+
+ def _sort_formats(self, formats, field_preference=[]):
+ if not formats:
+ raise ExtractorError('No video formats found')
+ format_sort = self.FormatSort() # params and to_screen are taken from the downloader
+ format_sort.evaluate_params(self._downloader.params, field_preference)
+ if self._downloader.params.get('verbose', False):
+ format_sort.print_verbose_info(self._downloader.to_screen)
+ formats.sort(key=lambda f: format_sort.calculate_preference(f))
def _check_formats(self, formats, video_id):
if formats:
})
return entries
- def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
+ def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
mpd_base_url = base_url(urlh.geturl())
return self._parse_mpd_formats(
- mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
- formats_dict=formats_dict, mpd_url=mpd_url)
+ mpd_doc, mpd_id, mpd_base_url, mpd_url)
- def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
+ def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
-
- # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
- # is not necessarily unique within a Period thus formats with
- # the same `format_id` are quite possible. There are numerous examples
- # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
- # https://github.com/ytdl-org/youtube-dl/issues/13919)
- full_info = formats_dict.get(representation_id, {}).copy()
- full_info.update(f)
- formats.append(full_info)
+ formats.append(f)
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
# amp-video and amp-audio are very similar to their HTML5 counterparts
# so we will include them right here (see
# https://www.ampproject.org/docs/reference/components/amp-video)
- media_tags = [(media_tag, media_type, '')
- for media_tag, media_type
- in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
+ # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
+ _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
+ media_tags = [(media_tag, media_tag_name, media_type, '')
+ for media_tag, media_tag_name, media_type
+ in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
# https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
- r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
- for media_tag, media_type, media_content in media_tags:
+ r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
+ for media_tag, _, media_type, media_content in media_tags:
media_info = {
'formats': [],
'subtitles': {},
return entries
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+ signed = 'hdnea=' in manifest_url
+ if not signed:
+ # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
+ manifest_url = re.sub(
+ r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
+ '', manifest_url).strip('?')
+
formats = []
hdcore_sign = 'hdcore=3.7.0'
hls_host = hosts.get('hls')
if hls_host:
m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
- formats.extend(self._extract_m3u8_formats(
+ m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_formats)
http_host = hosts.get('http')
- if http_host and 'hdnea=' not in manifest_url:
- REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
+ if http_host and m3u8_formats and not signed:
+ REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
qualities_length = len(qualities)
- if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
+ if len(m3u8_formats) in (qualities_length, qualities_length + 1):
i = 0
- http_formats = []
- for f in formats:
- if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none':
+ for f in m3u8_formats:
+ if f['vcodec'] != 'none':
for protocol in ('http', 'https'):
http_f = f.copy()
del http_f['manifest_url']
http_url = re.sub(
- REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
+ REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
http_f.update({
'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
'url': http_url,
'protocol': protocol,
})
- http_formats.append(http_f)
+ formats.append(http_f)
i += 1
- formats.extend(http_formats)
return formats