from __future__ import unicode_literals
import base64
+import collections
import datetime
import hashlib
+import itertools
import json
import netrc
import os
compat_cookies_SimpleCookie,
compat_etree_Element,
compat_etree_fromstring,
+ compat_expanduser,
compat_getpass,
compat_http_client,
compat_os_name,
GeoRestrictedError,
GeoUtils,
int_or_none,
+ join_nonempty,
js_to_json,
JSON_LD_RE,
mimetype2ext,
strip_or_none,
traverse_obj,
unescapeHTML,
+ UnsupportedError,
unified_strdate,
unified_timestamp,
update_Request,
* width Width of the video, if known
* height Height of the video, if known
* resolution Textual description of width and height
+ * dynamic_range The dynamic range of the video. One of:
+ "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
* tbr Average bitrate of audio and video in KBit/s
* abr Average audio bitrate in KBit/s
* acodec Name of the audio codec in use
* "resolution" (optional, string "{width}x{height}",
deprecated)
* "filesize" (optional, int)
- * "_test_url" (optional, bool) - If true, test the URL
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
series, programme or podcast:
series: Title of the series or programme the video episode belongs to.
+ series_id: Id of the series or programme the video episode belongs to, as a unicode string.
season: Title of the season the video episode belongs to.
season_number: Number of the season the video episode belongs to, as an integer.
season_id: Id of the season the video episode belongs to, as a unicode string.
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
+ Subclasses may also override suitable() if necessary, but ensure the function
+ signature is preserved and that this function imports everything it needs
+ (except other extractors), so that lazy_extractors works correctly
+
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
Though it won't disable explicit geo restriction bypass based on
will be used by geo restriction bypass mechanism similarly
to _GEO_COUNTRIES.
- Finally, the _WORKING attribute should be set to False for broken IEs
+ The _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
"""
_WORKING = True
_LOGIN_HINTS = {
- 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
+ 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
'cookies': (
- 'Use --cookies for the authentication. '
- 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
- 'password': 'Use --username and --password or --netrc to provide account credentials',
+ 'Use --cookies-from-browser or --cookies for the authentication. '
+ 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
+ 'password': 'Use --username and --password, or --netrc to provide account credentials',
}
def __init__(self, downloader=None):
- """Constructor. Receives an optional downloader."""
+ """Constructor. Receives an optional downloader (a YoutubeDL instance).
+ If a downloader is not passed during initialization,
+ it must be set using "set_downloader()" before "extract()" is called"""
self._ready = False
self._x_forwarded_for_ip = None
self._printed_messages = set()
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
+ if '_VALID_URL' not in cls.__dict__:
+ cls._VALID_URL = cls._make_valid_url()
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url)
if self.__maybe_fake_ip_and_retry(e.countries):
continue
raise
+ except UnsupportedError:
+ raise
except ExtractorError as e:
- video_id = e.video_id or self.get_temp_id(url)
- raise ExtractorError(
- e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
+ kwargs = {
+ 'video_id': e.video_id or self.get_temp_id(url),
+ 'ie': self.IE_NAME,
+ 'tb': e.traceback,
+ 'expected': e.expected,
+ 'cause': e.cause
+ }
+ if hasattr(e, 'countries'):
+ kwargs['countries'] = e.countries
+ raise type(e)(e.msg, **kwargs)
except compat_http_client.IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
See _download_webpage docstring for arguments specification.
"""
if not self._downloader._first_webpage_request:
- sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
+ sleep_interval = self.get_param('sleep_interval_requests') or 0
if sleep_interval > 0:
self.to_screen('Sleeping %s seconds ...' % sleep_interval)
time.sleep(sleep_interval)
self._downloader.to_screen(dump)
if self.get_param('write_pages', False):
basen = '%s_%s' % (video_id, urlh.geturl())
- if len(basen) > 240:
+ trim_length = self.get_param('trim_file_name') or 240
+ if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
- basen = basen[:240 - len(h)] + h
+ basen = basen[:trim_length - len(h)] + h
raw_filename = basen + '.dump'
filename = sanitize_filename(raw_filename, restricted=True)
self.to_screen('Saving request to ' + filename)
def raise_login_required(
self, msg='This video is only available for registered users',
metadata_available=False, method='any'):
- if metadata_available and self.get_param('ignore_no_formats_error'):
+ if metadata_available and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg)
if method is not None:
msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
def raise_geo_restricted(
self, msg='This video is not available from your location due to geo restriction',
countries=None, metadata_available=False):
- if metadata_available and self.get_param('ignore_no_formats_error'):
+ if metadata_available and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg)
else:
raise GeoRestrictedError(msg, countries=countries)
def raise_no_formats(self, msg, expected=False, video_id=None):
- if expected and self.get_param('ignore_no_formats_error'):
+ if expected and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg, video_id)
elif isinstance(msg, ExtractorError):
raise msg
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None, video_title=None):
+ def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
"""Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
+ video_info.update(kwargs)
if video_id is not None:
video_info['id'] = video_id
if video_title is not None:
if mobj:
break
- if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
- _name = '\033[0;34m%s\033[0m' % name
- else:
- _name = name
+ _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
if mobj:
if group is None:
if self.get_param('usenetrc', False):
try:
- info = netrc.netrc().authenticators(netrc_machine)
+ netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+ if os.path.isdir(netrc_file):
+ netrc_file = os.path.join(netrc_file, '.netrc')
+ info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
if info is not None:
username = info[0]
password = info[2]
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
continue
+ rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
+ if rating is not None:
+ info['average_rating'] = rating
if item_type in ('TVEpisode', 'Episode'):
episode_name = unescapeHTML(e.get('name'))
info.update({
break
return dict((k, v) for k, v in info.items() if v is not None)
+ def _search_nextjs_data(self, webpage, video_id, **kw):
+ return self._parse_json(
+ self._search_regex(
+ r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
+ webpage, 'next.js data', **kw),
+ video_id, **kw)
+
+ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
+ ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
+ # not all website do this, but it can be changed
+ # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
+ rectx = re.escape(context_name)
+ js, arg_keys, arg_vals = self._search_regex(
+ (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
+ r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
+ webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
+
+ args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
+
+ for key, val in args.items():
+ if val in ('undefined', 'void 0'):
+ args[key] = 'null'
+
+ return self._parse_json(js_to_json(js, args), video_id)['data'][0]
+
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
- 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
- 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
- ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
+ 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
+ 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
+ ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
'height', 'width', 'proto', 'vext', 'abr', 'aext',
- 'fps', 'fs_approx', 'source', 'format_id')
+ 'fps', 'fs_approx', 'source', 'id')
settings = {
'vcodec': {'type': 'ordered', 'regex': True,
'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
'acodec': {'type': 'ordered', 'regex': True,
- 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
+ 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
+ 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
+ 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
- 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
+ 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
'vext': {'type': 'ordered', 'field': 'video_ext',
'order': ('mp4', 'webm', 'flv', '', 'none'),
'order_free': ('webm', 'mp4', 'flv', '', 'none')},
'ie_pref': {'priority': True, 'type': 'extractor'},
'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
- 'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
- 'quality': {'convert': 'float_none', 'default': -1},
+ 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
+ 'quality': {'convert': 'float', 'default': -1},
'filesize': {'convert': 'bytes'},
'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
'id': {'convert': 'string', 'field': 'format_id'},
'vbr': {'convert': 'float_none'},
'abr': {'convert': 'float_none'},
'asr': {'convert': 'float_none'},
- 'source': {'convert': 'ignore', 'field': 'source_preference'},
+ 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
'res': {'type': 'multiple', 'field': ('height', 'width'),
'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
- # Most of these exist only for compatibility reasons
+ # For compatibility with youtube-dl
+ 'format_id': {'type': 'alias', 'field': 'id'},
+ 'preference': {'type': 'alias', 'field': 'ie_pref'},
+ 'language_preference': {'type': 'alias', 'field': 'lang'},
+
+ # Deprecated
'dimension': {'type': 'alias', 'field': 'res'},
'resolution': {'type': 'alias', 'field': 'res'},
'extension': {'type': 'alias', 'field': 'ext'},
'video_bitrate': {'type': 'alias', 'field': 'vbr'},
'audio_bitrate': {'type': 'alias', 'field': 'abr'},
'framerate': {'type': 'alias', 'field': 'fps'},
- 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
'protocol': {'type': 'alias', 'field': 'proto'},
'source_preference': {'type': 'alias', 'field': 'source'},
'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
'audio': {'type': 'alias', 'field': 'hasaud'},
'has_audio': {'type': 'alias', 'field': 'hasaud'},
'extractor': {'type': 'alias', 'field': 'ie_pref'},
- 'preference': {'type': 'alias', 'field': 'ie_pref'},
'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
- 'format_id': {'type': 'alias', 'field': 'id'},
}
- _order = []
+ def __init__(self, ie, field_preference):
+ self._order = []
+ self.ydl = ie._downloader
+ self.evaluate_params(self.ydl.params, field_preference)
+ if ie.get_param('verbose'):
+ self.print_verbose_info(self.ydl.write_debug)
def _get_field_setting(self, field, key):
if field not in self.settings:
+ if key in ('forced', 'priority'):
+ return False
+ self.ydl.deprecation_warning(
+ f'Using arbitrary fields ({field}) for format sorting is deprecated '
+ 'and may be removed in a future version')
self.settings[field] = {}
propObj = self.settings[field]
if key not in propObj:
if field is None:
continue
if self._get_field_setting(field, 'type') == 'alias':
- field = self._get_field_setting(field, 'field')
+ alias, field = field, self._get_field_setting(field, 'field')
+ if alias not in ('format_id', 'preference', 'language_preference'):
+ self.ydl.deprecation_warning(
+ f'Format sorting alias {alias} is deprecated '
+ f'and may be removed in a future version. Please use {field} instead')
reverse = match.group('reverse') is not None
closest = match.group('separator') == '~'
limit_text = match.group('limit')
has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
- limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
+ limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
limit_count = len(limits)
for (i, f) in enumerate(fields):
add_item(f, reverse, closest,
if format.get('vbr') is not None and format.get('abr') is not None:
format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
else:
- if format.get('vcodec') != "none" and format.get('vbr') is None:
+ if format.get('vcodec') != 'none' and format.get('vbr') is None:
format['vbr'] = format.get('tbr') - format.get('abr', 0)
- if format.get('acodec') != "none" and format.get('abr') is None:
+ if format.get('acodec') != 'none' and format.get('abr') is None:
format['abr'] = format.get('tbr') - format.get('vbr', 0)
return tuple(self._calculate_field_preference(format, field) for field in self._order)
def _sort_formats(self, formats, field_preference=[]):
if not formats:
return
- format_sort = self.FormatSort() # params and to_screen are taken from the downloader
- format_sort.evaluate_params(self._downloader.params, field_preference)
- if self.get_param('verbose', False):
- format_sort.print_verbose_info(self._downloader.write_debug)
+ format_sort = self.FormatSort(self, field_preference)
formats.sort(key=lambda f: format_sort.calculate_preference(f))
def _check_formats(self, formats, video_id):
tbr = int_or_none(media_el.attrib.get('bitrate'))
width = int_or_none(media_el.attrib.get('width'))
height = int_or_none(media_el.attrib.get('height'))
- format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+ format_id = join_nonempty(f4m_id, tbr or i)
# If <bootstrapInfo> is present, the specified f4m is a
# stream-level manifest, and only set-level manifests may refer to
# external resources. See section 11.4 and section 4 of F4M spec
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
return {
- 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+ 'format_id': join_nonempty(m3u8_id, 'meta'),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
'format_note': 'Quality selection URL',
}
+ def _report_ignoring_subs(self, name):
+ self.report_warning(bug_reports_message(
+ f'Ignoring subtitle tracks found in the {name} manifest; '
+ 'if any subtitle tracks are missing,'
+ ), only_once=True)
+
def _extract_m3u8_formats(self, *args, **kwargs):
fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
if subs:
- self.report_warning(bug_reports_message(
- "Ignoring subtitle tracks found in the HLS manifest; "
- "if any subtitle tracks are missing,"
- ), only_once=True)
+ self._report_ignoring_subs('HLS')
return fmts
def _extract_m3u8_formats_and_subtitles(
video_id=None):
formats, subtitles = [], {}
- if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
- return formats, subtitles
-
- has_drm = re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)
+ has_drm = re.search('|'.join([
+ r'#EXT-X-FAXS-CM:', # Adobe Flash Access
+ r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
+ ]), m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
formats = [{
- 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
+ 'format_id': join_nonempty(m3u8_id, idx),
'format_index': idx,
'url': m3u8_url,
'ext': ext,
if media_url:
manifest_url = format_url(media_url)
formats.extend({
- 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
+ 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
'format_note': name,
'format_index': idx,
'url': manifest_url,
# format_id intact.
if not live:
stream_name = build_stream_name()
- format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
+ format_id[1] = stream_name or '%d' % (tbr or len(formats))
f = {
- 'format_id': '-'.join(map(str, filter(None, format_id))),
+ 'format_id': join_nonempty(*format_id),
'format_index': idx,
'url': manifest_url,
'manifest_url': m3u8_url,
last_stream_inf = {}
return formats, subtitles
+ def _extract_m3u8_vod_duration(
+ self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+ m3u8_vod = self._download_webpage(
+ m3u8_vod_url, video_id,
+ note='Downloading m3u8 VOD manifest' if note is None else note,
+ errnote='Failed to download VOD manifest' if errnote is None else errnote,
+ fatal=False, data=data, headers=headers, query=query)
+
+ return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
+
+ def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
+ if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+ return None
+
+ return int(sum(
+ float(line[len('#EXTINF:'):].split(',')[0])
+ for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
+
@staticmethod
def _xpath_ns(path, namespace=None):
if not namespace:
def _extract_smil_formats(self, *args, **kwargs):
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
if subs:
- self.report_warning(bug_reports_message(
- "Ignoring subtitle tracks found in the SMIL manifest; "
- "if any subtitle tracks are missing,"
- ), only_once=True)
+ self._report_ignoring_subs('SMIL')
return fmts
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
rtmp_count = 0
http_count = 0
m3u8_count = 0
+ imgs_count = 0
- srcs = []
+ srcs = set()
media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
for medium in media:
src = medium.get('src')
if not src or src in srcs:
continue
- srcs.append(src)
+ srcs.add(src)
bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
'height': height,
})
+ for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
+ src = medium.get('src')
+ if not src or src in srcs:
+ continue
+ srcs.add(src)
+
+ imgs_count += 1
+ formats.append({
+ 'format_id': 'imagestream-%d' % (imgs_count),
+ 'url': src,
+ 'ext': mimetype2ext(medium.get('type')),
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ 'width': int_or_none(medium.get('width')),
+ 'height': int_or_none(medium.get('height')),
+ 'format_note': 'SMIL storyboards',
+ })
+
return formats
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
def _extract_mpd_formats(self, *args, **kwargs):
fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
- self.report_warning(bug_reports_message(
- "Ignoring subtitle tracks found in the DASH manifest; "
- "if any subtitle tracks are missing,"
- ), only_once=True)
+ self._report_ignoring_subs('DASH')
return fmts
def _extract_mpd_formats_and_subtitles(
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
if subs:
- self.report_warning(bug_reports_message(
- "Ignoring subtitle tracks found in the DASH manifest; "
- "if any subtitle tracks are missing,"
- ), only_once=True)
+ self._report_ignoring_subs('DASH')
return fmts
def _parse_mpd_formats_and_subtitles(
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats, subtitles = [], {}
- stream_numbers = {'audio': 0, 'video': 0}
+ stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
content_type = mime_type
elif codecs.split('.')[0] == 'stpp':
content_type = 'text'
+ elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
+ content_type = 'text'
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
continue
base_url = base_url_e.text + base_url
if re.match(r'^https?://', base_url):
break
- if mpd_base_url and not re.match(r'^https?://', base_url):
- if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+ if mpd_base_url and base_url.startswith('/'):
+ base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
+ elif mpd_base_url and not re.match(r'^https?://', base_url):
+ if not mpd_base_url.endswith('/'):
mpd_base_url += '/'
base_url = mpd_base_url + base_url
representation_id = representation_attrib.get('id')
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
- 'manifest_stream_number': stream_numbers[content_type]
}
f.update(parse_codecs(codecs))
- stream_numbers[content_type] += 1
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
- if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
+ if content_type in ('video', 'audio', 'image/jpeg'):
+ f['manifest_stream_number'] = stream_numbers[f['url']]
+ stream_numbers[f['url']] += 1
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
if subs:
- self.report_warning(bug_reports_message(
- "Ignoring subtitle tracks found in the ISM manifest; "
- "if any subtitle tracks are missing,"
- ))
+ self._report_ignoring_subs('ISM')
return fmts
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
})
fragment_ctx['time'] += fragment_ctx['duration']
- format_id = []
- if ism_id:
- format_id.append(ism_id)
- if stream_name:
- format_id.append(stream_name)
- format_id.append(compat_str(tbr))
-
if stream_type == 'text':
subtitles.setdefault(stream_language, []).append({
'ext': 'ismt',
})
elif stream_type in ('video', 'audio'):
formats.append({
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty(ism_id, stream_name, tbr),
'url': ism_url,
'manifest_url': ism_url,
'ext': 'ismv' if stream_type == 'video' else 'isma',
def _extract_akamai_formats(self, *args, **kwargs):
fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
if subs:
- self.report_warning(bug_reports_message(
- "Ignoring subtitle tracks found in the manifests; "
- "if any subtitle tracks are missing,"
- ))
+ self._report_ignoring_subs('akamai')
return fmts
def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
def _get_subtitles(self, *args, **kwargs):
raise NotImplementedError('This method must be implemented by subclasses')
+ def extract_comments(self, *args, **kwargs):
+ if not self.get_param('getcomments'):
+ return None
+ generator = self._get_comments(*args, **kwargs)
+
+ def extractor():
+ comments = []
+ interrupted = True
+ try:
+ while True:
+ comments.append(next(generator))
+ except StopIteration:
+ interrupted = False
+ except KeyboardInterrupt:
+ self.to_screen('Interrupted by user')
+ except Exception as e:
+ if self.get_param('ignoreerrors') is not True:
+ raise
+ self._downloader.report_error(e)
+ comment_count = len(comments)
+ self.to_screen(f'Extracted {comment_count} comments')
+ return {
+ 'comments': comments,
+ 'comment_count': None if interrupted else comment_count
+ }
+ return extractor
+
+ def _get_comments(self, *args, **kwargs):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
""" Merge subtitle items for one language. Items with duplicated URLs
raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
- if (self.get_param('mark_watched', False)
- and (self._get_login_info()[0] is not None
- or self.get_param('cookiefile') is not None)):
+ if not self.get_param('mark_watched', False):
+ return
+ if (self._get_login_info()[0] is not None
+ or self.get_param('cookiefile')
+ or self.get_param('cookiesfrombrowser')):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
"""
Base class for paged search queries extractors.
They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
- Instances should define _SEARCH_KEY and _MAX_RESULTS.
+ Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
"""
+ _MAX_RESULTS = float('inf')
+
@classmethod
def _make_valid_url(cls):
return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
- @classmethod
- def suitable(cls, url):
- return re.match(cls._make_valid_url(), url) is not None
-
def _real_extract(self, query):
- mobj = re.match(self._make_valid_url(), query)
- if mobj is None:
- raise ExtractorError('Invalid search query "%s"' % query)
-
- prefix = mobj.group('prefix')
- query = mobj.group('query')
+ prefix, query = self._match_valid_url(query).group('prefix', 'query')
if prefix == '':
return self._get_n_results(query, 1)
elif prefix == 'all':
return self._get_n_results(query, n)
def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
+ """Get a specified number of results for a query.
+ Either this function or _search_results must be overridden by subclasses """
+ return self.playlist_result(
+ itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
+ query, query)
+
+ def _search_results(self, query):
+ """Returns an iterator of search results"""
raise NotImplementedError('This method must be implemented by subclasses')
@property