import sys
import time
import types
-import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from ..compat import functools # isort: split
-from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
+from ..compat import (
+ compat_etree_fromstring,
+ compat_expanduser,
+ compat_os_name,
+ urllib_req_to_req,
+)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..downloader.hls import HlsFD
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import (
+ HTTPError,
+ IncompleteRead,
+ network_exceptions,
+)
from ..utils import (
IDENTITY,
JSON_LD_RE,
FormatSorter,
GeoRestrictedError,
GeoUtils,
- HEADRequest,
LenientJSONDecoder,
Popen,
RegexNotFoundError,
js_to_json,
mimetype2ext,
netrc_from_content,
- network_exceptions,
orderedSet,
parse_bitrate,
parse_codecs,
parse_resolution,
sanitize_filename,
sanitize_url,
- sanitized_Request,
smuggle_url,
str_or_none,
str_to_int,
unescapeHTML,
unified_strdate,
unified_timestamp,
- update_Request,
- update_url_query,
url_basename,
url_or_none,
urlhandle_detect_ext,
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
* ffmpeg_args Extra arguments for ffmpeg downloader
+ * is_dash_periods Whether the format is a result of merging
+ multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
If it is not clear whether to use timestamp or this, use the former
release_date: The date (YYYYMMDD) when the video was released in UTC.
If not explicitly set, calculated from release_timestamp
+ release_year: Year (YYYY) as integer when the video or album was released.
+ To be used if no exact release date is known.
+ If not explicitly set, calculated from release_date.
modified_timestamp: UNIX timestamp of the moment the video was last modified.
modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
If not explicitly set, calculated from modified_timestamp
'private', 'premium_only', 'subscriber_only', 'needs_auth',
'unlisted' or 'public'. Use 'InfoExtractor._availability'
to set it
+ media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
_old_archive_ids: A list of old archive ids needed for backward compatibility
_format_sort_fields: A list of fields to use for sorting formats
__post_extractor: A function to be called just before the metadata is
and compilations).
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
- release_year: Year (YYYY) when the album was released.
composer: Composer of the piece
The following fields should only be set for clips that should be cut from the original video:
except UnsupportedError:
raise
except ExtractorError as e:
- e.video_id = e.video_id or self.get_temp_id(url),
- e.ie = e.ie or self.IE_NAME,
+ # No trailing commas here: a trailing comma would store a 1-tuple on the error
+ # instead of the plain video id / extractor name
+ e.video_id = e.video_id or self.get_temp_id(url)
+ e.ie = e.ie or self.IE_NAME
e.traceback = e.traceback or sys.exc_info()[2]
raise
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
@staticmethod
def __can_accept_status_code(err, expected_status):
+ """
+ Report whether the status of the HTTP error `err` is accepted by `expected_status`.
+
+ `expected_status` may be None (nothing is accepted), a callable that is given
+ the status and must return exactly True to accept it, or one or more status
+ values, which are normalized with variadic() before the membership test.
+ """
- assert isinstance(err, urllib.error.HTTPError)
+ assert isinstance(err, HTTPError)
if expected_status is None:
return False
elif callable(expected_status):
- return expected_status(err.code) is True
+ return expected_status(err.status) is True
else:
- return err.code in variadic(expected_status)
+ return err.status in variadic(expected_status)
def _create_request(self, url_or_request, data=None, headers=None, query=None):
+ """
+ Return a networking Request for `url_or_request` with `data`, `headers` and `query` applied.
+
+ A urllib.request.Request is still accepted, but triggers a deprecation warning
+ and is converted with urllib_req_to_req(). Anything else that is not already
+ a Request is passed to the Request constructor. A Request given by the caller
+ is updated through its update() method and returned as the same object.
+ """
if isinstance(url_or_request, urllib.request.Request):
- return update_Request(url_or_request, data=data, headers=headers, query=query)
- if query:
- url_or_request = update_url_query(url_or_request, query)
- return sanitized_Request(url_or_request, data, headers or {})
+ self._downloader.deprecation_warning(
+ 'Passing a urllib.request.Request to _create_request() is deprecated. '
+ 'Use yt_dlp.networking.common.Request instead.')
+ url_or_request = urllib_req_to_req(url_or_request)
+ elif not isinstance(url_or_request, Request):
+ url_or_request = Request(url_or_request)
+
+ url_or_request.update(data=data, headers=headers, query=query)
+ return url_or_request
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
"""
try:
return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
except network_exceptions as err:
- if isinstance(err, urllib.error.HTTPError):
+ if isinstance(err, HTTPError):
if self.__can_accept_status_code(err, expected_status):
- # Retain reference to error to prevent file object from
- # being closed before it can be read. Works around the
- # effects of <https://bugs.python.org/issue15002>
- # introduced in Python 3.4.1.
- err.fp._error = err
- return err.fp
+ return err.response
if errnote is False:
return False
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
if self.get_param('dump_intermediate_pages', False):
- self.to_screen('Dumping request to ' + urlh.geturl())
+ self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.geturl(), video_id)
+ filename = self._request_dump_filename(urlh.url, video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.full_url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
while True:
try:
return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
try_count += 1
if try_count >= tries:
raise e
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
rectx = re.escape(context_name)
- FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
+ FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
js, arg_keys, arg_vals = self._search_regex(
(rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
return []
manifest, urlh = res
- manifest_url = urlh.geturl()
+ manifest_url = urlh.url
return self._parse_f4m_formats(
manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
return [], {}
m3u8_doc, urlh = res
- m3u8_url = urlh.geturl()
+ m3u8_url = urlh.url
return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
mpd_url, video_id,
note='Downloading MPD VOD manifest' if note is None else note,
errnote='Failed to download VOD manifest' if errnote is None else errnote,
- fatal=False, data=data, headers=headers, query=query) or {}
+ fatal=False, data=data, headers=headers, query=query)
+ if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
+ return None
return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
@staticmethod
if res is False:
assert not fatal
return [], {}
-
smil, urlh = res
- smil_url = urlh.geturl()
- namespace = self._parse_smil_namespace(smil)
-
- fmts = self._parse_smil_formats(
- smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subs = self._parse_smil_subtitles(
- smil, namespace=namespace)
-
- return fmts, subs
+ return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
+ namespace=self._parse_smil_namespace(smil))
def _extract_smil_formats(self, *args, **kwargs):
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
return {}
smil, urlh = res
- smil_url = urlh.geturl()
+ smil_url = urlh.url
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
- formats = self._parse_smil_formats(
+ formats, subtitles = self._parse_smil_formats_and_subtitles(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
return self._search_regex(
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ def _parse_smil_formats(self, *args, **kwargs):
+ """
+ Formats-only wrapper around _parse_smil_formats_and_subtitles().
+
+ All arguments are forwarded unchanged. Any subtitles that were found are
+ dropped, which is reported through _report_ignoring_subs('SMIL').
+ """
+ fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('SMIL')
+ return fmts
+
+ def _parse_smil_formats_and_subtitles(
+ self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base = smil_url
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
b = meta.get('base') or meta.get('httpBase')
base = b
break
- formats = []
+ formats, subtitles = [], {}
rtmp_count = 0
http_count = 0
m3u8_count = 0
imgs_count = 0
srcs = set()
- media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+ media = itertools.chain.from_iterable(
+ smil.findall(self._xpath_ns(arg, namespace))
+ for arg in ['.//video', './/audio', './/media'])
for medium in media:
src = medium.get('src')
if not src or src in srcs:
src_url = src_url.strip()
if proto == 'm3u8' or src_ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ self._merge_subtitles(m3u8_subs, target=subtitles)
if len(m3u8_formats) == 1:
m3u8_count += 1
m3u8_formats[0].update({
f4m_url += urllib.parse.urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
elif src_ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src_url, video_id, mpd_id='dash', fatal=False))
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ src_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(mpd_formats)
+ self._merge_subtitles(mpd_subs, target=subtitles)
elif re.search(r'\.ism/[Mm]anifest', src_url):
- formats.extend(self._extract_ism_formats(
- src_url, video_id, ism_id='mss', fatal=False))
+ ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
+ src_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(ism_formats)
+ self._merge_subtitles(ism_subs, target=subtitles)
elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'format_note': 'SMIL storyboards',
})
- return formats
+ smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
+ self._merge_subtitles(smil_subs, target=subtitles)
+
+ return formats, subtitles
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
urls = []
return []
xspf, urlh = res
- xspf_url = urlh.geturl()
+ xspf_url = urlh.url
return self._parse_xspf(
xspf, playlist_id, xspf_url=xspf_url,
self._report_ignoring_subs('DASH')
return fmts
- def _extract_mpd_formats_and_subtitles(
+ def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
+ """
+ Extract the periods of an MPD manifest with _extract_mpd_periods() and
+ merge them into a single (formats, subtitles) pair with _merge_mpd_periods().
+ All arguments are forwarded unchanged to _extract_mpd_periods().
+ """
+ periods = self._extract_mpd_periods(*args, **kwargs)
+ return self._merge_mpd_periods(periods)
+
+ def _extract_mpd_periods(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return [], {}
+ return []
mpd_doc, urlh = res
if mpd_doc is None:
- return [], {}
+ return []
# We could have been redirected to a new url when we retrieved our mpd file.
- mpd_url = urlh.geturl()
+ mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
- return self._parse_mpd_formats_and_subtitles(
- mpd_doc, mpd_id, mpd_base_url, mpd_url)
+ return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
self._report_ignoring_subs('DASH')
return fmts
- def _parse_mpd_formats_and_subtitles(
- self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
+ def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
+ """
+ Parse the periods of an MPD document with _parse_mpd_periods() and
+ merge them into a single (formats, subtitles) pair with _merge_mpd_periods().
+ All arguments are forwarded unchanged to _parse_mpd_periods().
+ """
+ periods = self._parse_mpd_periods(*args, **kwargs)
+ return self._merge_mpd_periods(periods)
+
+ def _merge_mpd_periods(self, periods):
+ """
+ Combine the formats and subtitles of all periods of an MPD manifest
+ into a single list of formats and a single subtitles dict,
+ by concatenating the fragments of streams with matching formats.
+
+ Each item of `periods` must provide 'formats' (an iterable of format dicts)
+ and 'subtitles' (a mapping of language to a list of subtitle entries).
+ Every format dict is flagged in place with 'is_dash_periods', and the first
+ format of each stream has the fragments of later periods appended to it.
+ Returns a tuple (formats, subtitles).
+ """
+ formats, subtitles = {}, {}
+ for period in periods:
+ for f in period['formats']:
+ assert 'is_dash_periods' not in f, 'format already processed'
+ f['is_dash_periods'] = True
+ # Two formats count as the same stream when all their values match,
+ # apart from format_id, fragments and manifest_stream_number
+ # NOTE(review): the key is built from values only and used as a dict key,
+ # so every remaining value has to be hashable - confirm for all producers
+ format_key = tuple(v for k, v in f.items() if k not in (
+ ('format_id', 'fragments', 'manifest_stream_number')))
+ if format_key not in formats:
+ formats[format_key] = f
+ elif 'fragments' in f:
+ # Stream already seen in an earlier period: append this period's fragments;
+ # a matching format without fragments contributes nothing
+ formats[format_key].setdefault('fragments', []).extend(f['fragments'])
+
+ # `subtitles` can only be non-empty here if an earlier period contributed some
+ if subtitles and period['subtitles']:
+ self.report_warning(bug_reports_message(
+ 'Found subtitles in multiple periods in the DASH manifest; '
+ 'if part of the subtitles are missing,'
+ ), only_once=True)
+
+ # Subtitle entries of the same language are concatenated across periods
+ for sub_lang, sub_info in period['subtitles'].items():
+ subtitles.setdefault(sub_lang, []).extend(sub_info)
+
+ return list(formats.values()), subtitles
+
+ def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
- formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
- for period in mpd_doc.findall(_add_ns('Period')):
+ for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+ period_entry = {
+ 'id': period.get('id', f'period-{period_idx}'),
+ 'formats': [],
+ 'subtitles': collections.defaultdict(list),
+ }
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
- formats.append(f)
+ period_entry['formats'].append(f)
elif content_type == 'text':
- subtitles.setdefault(lang or 'und', []).append(f)
-
- return formats, subtitles
+ period_entry['subtitles'][lang or 'und'].append(f)
+ yield period_entry
def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
if ism_doc is None:
return [], {}
- return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
+ return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""