import base64
import collections
+import functools
import getpass
import hashlib
import http.client
import os
import random
import re
+import subprocess
import sys
import time
import types
import urllib.request
import xml.etree.ElementTree
-from ..compat import functools # isort: split
-from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
+from ..compat import (
+ compat_etree_fromstring,
+ compat_expanduser,
+ compat_os_name,
+ urllib_req_to_req,
+)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
+from ..downloader.hls import HlsFD
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import (
+ HTTPError,
+ IncompleteRead,
+ network_exceptions,
+)
+from ..networking.impersonate import ImpersonateTarget
from ..utils import (
IDENTITY,
JSON_LD_RE,
FormatSorter,
GeoRestrictedError,
GeoUtils,
- HEADRequest,
LenientJSONDecoder,
+ Popen,
RegexNotFoundError,
RetryManager,
UnsupportedError,
determine_ext,
dict_get,
encode_data_uri,
- error_to_compat_str,
extract_attributes,
filter_dict,
fix_xml_ampersands,
join_nonempty,
js_to_json,
mimetype2ext,
- network_exceptions,
+ netrc_from_content,
orderedSet,
parse_bitrate,
parse_codecs,
parse_resolution,
sanitize_filename,
sanitize_url,
- sanitized_Request,
smuggle_url,
str_or_none,
str_to_int,
unescapeHTML,
unified_strdate,
unified_timestamp,
- update_Request,
- update_url_query,
url_basename,
url_or_none,
urlhandle_detect_ext,
Automatically calculated from width and height
* dynamic_range The dynamic range of the video. One of:
"SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
- * tbr Average bitrate of audio and video in KBit/s
- * abr Average audio bitrate in KBit/s
+ * tbr Average bitrate of audio and video in kbps (1000 bits/sec)
+ * abr Average audio bitrate in kbps (1000 bits/sec)
* acodec Name of the audio codec in use
* asr Audio sampling rate in Hertz
* audio_channels Number of audio channels
- * vbr Average video bitrate in KBit/s
+ * vbr Average video bitrate in kbps (1000 bits/sec)
* fps Frame rate
* vcodec Name of the video codec in use
* container Name of the container format
width : height ratio as float.
* no_resume The server does not support resuming the
(HTTP or RTMP) download. Boolean.
- * has_drm The format has DRM and cannot be downloaded. Boolean
+ * has_drm True if the format has DRM and cannot be downloaded.
+ 'maybe' if the format may have DRM and has to be tested before download.
* extra_param_to_segment_url A query string to append to each
fragment's URL, or to update each existing query string
- with. Only applied by the native HLS/DASH downloaders.
+ with. If it is an HLS stream with an AES-128 decryption key,
+ the query parameters will be passed to the key URI as well,
+ unless there is an `extra_param_to_key_url` given,
+ or unless an external key URI is provided via `hls_aes`.
+ Only applied by the native HLS/DASH downloaders.
+ * extra_param_to_key_url A query string to append to the URL
+ of the format's HLS AES-128 decryption key.
+ Only applied by the native HLS downloader.
* hls_aes A dictionary of HLS AES-128 decryption information
used by the native HLS downloader to override the
values in the media playlist when an '#EXT-X-KEY' tag
* downloader_options A dictionary of downloader options
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
- * ffmpeg_args Extra arguments for ffmpeg downloader
+ * ffmpeg_args Extra arguments for ffmpeg downloader (input)
+ * ffmpeg_args_out Extra arguments for ffmpeg downloader (output)
+ * is_dash_periods Whether the format is a result of merging
+ multiple DASH periods.
RTMP formats can also have the additional fields: page_url,
app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
rtmp_protocol, rtmp_real_time
direct: True if a direct video file was given (must only be set by GenericIE)
alt_title: A secondary title of the video.
- display_id An alternative identifier for the video, not necessarily
+ display_id: An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats",
and display_id "dancing-naked-mole-rats"
description: Full video description.
uploader: Full name of the video uploader.
license: License name the video is licensed under.
- creator: The creator of the video.
+ creators: List of creators of the video.
timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date in UTC (YYYYMMDD).
If not explicitly set, calculated from timestamp
If it is not clear whether to use timestamp or this, use the former
release_date: The date (YYYYMMDD) when the video was released in UTC.
If not explicitly set, calculated from release_timestamp
+ release_year: Year (YYYY) as integer when the video or album was released.
+ To be used if no exact release date is known.
+ If not explicitly set, calculated from release_date.
modified_timestamp: UNIX timestamp of the moment the video was last modified.
modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
If not explicitly set, calculated from modified_timestamp
channel_id: Id of the channel.
channel_url: Full URL to a channel webpage.
channel_follower_count: Number of followers of the channel.
+ channel_is_verified: Whether the channel is verified on the platform.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
* "author" - human-readable name of the comment author
* "author_id" - user ID of the comment author
* "author_thumbnail" - The thumbnail of the comment author
+ * "author_url" - The url to the comment author's page
+ * "author_is_verified" - Whether the author is verified
+ on the platform
+ * "author_is_uploader" - Whether the comment is made by
+ the video uploader
* "id" - Comment ID
* "html" - Comment as HTML
* "text" - Plain text of the comment
* "dislike_count" - Number of negative ratings of the comment
* "is_favorited" - Whether the comment is marked as
favorite by the video uploader
- * "author_is_uploader" - Whether the comment is made by
- the video uploader
+ * "is_pinned" - Whether the comment is pinned to
+ the top of the comments
age_limit: Age restriction for the video, as an integer (years)
webpage_url: The URL to the video webpage, if given to yt-dlp it
should allow to get the same result again. (It will be set
* "start_time" - The start time of the chapter in seconds
* "end_time" - The end time of the chapter in seconds
* "title" (optional, string)
+ heatmap: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the data point in seconds
+ * "end_time" - The end time of the data point in seconds
+ * "value" - The normalized value of the data point (float between 0 and 1)
playable_in_embed: Whether this video is allowed to play in embedded
players on other sites. Can be True (=always allowed),
False (=never allowed), None (=unknown), or a string
'private', 'premium_only', 'subscriber_only', 'needs_auth',
'unlisted' or 'public'. Use 'InfoExtractor._availability'
to set it
+ media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
_old_archive_ids: A list of old archive ids needed for backward compatibility
_format_sort_fields: A list of fields to use for sorting formats
__post_extractor: A function to be called just before the metadata is
track_number: Number of the track within an album or a disc, as an integer.
track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
as a unicode string.
- artist: Artist(s) of the track.
- genre: Genre(s) of the track.
+ artists: List of artists of the track.
+ composers: List of composers of the piece.
+ genres: List of genres of the track.
album: Title of the album the track belongs to.
album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
- album_artist: List of all artists appeared on the album (e.g.
- "Ash Borer / Fell Voices" or "Various Artists", useful for splits
- and compilations).
+ album_artists: List of all artists who appeared on the album.
+ E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
+ Useful for splits and compilations.
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
- release_year: Year (YYYY) when the album was released.
- composer: Composer of the piece
The following fields should only be set for clips that should be cut from the original video:
rows: Number of rows in each storyboard fragment, as an integer
columns: Number of columns in each storyboard fragment, as an integer
+ The following fields are deprecated and should not be set by new code:
+ composer: Use "composers" instead.
+ Composer(s) of the piece, comma-separated.
+ artist: Use "artists" instead.
+ Artist(s) of the track, comma-separated.
+ genre: Use "genres" instead.
+ Genre(s) of the track, comma-separated.
+ album_artist: Use "album_artists" instead.
+ All artists who appeared on the album, comma-separated.
+ creator: Use "creators" instead.
+ The creator of the video.
+
Unless mentioned otherwise, the fields should be Unicode strings.
Unless mentioned otherwise, None is equivalent to absence of information.
Subclasses of this should also be added to the list of extractors and
- should define a _VALID_URL regexp and, re-define the _real_extract() and
- (optionally) _real_initialize() methods.
+ should define _VALID_URL as a regexp or a Sequence of regexps, and
+ re-define the _real_extract() and (optionally) _real_initialize() methods.
Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs
_EMBED_REGEX = []
def _login_hint(self, method=NO_DEFAULT, netrc=None):
- password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+ password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
return {
None: '',
'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- return cls._VALID_URL_RE.match(url)
+ cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+ return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
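+ # Illustrative sketch (not part of the patch; ExampleIE is hypothetical):
+ # with this change a subclass may declare _VALID_URL as a single pattern
+ # or as a sequence of patterns, e.g.
+ #
+ #     class ExampleIE(InfoExtractor):
+ #         _VALID_URL = (
+ #             r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
+ #             r'https?://embed\.example\.com/v/(?P<id>\d+)',
+ #         )
+ #
+ # Each pattern is compiled once per class and the first match wins, so
+ # suitable() and _match_id() keep working unchanged.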
@classmethod
def suitable(cls, url):
except UnsupportedError:
raise
except ExtractorError as e:
- e.video_id = e.video_id or self.get_temp_id(url),
- e.ie = e.ie or self.IE_NAME,
+ e.video_id = e.video_id or self.get_temp_id(url)
+ e.ie = e.ie or self.IE_NAME
e.traceback = e.traceback or sys.exc_info()[2]
raise
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if self._x_forwarded_for_ip:
self.report_warning(
- 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
- % (self._x_forwarded_for_ip, country_code.upper()))
+ 'Video is geo restricted. Retrying extraction with fake IP '
+ f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.')
return True
return False
@staticmethod
def __can_accept_status_code(err, expected_status):
- assert isinstance(err, urllib.error.HTTPError)
+ assert isinstance(err, HTTPError)
if expected_status is None:
return False
elif callable(expected_status):
- return expected_status(err.code) is True
+ return expected_status(err.status) is True
else:
- return err.code in variadic(expected_status)
+ return err.status in variadic(expected_status)
- def _create_request(self, url_or_request, data=None, headers=None, query=None):
+ def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None):
if isinstance(url_or_request, urllib.request.Request):
- return update_Request(url_or_request, data=data, headers=headers, query=query)
- if query:
- url_or_request = update_url_query(url_or_request, query)
- return sanitized_Request(url_or_request, data, headers or {})
+ self._downloader.deprecation_warning(
+ 'Passing a urllib.request.Request to _create_request() is deprecated. '
+ 'Use yt_dlp.networking.common.Request instead.')
+ url_or_request = urllib_req_to_req(url_or_request)
+ elif not isinstance(url_or_request, Request):
+ url_or_request = Request(url_or_request)
+ url_or_request.update(data=data, headers=headers, query=query, extensions=extensions)
+ return url_or_request
+
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None,
+ headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False):
"""
Return the response handle.
if not self._downloader._first_webpage_request:
sleep_interval = self.get_param('sleep_interval_requests') or 0
if sleep_interval > 0:
- self.to_screen('Sleeping %s seconds ...' % sleep_interval)
+ self.to_screen(f'Sleeping {sleep_interval} seconds ...')
time.sleep(sleep_interval)
else:
self._downloader._first_webpage_request = False
headers = (headers or {}).copy()
headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
+ extensions = {}
+
+ if impersonate in (True, ''):
+ impersonate = ImpersonateTarget()
+ requested_targets = [
+ t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t)
+ for t in variadic(impersonate)
+ ] if impersonate else []
+
+ available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None)
+ if available_target:
+ extensions['impersonate'] = available_target
+ elif requested_targets:
+ message = 'The extractor is attempting impersonation, but '
+ message += (
+ 'no impersonate target is available' if not str(impersonate)
+ else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"')
+ info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation '
+ 'for information on installing the required dependencies')
+ if require_impersonation:
+ raise ExtractorError(f'{message}; {info_msg}', expected=True)
+ self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True)
+
try:
- return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
+ return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions))
except network_exceptions as err:
- if isinstance(err, urllib.error.HTTPError):
+ if isinstance(err, HTTPError):
if self.__can_accept_status_code(err, expected_status):
- # Retain reference to error to prevent file object from
- # being closed before it can be read. Works around the
- # effects of <https://bugs.python.org/issue15002>
- # introduced in Python 3.4.1.
- err.fp._error = err
- return err.fp
+ return err.response
if errnote is False:
return False
if errnote is None:
errnote = 'Unable to download webpage'
- errmsg = f'{errnote}: {error_to_compat_str(err)}'
+ errmsg = f'{errnote}: {err}'
if fatal:
raise ExtractorError(errmsg, cause=err)
else:
return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
- encoding=None, data=None, headers={}, query={}, expected_status=None):
+ encoding=None, data=None, headers={}, query={}, expected_status=None,
+ impersonate=None, require_impersonation=False):
"""
Return a tuple (page content as string, URL handle).
Arguments:
url_or_request -- plain text URL as a string or
- a urllib.request.Request object
+ a yt_dlp.networking.Request object
video_id -- Video/playlist/item identifier (string)
Keyword arguments:
returning True if it should be accepted
Note that this argument does not affect success status codes (2xx)
which are always accepted.
+ impersonate -- the impersonate target. Can be any of the following entities:
+ - an instance of yt_dlp.networking.impersonate.ImpersonateTarget
+ - a string in the format of CLIENT[:OS]
+ - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances
+ - a boolean value; True means any impersonate target is sufficient
+ require_impersonation -- flag to toggle whether the request should raise an error
+ if impersonation is not possible (bool, default: False)
"""
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, str):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data,
+ headers=headers, query=query, expected_status=expected_status,
+ impersonate=impersonate, require_impersonation=require_impersonation)
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+ encoding=encoding, data=data)
return (content, urlh)
@staticmethod
r'<iframe src="([^"]+)"', content,
'Websense information URL', default=None)
if blocked_iframe:
- msg += ' Visit %s for more details' % blocked_iframe
+ msg += f' Visit {blocked_iframe} for more details'
raise ExtractorError(msg, expected=True)
if '<title>The URL you requested has been blocked</title>' in first_block:
msg = (
r'</h1><p>(.*?)</p>',
content, 'block message', default=None)
if block_msg:
- msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+ msg += ' (Message: "{}")'.format(block_msg.replace('\n', ' '))
raise ExtractorError(msg, expected=True)
if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
and 'blocklist.rkn.gov.ru' in content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, video_id, data=None):
+ if data is not None:
+ data = hashlib.md5(data).hexdigest()
+ basen = join_nonempty(video_id, data, url, delim='_')
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
- h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+ h = '___' + hashlib.md5(basen.encode()).hexdigest()
basen = basen[:trim_length - len(h)] + h
filename = sanitize_filename(f'{basen}.dump', restricted=True)
# Working around MAX_PATH limitation on Windows (see
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+ prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
if self.get_param('dump_intermediate_pages', False):
- self.to_screen('Dumping request to ' + urlh.geturl())
+ self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.geturl(), video_id)
+ if isinstance(url_or_request, Request):
+ data = self._create_request(url_or_request, data).data
+ filename = self._request_dump_filename(urlh.url, video_id, data)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
if transform_source:
xml_string = transform_source(xml_string)
try:
- return compat_etree_fromstring(xml_string.encode('utf-8'))
+ return compat_etree_fromstring(xml_string.encode())
except xml.etree.ElementTree.ParseError as ve:
self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
return getattr(ie, parser)(content, *args, **kwargs)
def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
+ impersonate=None, require_impersonation=False):
res = self._download_webpage_handle(
url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query, expected_status=expected_status)
+ data=data, headers=headers, query=query, expected_status=expected_status,
+ impersonate=impersonate, require_impersonation=require_impersonation)
if res is False:
return res
content, urlh = res
return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None,
+ impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.full_url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
'headers': headers,
'query': query,
'expected_status': expected_status,
+ 'impersonate': impersonate,
+ 'require_impersonation': require_impersonation,
}
if parser is None:
kwargs.pop('transform_source')
while True:
try:
return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
try_count += 1
if try_count >= tries:
raise e
def report_extraction(self, id_or_name):
"""Report information extraction."""
- self.to_screen('%s: Extracting information' % id_or_name)
+ self.to_screen(f'{id_or_name}: Extracting information')
def report_download_webpage(self, video_id):
"""Report webpage download."""
- self.to_screen('%s: Downloading webpage' % video_id)
+ self.to_screen(f'{video_id}: Downloading webpage')
def report_age_confirmation(self):
"""Report attempt to confirm age."""
elif default is not NO_DEFAULT:
return default
elif fatal:
- raise RegexNotFoundError('Unable to extract %s' % _name)
+ raise RegexNotFoundError(f'Unable to extract {_name}')
else:
- self.report_warning('unable to extract %s' % _name + bug_reports_message())
+ self.report_warning(f'unable to extract {_name}' + bug_reports_message())
return None
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
return clean_html(res)
def _get_netrc_login_info(self, netrc_machine=None):
- username = None
- password = None
netrc_machine = netrc_machine or self._NETRC_MACHINE
- if self.get_param('usenetrc', False):
- try:
- netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
- if os.path.isdir(netrc_file):
- netrc_file = os.path.join(netrc_file, '.netrc')
- info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
- if info is not None:
- username = info[0]
- password = info[2]
- else:
- raise netrc.NetrcParseError(
- 'No authenticators for %s' % netrc_machine)
- except (OSError, netrc.NetrcParseError) as err:
- self.report_warning(
- 'parsing .netrc: %s' % error_to_compat_str(err))
+ cmd = self.get_param('netrc_cmd')
+ if cmd:
+ cmd = cmd.replace('{}', netrc_machine)
+ self.to_screen(f'Executing command: {cmd}')
+ stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
+ if ret != 0:
+ raise OSError(f'Command returned error code {ret}')
+ info = netrc_from_content(stdout).authenticators(netrc_machine)
+
+ elif self.get_param('usenetrc', False):
+ netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+ if os.path.isdir(netrc_file):
+ netrc_file = os.path.join(netrc_file, '.netrc')
+ info = netrc.netrc(netrc_file).authenticators(netrc_machine)
- return username, password
+ else:
+ return None, None
+ if not info:
+ self.to_screen(f'No authenticators for {netrc_machine}')
+ return None, None
+
+ self.write_debug(f'Using netrc for {netrc_machine} authentication')
+ return info[0], info[2]
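+ # Usage sketch for the netrc_cmd branch above (assumed shell setup; the
+ # 'pass' password manager is only an example): the command must print
+ # netrc-formatted credentials on stdout, with '{}' replaced by the
+ # machine name, e.g.
+ #
+ #     yt-dlp --netrc-cmd 'pass show yt-dlp/{}' URL
+ #
+ # where the stored secret is a line such as
+ #
+ #     machine <machine> login myuser password mypass
+ #
+ # netrc_from_content(...).authenticators(...) then returns
+ # ('myuser', None, 'mypass') and this method yields ('myuser', 'mypass').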
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
"""
Get the login info as (username, password)
First look for the manually specified credentials using username_option
and password_option as keys in params dictionary. If no such credentials
- available look in the netrc file using the netrc_machine or _NETRC_MACHINE
- value.
+ are available, run the netrc_cmd if it is defined, or look in the
+ netrc file using the netrc_machine or _NETRC_MACHINE value.
If there's no info available, return (None, None)
"""
- # Attempt to use provided username and password or .netrc data
username = self.get_param(username_option)
if username is not None:
password = self.get_param(password_option)
else:
- username, password = self._get_netrc_login_info(netrc_machine)
-
+ try:
+ username, password = self._get_netrc_login_info(netrc_machine)
+ except (OSError, netrc.NetrcParseError) as err:
+ self.report_warning(f'Failed to parse .netrc: {err}')
+ return None, None
return username, password
def _get_tfa_info(self, note='two-factor verification code'):
if tfa is not None:
return tfa
- return getpass.getpass('Type %s and press [Return]: ' % note)
+ return getpass.getpass(f'Type {note} and press [Return]: ')
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
- property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
- % {'prop': re.escape(prop), 'sep': '(?::|[:-])'})
+ property_re = r'(?:name|property)=(?:\'og{sep}{prop}\'|"og{sep}{prop}"|\s*og{sep}{prop}\b)'.format(
+ prop=re.escape(prop), sep='(?::|[:-])')
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@staticmethod
def _meta_regex(prop):
- return r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+ return rf'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?){re.escape(prop)}\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2'''
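+ # For instance, _meta_regex('description') matches either attribute order
+ # (sketch, values illustrative):
+ #
+ #     <meta name="description" content="An example blurb">
+ #     <meta content='An example blurb' property=description>
+ #
+ # capturing the value in the named group 'content'.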
def _og_search_property(self, prop, html, name=None, **kargs):
prop = variadic(prop)
if name is None:
- name = 'OpenGraph %s' % prop[0]
+ name = f'OpenGraph {prop[0]}'
og_regexes = []
for p in prop:
og_regexes.extend(self._og_regexes(p))
elif fatal:
raise RegexNotFoundError('Unable to extract JSON-LD')
else:
- self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ self.report_warning(f'unable to extract JSON-LD {bug_reports_message()}')
return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
}
def is_type(e, *expected_types):
- type = variadic(traverse_obj(e, '@type'))
- return any(x in type for x in expected_types)
+ type_ = variadic(traverse_obj(e, '@type'))
+ return any(x in type_ for x in expected_types)
def extract_interaction_type(e):
interaction_type = e.get('interactionType')
count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
if not count_kind:
continue
- count_key = '%s_count' % count_kind
+ count_key = f'{count_kind}_count'
if info.get(count_key) is not None:
continue
info[count_key] = interaction_count
'end_time': part.get('endOffset'),
} for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
for idx, (last_c, current_c, next_c) in enumerate(zip(
- [{'end_time': 0}] + chapters, chapters, chapters[1:])):
+ [{'end_time': 0}, *chapters], chapters, chapters[1:])):
current_c['end_time'] = current_c['end_time'] or next_c['start_time']
current_c['start_time'] = current_c['start_time'] or last_c['end_time']
if None in current_c.values():
traverse_json_ld(json_ld)
return filter_dict(info)
- def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
- return self._parse_json(
- self._search_regex(
- r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
- webpage, 'next.js data', fatal=fatal, **kw),
- video_id, transform_source=transform_source, fatal=fatal)
+ def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
+ if default == '{}':
+ self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
+ default = {}
+ if default is not NO_DEFAULT:
+ fatal = False
+
+ return self._search_json(
+ r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
+ video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
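+ # The payload this targets typically looks like (illustrative snippet):
+ #
+ #     <script id="__NEXT_DATA__" type="application/json">
+ #         {"props": {"pageProps": {...}}, "page": "/watch/[id]"}
+ #     </script>
+ #
+ # _search_json() extracts the balanced JSON object between the opening
+ # tag and '</script>', so trailing markup after the object is tolerated.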
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
rectx = re.escape(context_name)
- FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
+ FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
js, arg_keys, arg_vals = self._search_regex(
(rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
hidden_inputs = {}
- for input in re.findall(r'(?i)(<input[^>]+>)', html):
- attrs = extract_attributes(input)
- if not input:
+ for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
+ attrs = extract_attributes(input_el)
+ if not input_el:
continue
if attrs.get('type') not in ('hidden', 'submit'):
continue
def _form_hidden_inputs(self, form_id, html):
form = self._search_regex(
- r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
- html, '%s form' % form_id, group='form')
+ rf'(?is)<form[^>]+?id=(["\']){form_id}\1[^>]*>(?P<form>.+?)</form>',
+ html, f'{form_id} form', group='form')
return self._hidden_inputs(form)
@classproperty(cache=True)
formats[:] = filter(
lambda f: self._is_valid_url(
f['url'], video_id,
- item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
+ item='{} video format'.format(f.get('format_id')) if f.get('format_id') else 'video'),
formats)
@staticmethod
def _is_valid_url(self, url, video_id, item='video', headers={}):
url = self._proto_relative_url(url, scheme='http:')
# For now assume non HTTP(S) URLs always valid
- if not (url.startswith('http://') or url.startswith('https://')):
+ if not url.startswith(('http://', 'https://')):
return True
try:
- self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
+ self._request_webpage(url, video_id, f'Checking {item} URL', headers=headers)
return True
except ExtractorError as e:
self.to_screen(
- '%s: %s URL is invalid, skipping: %s'
- % (video_id, item, error_to_compat_str(e.cause)))
+ f'{video_id}: {item} URL is invalid, skipping: {e.cause!s}')
return False
def http_scheme(self):
return []
manifest, urlh = res
- manifest_url = urlh.geturl()
+ manifest_url = urlh.url
return self._parse_f4m_formats(
manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
# currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
if akamai_pv is not None and ';' in akamai_pv.text:
- playerVerificationChallenge = akamai_pv.text.split(';')[0]
- if playerVerificationChallenge.strip() != '':
+ player_verification_challenge = akamai_pv.text.split(';')[0]
+ if player_verification_challenge.strip() != '':
return []
formats = []
if not media_url:
continue
manifest_url = (
- media_url if media_url.startswith('http://') or media_url.startswith('https://')
+ media_url if media_url.startswith(('http://', 'https://'))
else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
# If media_url is itself a f4m manifest do the recursive extraction
# since bitrates in parent manifest (this one) and media_url manifest
def _report_ignoring_subs(self, name):
self.report_warning(bug_reports_message(
f'Ignoring subtitle tracks found in the {name} manifest; '
- 'if any subtitle tracks are missing,'
+ 'if any subtitle tracks are missing,',
), only_once=True)
def _extract_m3u8_formats(self, *args, **kwargs):
return [], {}
m3u8_doc, urlh = res
- m3u8_url = urlh.geturl()
+ m3u8_url = urlh.url
return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
formats, subtitles = [], {}
-
- has_drm = re.search('|'.join([
- r'#EXT-X-FAXS-CM:', # Adobe Flash Access
- r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
- ]), m3u8_doc)
+ has_drm = HlsFD._has_drm(m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
formats = [{
'format_id': join_nonempty(m3u8_id, idx),
'format_index': idx,
- 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
+ 'url': m3u8_url or encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
'has_drm': has_drm,
}
+
+ # YouTube-specific
+ if yt_audio_content_id := last_stream_inf.get('YT-EXT-AUDIO-CONTENT-ID'):
+ f['language'] = yt_audio_content_id.split('.')[0]
+
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
mpd_url, video_id,
note='Downloading MPD VOD manifest' if note is None else note,
errnote='Failed to download VOD manifest' if errnote is None else errnote,
- fatal=False, data=data, headers=headers, query=query) or {}
+ fatal=False, data=data, headers=headers, query=query)
+ if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
+ return None
return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
@staticmethod
if not c or c == '.':
out.append(c)
else:
- out.append('{%s}%s' % (namespace, c))
+ out.append(f'{{{namespace}}}{c}')
return '/'.join(out)
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
if res is False:
assert not fatal
return [], {}
-
smil, urlh = res
- smil_url = urlh.geturl()
- namespace = self._parse_smil_namespace(smil)
-
- fmts = self._parse_smil_formats(
- smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subs = self._parse_smil_subtitles(
- smil, namespace=namespace)
-
- return fmts, subs
+ return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
+ namespace=self._parse_smil_namespace(smil))
def _extract_smil_formats(self, *args, **kwargs):
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
return {}
smil, urlh = res
- smil_url = urlh.geturl()
+ smil_url = urlh.url
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
- formats = self._parse_smil_formats(
+ formats, subtitles = self._parse_smil_formats_and_subtitles(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
return self._search_regex(
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ def _parse_smil_formats(self, *args, **kwargs):
+ fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('SMIL')
+ return fmts
+
+ def _parse_smil_formats_and_subtitles(
+ self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base = smil_url
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
b = meta.get('base') or meta.get('httpBase')
base = b
break
- formats = []
+ formats, subtitles = [], {}
rtmp_count = 0
http_count = 0
m3u8_count = 0
imgs_count = 0
srcs = set()
- media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+ media = itertools.chain.from_iterable(
+ smil.findall(self._xpath_ns(arg, namespace))
+ for arg in ['.//video', './/audio', './/media'])
for medium in media:
src = medium.get('src')
if not src or src in srcs:
})
continue
- src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
+ src_url = src if src.startswith('http') else urllib.parse.urljoin(f'{base}/', src)
src_url = src_url.strip()
if proto == 'm3u8' or src_ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ self._merge_subtitles(m3u8_subs, target=subtitles)
if len(m3u8_formats) == 1:
m3u8_count += 1
m3u8_formats[0].update({
f4m_url += urllib.parse.urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
elif src_ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src_url, video_id, mpd_id='dash', fatal=False))
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ src_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(mpd_formats)
+ self._merge_subtitles(mpd_subs, target=subtitles)
elif re.search(r'\.ism/[Mm]anifest', src_url):
- formats.extend(self._extract_ism_formats(
- src_url, video_id, ism_id='mss', fatal=False))
+ ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
+ src_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(ism_formats)
+ self._merge_subtitles(ism_subs, target=subtitles)
elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
imgs_count += 1
formats.append({
- 'format_id': 'imagestream-%d' % (imgs_count),
+ 'format_id': f'imagestream-{imgs_count}',
'url': src,
'ext': mimetype2ext(medium.get('type')),
'acodec': 'none',
'format_note': 'SMIL storyboards',
})
- return formats
+ smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
+ self._merge_subtitles(smil_subs, target=subtitles)
+
+ return formats, subtitles
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
urls = []
subtitles = {}
- for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+ for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
src = textstream.get('src')
if not src or src in urls:
continue
return []
xspf, urlh = res
- xspf_url = urlh.geturl()
+ xspf_url = urlh.url
return self._parse_xspf(
xspf, playlist_id, xspf_url=xspf_url,
self._report_ignoring_subs('DASH')
return fmts
- def _extract_mpd_formats_and_subtitles(
+ def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
+ periods = self._extract_mpd_periods(*args, **kwargs)
+ return self._merge_mpd_periods(periods)
+
+ def _extract_mpd_periods(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
errnote='Failed to download MPD manifest' if errnote is None else errnote,
fatal=fatal, data=data, headers=headers, query=query)
if res is False:
- return [], {}
+ return []
mpd_doc, urlh = res
if mpd_doc is None:
- return [], {}
+ return []
# We could have been redirected to a new url when we retrieved our mpd file.
- mpd_url = urlh.geturl()
+ mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
- return self._parse_mpd_formats_and_subtitles(
- mpd_doc, mpd_id, mpd_base_url, mpd_url)
+ return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, *args, **kwargs):
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
self._report_ignoring_subs('DASH')
return fmts
- def _parse_mpd_formats_and_subtitles(
- self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
+ def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
+ periods = self._parse_mpd_periods(*args, **kwargs)
+ return self._merge_mpd_periods(periods)
+
+ def _merge_mpd_periods(self, periods):
+ """
+ Combine all formats and subtitles from an MPD manifest into a single list,
+ by concatenating streams with similar formats.
+ """
+ formats, subtitles = {}, {}
+ for period in periods:
+ for f in period['formats']:
+ assert 'is_dash_periods' not in f, 'format already processed'
+ f['is_dash_periods'] = True
+ format_key = tuple(v for k, v in f.items() if k not in (
+ 'format_id', 'fragments', 'manifest_stream_number'))
+ if format_key not in formats:
+ formats[format_key] = f
+ elif 'fragments' in f:
+ formats[format_key].setdefault('fragments', []).extend(f['fragments'])
+
+ if subtitles and period['subtitles']:
+ self.report_warning(bug_reports_message(
+ 'Found subtitles in multiple periods in the DASH manifest; '
+ 'if part of the subtitles are missing,',
+ ), only_once=True)
+
+ for sub_lang, sub_info in period['subtitles'].items():
+ subtitles.setdefault(sub_lang, []).extend(sub_info)
+
+ return list(formats.values()), subtitles
+
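+ # Merging sketch (illustrative values): formats from different periods
+ # that agree on every field outside the grouping exclusions above are
+ # concatenated into one format spanning all periods:
+ #
+ #     period-0: {'format_id': 'dash-1', 'fragments': [f1, f2], ...}
+ #     period-1: {'format_id': 'dash-1', 'fragments': [f3], ...}
+ #     merged:   {'fragments': [f1, f2, f3], 'is_dash_periods': True, ...}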
+ def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
- formats, subtitles = [], {}
stream_numbers = collections.defaultdict(int)
- for period in mpd_doc.findall(_add_ns('Period')):
+ for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
+ period_entry = {
+ 'id': period.get('id', f'period-{period_idx}'),
+ 'formats': [],
+ 'subtitles': collections.defaultdict(list),
+ }
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
content_type = 'text'
else:
- self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
+ self.report_warning(f'Unknown MIME type {mime_type} in DASH manifest')
continue
base_url = ''
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
- 'format_note': 'DASH %s' % content_type,
+ 'format_note': f'DASH {content_type}',
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
- **codecs
+ **codecs,
}
elif content_type == 'text':
f = {
t += c
# Next, $...$ templates are translated to their
# %(...) counterparts to be used with % operator
- t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
- t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+ t = re.sub(r'\$({})\$'.format('|'.join(identifiers)), r'%(\1)d', t)
+ t = re.sub(r'\$({})%([^$]+)\$'.format('|'.join(identifiers)), r'%(\1)\2', t)
- t.replace('$$', '$')
+ t = t.replace('$$', '$')  # str.replace returns a new string; '$$' escapes a literal '$'
return t
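+ # Worked example of the translation above (illustrative templates):
+ #
+ #     'seg-$Number$.m4s'     -> 'seg-%(Number)d.m4s'
+ #     'seg-$Number%05d$.m4s' -> 'seg-%(Number)05d.m4s'
+ #
+ # so that `t % {'Number': 42}` yields 'seg-42.m4s' and 'seg-00042.m4s'
+ # respectively.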
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
- for num, s in enumerate(representation_ms_info['s']):
+ for s in representation_ms_info['s']:
segment_time = s.get('t') or segment_time
segment_d = s['d']
add_segment_url()
segment_number += 1
- for r in range(s.get('r', 0)):
+ for _ in range(s.get('r', 0)):
segment_time += segment_d
add_segment_url()
segment_number += 1
timescale = representation_ms_info['timescale']
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
- for r in range(s.get('r', 0) + 1):
+ for _ in range(s.get('r', 0) + 1):
segment_uri = representation_ms_info['segment_urls'][segment_index]
fragments.append({
location_key(segment_uri): segment_uri,
if content_type in ('video', 'audio', 'image/jpeg'):
f['manifest_stream_number'] = stream_numbers[f['url']]
stream_numbers[f['url']] += 1
- formats.append(f)
+ period_entry['formats'].append(f)
elif content_type == 'text':
- subtitles.setdefault(lang or 'und', []).append(f)
-
- return formats, subtitles
+ period_entry['subtitles'][lang or 'und'].append(f)
+ yield period_entry
def _extract_ism_formats(self, *args, **kwargs):
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
if ism_doc is None:
return [], {}
- return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
+ return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""
fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
# TODO: add support for WVC1 and WMAP
if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
- self.report_warning('%s is not a supported codec' % fourcc)
+ self.report_warning(f'{fourcc} is not a supported codec')
continue
tbr = int(track.attrib['Bitrate']) // 1000
# [1] does not mention Width and Height attributes. However,
'fourcc': fourcc,
'language': stream_language,
'codec_private_data': track.get('CodecPrivateData'),
- }
+ },
})
elif stream_type in ('video', 'audio'):
formats.append({
'protocol': 'ism',
'fragments': fragments,
'has_drm': ism_doc.find('Protection') is not None,
+ 'language': stream_language,
+ 'audio_channels': int_or_none(track.get('Channels')),
'_download_params': {
'stream_type': stream_type,
'duration': duration,
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
media_tags = [(media_tag, media_tag_name, media_type, '')
for media_tag, media_tag_name, media_type
- in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
+ in re.findall(rf'(?s)(<({_MEDIA_TAG_NAME_RE})[^>]*/>)', webpage)]
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
# https://github.com/ytdl-org/youtube-dl/issues/11979,
# e.g. http://www.porntrex.com/maps/videositemap.xml).
- r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
+ rf'(?s)(<(?P<tag>{_MEDIA_TAG_NAME_RE})(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, _, media_type, media_content in media_tags:
media_info = {
'formats': [],
mobj = re.search(
r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
url_base = mobj.group('url')
- http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
+ http_base_url = '{}{}:{}'.format('http', mobj.group('s') or '', url_base)
formats = []
def manifest_url(manifest):
m_url = f'{http_base_url}/{manifest}'
if query:
- m_url += '?%s' % query
+ m_url += f'?{query}'
return m_url
if 'm3u8' not in skip_protocols:
video_id, fatal=False)
for rtmp_format in rtmp_formats:
rtsp_format = rtmp_format.copy()
- rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+ rtsp_format['url'] = '{}/{}'.format(rtmp_format['url'], rtmp_format['play_path'])
del rtsp_format['play_path']
del rtsp_format['ext']
rtsp_format.update({
return formats
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
- mobj = re.search(
- r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
- webpage)
- if mobj:
- try:
- jwplayer_data = self._parse_json(mobj.group('options'),
- video_id=video_id,
- transform_source=transform_source)
- except ExtractorError:
- pass
- else:
- if isinstance(jwplayer_data, dict):
- return jwplayer_data
-
- def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+ return self._search_json(
+ r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
+ webpage, 'JWPlayer data', video_id,
+ # must be a {...} object or, for load(), a comma-separated sequence of them, ending in ')' or ']'
+ contains_pattern=r'\{(?s:.*)}(?(load)(?:\s*,\s*\{(?s:.*)})*)', end_pattern=r'(?(load)\]|\))',
+ transform_source=transform_source, default=None)
+
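+ # Matching sketch (hypothetical embed markup): the pattern covers both
+ # classic setup() calls and playlist load() calls, e.g.
+ #
+ #     jwplayer("player").setup({"playlist": [{"sources": [...]}]});
+ #     jwplayer("player").load([{"file": "https://cdn.example.com/v.m3u8"}]);
+ #
+ # For load(), contains_pattern admits a comma-separated sequence of {...}
+ # objects and end_pattern expects the closing ']' instead of ')'.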
+ def _extract_jwplayer_data(self, webpage, video_id, *args, transform_source=js_to_json, **kwargs):
jwplayer_data = self._find_jwplayer_data(
- webpage, video_id, transform_source=js_to_json)
+ webpage, video_id, transform_source=transform_source)
return self._parse_jwplayer_data(
jwplayer_data, video_id, *args, **kwargs)
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
subtitles = {}
- tracks = video_data.get('tracks')
- if tracks and isinstance(tracks, list):
- for track in tracks:
- if not isinstance(track, dict):
- continue
- track_kind = track.get('kind')
- if not track_kind or not isinstance(track_kind, str):
- continue
- if track_kind.lower() not in ('captions', 'subtitles'):
- continue
- track_url = urljoin(base_url, track.get('file'))
- if not track_url:
- continue
- subtitles.setdefault(track.get('label') or 'en', []).append({
- 'url': self._proto_relative_url(track_url)
- })
+ for track in traverse_obj(video_data, (
+ 'tracks', lambda _, v: v['kind'].lower() in ('captions', 'subtitles'))):
+ track_url = urljoin(base_url, track.get('file'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('label') or 'en', []).append({
+ 'url': self._proto_relative_url(track_url),
+ })
entry = {
'id': this_video_id,
'tbr': int_or_none(source.get('bitrate'), scale=1000),
'filesize': int_or_none(source.get('filesize')),
'ext': ext,
- 'format_id': format_id
+ 'format_id': format_id,
}
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
# of jwplayer.flash.swf
rtmp_url_parts = re.split(
- r'((?:mp4|mp3|flv):)', source_url, 1)
+ r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
if len(rtmp_url_parts) == 3:
rtmp_url, prefix, play_path = rtmp_url_parts
a_format.update({
def _get_cookies(self, url):
""" Return a http.cookies.SimpleCookie with the cookies for the url """
- return LenientSimpleCookie(self._downloader._calc_cookies(url))
+ return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
def _apply_first_set_cookie_header(self, url_handle, cookie):
"""
continue
cookies = cookies.encode('iso-8859-1').decode('utf-8')
cookie_value = re.search(
- r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
+ rf'{cookie}=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', cookies)
if cookie_value:
value, domain = cookie_value.groups()
self._set_cookie(domain, cookie, value)
@classmethod
def is_single_video(cls, url):
"""Returns whether the URL is of a single video, None if unknown"""
- assert cls.suitable(url), 'The URL must be suitable for the extractor'
- return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
+ if cls.suitable(url):
+ return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
@classmethod
def is_suitable(cls, age_limit):
desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
# Escape emojis. Ref: https://github.com/github/markup/issues/1153
- name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
+ name = (' - **{}**'.format(re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME))) if markdown else cls.IE_NAME
return f'{name}:{desc}' if desc else name
def extract_subtitles(self, *args, **kwargs):
self.to_screen(f'Extracted {comment_count} comments')
return {
'comments': comments,
- 'comment_count': None if interrupted else comment_count
+ 'comment_count': None if interrupted else comment_count,
}
return extractor
'start_time': start_function(chapter),
'title': title_function(chapter),
} for chapter in chapter_list or []]
- if not strict:
+ if strict:
+ warn = self.report_warning
+ else:
+ warn = self.write_debug
chapter_list.sort(key=lambda c: c['start_time'] or 0)
chapters = [{'start_time': 0}]
for idx, chapter in enumerate(chapter_list):
if chapter['start_time'] is None:
- self.report_warning(f'Incomplete chapter {idx}')
+ warn(f'Incomplete chapter {idx}')
elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
chapters.append(chapter)
elif chapter not in chapters:
- self.report_warning(
- f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"')
+ issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+ else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+ warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
return chapters[1:]
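+ # Behaviour sketch (illustrative, duration=300): with strict=False the
+ # list is sorted by start time first and problems only go to write_debug;
+ # with strict=True they are reported, e.g.
+ #     Invalid start time (350 > 300) for chapter "Outro"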
def _extract_chapters_from_description(self, description, duration):
@staticmethod
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
- all_known = all(map(
- lambda x: x is not None,
- (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
+ all_known = all(
+ x is not None for x in
+ (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted))
return (
'private' if is_private
else 'premium_only' if needs_premium
@classproperty
def _VALID_URL(cls):
- return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
+ return rf'{cls._SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
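+ # For a search extractor with _SEARCH_KEY = 'ytsearch', for example, this
+ # matches queries such as:
+ #
+ #     'ytsearch:cute cats'    -> prefix='',    query='cute cats'
+ #     'ytsearch5:cute cats'   -> prefix='5',   query='cute cats'
+ #     'ytsearchall:cute cats' -> prefix='all', query='cute cats'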
def _real_extract(self, query):
prefix, query = self._match_valid_url(query).group('prefix', 'query')