jfr.im git - yt-dlp.git/blame_incremental - yt

Commit	Line	Data
	1	import base64
	2	import collections
	3	import getpass
	4	import hashlib
	5	import http.client
	6	import http.cookiejar
	7	import http.cookies
	8	import inspect
	9	import itertools
	10	import json
	11	import math
	12	import netrc
	13	import os
	14	import random
	15	import re
	16	import sys
	17	import time
	18	import types
	19	import urllib.parse
	20	import urllib.request
	21	import xml.etree.ElementTree
	22
	23	from ..compat import functools # isort: split
	24	from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
	25	from ..cookies import LenientSimpleCookie
	26	from ..downloader import FileDownloader
	27	from ..downloader.f4m import get_base_url, remove_encrypted_media
	28	from ..utils import (
	29	IDENTITY,
	30	JSON_LD_RE,
	31	NO_DEFAULT,
	32	ExtractorError,
	33	GeoRestrictedError,
	34	GeoUtils,
	35	LenientJSONDecoder,
	36	RegexNotFoundError,
	37	RetryManager,
	38	UnsupportedError,
	39	age_restricted,
	40	base_url,
	41	bug_reports_message,
	42	classproperty,
	43	clean_html,
	44	determine_ext,
	45	determine_protocol,
	46	dict_get,
	47	encode_data_uri,
	48	error_to_compat_str,
	49	extract_attributes,
	50	filter_dict,
	51	fix_xml_ampersands,
	52	float_or_none,
	53	format_field,
	54	int_or_none,
	55	join_nonempty,
	56	js_to_json,
	57	mimetype2ext,
	58	network_exceptions,
	59	orderedSet,
	60	parse_bitrate,
	61	parse_codecs,
	62	parse_duration,
	63	parse_iso8601,
	64	parse_m3u8_attributes,
	65	parse_resolution,
	66	sanitize_filename,
	67	sanitize_url,
	68	sanitized_Request,
	69	smuggle_url,
	70	str_or_none,
	71	str_to_int,
	72	strip_or_none,
	73	traverse_obj,
	74	try_call,
	75	try_get,
	76	unescapeHTML,
	77	unified_strdate,
	78	unified_timestamp,
	79	update_Request,
	80	update_url_query,
	81	url_basename,
	82	url_or_none,
	83	urljoin,
	84	variadic,
	85	xpath_element,
	86	xpath_text,
	87	xpath_with_ns,
	88	)
	89
	90
	91	class InfoExtractor:
	92	"""Information Extractor class.
	93
	94	Information extractors are the classes that, given a URL, extract
	95	information about the video (or videos) the URL refers to. This
	96	information includes the real video URL, the video title, author and
	97	others. The information is stored in a dictionary which is then
	98	passed to the YoutubeDL. The YoutubeDL processes this
	99	information possibly downloading the video to the file system, among
	100	other possible outcomes.
	101
	102	The _type field determines the type of the result.
	103	By far the most common value (and the default if _type is missing) is
	104	"video", which indicates a single video.
	105
	106	For a video, the dictionaries must include the following fields:
	107
	108	id: Video identifier.
	109	title: Video title, unescaped. Set to an empty string if video has
	110	no title as opposed to "None" which signifies that the
	111	extractor failed to obtain a title
	112
	113	Additionally, it must contain either a formats entry or a url one:
	114
	115	formats: A list of dictionaries for each format available, ordered
	116	from worst to best quality.
	117
	118	Potential fields:
	119	* url The mandatory URL representing the media:
	120	for plain file media - HTTP URL of this file,
	121	for RTMP - RTMP URL,
	122	for HLS - URL of the M3U8 media playlist,
	123	for HDS - URL of the F4M manifest,
	124	for DASH
	125	- HTTP URL to plain file media (in case of
	126	unfragmented media)
	127	- URL of the MPD manifest or base URL
	128	representing the media if MPD manifest
	129	is parsed from a string (in case of
	130	fragmented media)
	131	for MSS - URL of the ISM manifest.
	132	* manifest_url
	133	The URL of the manifest file in case of
	134	fragmented media:
	135	for HLS - URL of the M3U8 master playlist,
	136	for HDS - URL of the F4M manifest,
	137	for DASH - URL of the MPD manifest,
	138	for MSS - URL of the ISM manifest.
	139	* manifest_stream_number (For internal use only)
	140	The index of the stream in the manifest file
	141	* ext Will be calculated from URL if missing
	142	* format A human-readable description of the format
	143	("mp4 container with h264/opus").
	144	Calculated from the format_id, width, height,
	145	and format_note fields if missing.
	146	* format_id A short description of the format
	147	("mp4_h264_opus" or "19").
	148	Technically optional, but strongly recommended.
	149	* format_note Additional info about the format
	150	("3D" or "DASH video")
	151	* width Width of the video, if known
	152	* height Height of the video, if known
	153	* resolution Textual description of width and height
	154	* dynamic_range The dynamic range of the video. One of:
	155	"SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
	156	* tbr Average bitrate of audio and video in KBit/s
	157	* abr Average audio bitrate in KBit/s
	158	* acodec Name of the audio codec in use
	159	* asr Audio sampling rate in Hertz
	160	* audio_channels Number of audio channels
	161	* vbr Average video bitrate in KBit/s
	162	* fps Frame rate
	163	* vcodec Name of the video codec in use
	164	* container Name of the container format
	165	* filesize The number of bytes, if known in advance
	166	* filesize_approx An estimate for the number of bytes
	167	* player_url SWF Player URL (used for rtmpdump).
	168	* protocol The protocol that will be used for the actual
	169	download, lower-case. One of "http", "https" or
	170	one of the protocols defined in downloader.PROTOCOL_MAP
	171	* fragment_base_url
	172	Base URL for fragments. Each fragment's path
	173	value (if present) will be relative to
	174	this URL.
	175	* fragments A list of fragments of a fragmented media.
	176	Each fragment entry must contain either a url
	177	or a path. If a url is present it should be
	178	considered by a client. Otherwise both path and
	179	fragment_base_url must be present. Here is
	180	the list of all potential fields:
	181	* "url" - fragment's URL
	182	* "path" - fragment's path relative to
	183	fragment_base_url
	184	* "duration" (optional, int or float)
	185	* "filesize" (optional, int)
	186	* is_from_start Is a live format that can be downloaded
	187	from the start. Boolean
	188	* preference Order number of this format. If this field is
	189	present and not None, the formats get sorted
	190	by this field, regardless of all other values.
	191	-1 for default (order by other properties),
	192	-2 or smaller for less than default.
	193	< -1000 to hide the format (if there is
	194	another one which is strictly better)
	195	* language Language code, e.g. "de" or "en-US".
	196	* language_preference Is this in the language mentioned in
	197	the URL?
	198	10 if it's what the URL is about,
	199	-1 for default (don't know),
	200	-10 otherwise, other values reserved for now.
	201	* quality Order number of the video quality of this
	202	format, irrespective of the file format.
	203	-1 for default (order by other properties),
	204	-2 or smaller for less than default.
	205	* source_preference Order number for this video source
	206	(quality takes higher priority)
	207	-1 for default (order by other properties),
	208	-2 or smaller for less than default.
	209	* http_headers A dictionary of additional HTTP headers
	210	to add to the request.
	211	* stretched_ratio If given and not 1, indicates that the
	212	video's pixels are not square.
	213	width : height ratio as float.
	214	* no_resume The server does not support resuming the
	215	(HTTP or RTMP) download. Boolean.
	216	* has_drm The format has DRM and cannot be downloaded. Boolean
	217	* downloader_options A dictionary of downloader options
	218	(For internal use only)
	219	* http_chunk_size Chunk size for HTTP downloads
	220	* ffmpeg_args Extra arguments for ffmpeg downloader
	221	RTMP formats can also have the additional fields: page_url,
	222	app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
	223	rtmp_protocol, rtmp_real_time
	224
	225	url: Final video URL.
	226	ext: Video filename extension.
	227	format: The video format, defaults to ext (used for --get-format)
	228	player_url: SWF Player URL (used for rtmpdump).
	229
	230	The following fields are optional:
	231
	232	direct: True if a direct video file was given (must only be set by GenericIE)
	233	alt_title: A secondary title of the video.
	234	display_id: An alternative identifier for the video, not necessarily
	235	unique, but available before title. Typically, id is
	236	something like "4234987", title "Dancing naked mole rats",
	237	and display_id "dancing-naked-mole-rats"
	238	thumbnails: A list of dictionaries, with the following entries:
	239	* "id" (optional, string) - Thumbnail format ID
	240	* "url"
	241	* "preference" (optional, int) - quality of the image
	242	* "width" (optional, int)
	243	* "height" (optional, int)
	244	* "resolution" (optional, string "{width}x{height}",
	245	deprecated)
	246	* "filesize" (optional, int)
	247	* "http_headers" (dict) - HTTP headers for the request
	248	thumbnail: Full URL to a video thumbnail image.
	249	description: Full video description.
	250	uploader: Full name of the video uploader.
	251	license: License name the video is licensed under.
	252	creator: The creator of the video.
	253	timestamp: UNIX timestamp of the moment the video was uploaded
	254	upload_date: Video upload date in UTC (YYYYMMDD).
	255	If not explicitly set, calculated from timestamp
	256	release_timestamp: UNIX timestamp of the moment the video was released.
	257	If it is not clear whether to use timestamp or this, use the former
	258	release_date: The date (YYYYMMDD) when the video was released in UTC.
	259	If not explicitly set, calculated from release_timestamp
	260	modified_timestamp: UNIX timestamp of the moment the video was last modified.
	261	modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
	262	If not explicitly set, calculated from modified_timestamp
	263	uploader_id: Nickname or id of the video uploader.
	264	uploader_url: Full URL to a personal webpage of the video uploader.
	265	channel: Full name of the channel the video is uploaded on.
	266	Note that channel fields may or may not repeat uploader
	267	fields. This depends on a particular extractor.
	268	channel_id: Id of the channel.
	269	channel_url: Full URL to a channel webpage.
	270	channel_follower_count: Number of followers of the channel.
	271	location: Physical location where the video was filmed.
	272	subtitles: The available subtitles as a dictionary in the format
	273	{tag: subformats}. "tag" is usually a language code, and
	274	"subformats" is a list sorted from lower to higher
	275	preference, each element is a dictionary with the "ext"
	276	entry and one of:
	277	* "data": The subtitles file contents
	278	* "url": A URL pointing to the subtitles file
	279	It can optionally also have:
	280	* "name": Name or description of the subtitles
	281	* "http_headers": A dictionary of additional HTTP headers
	282	to add to the request.
	283	"ext" will be calculated from URL if missing
	284	automatic_captions: Like 'subtitles'; contains automatically generated
	285	captions instead of normal subtitles
	286	duration: Length of the video in seconds, as an integer or float.
	287	view_count: How many users have watched the video on the platform.
	288	concurrent_view_count: How many users are currently watching the video on the platform.
	289	like_count: Number of positive ratings of the video
	290	dislike_count: Number of negative ratings of the video
	291	repost_count: Number of reposts of the video
	292	average_rating: Average rating given by users, the scale used depends on the webpage
	293	comment_count: Number of comments on the video
	294	comments: A list of comments, each with one or more of the following
	295	properties (all but one of text or html optional):
	296	* "author" - human-readable name of the comment author
	297	* "author_id" - user ID of the comment author
	298	* "author_thumbnail" - The thumbnail of the comment author
	299	* "id" - Comment ID
	300	* "html" - Comment as HTML
	301	* "text" - Plain text of the comment
	302	* "timestamp" - UNIX timestamp of comment
	303	* "parent" - ID of the comment this one is replying to.
	304	Set to "root" to indicate that this is a
	305	comment to the original video.
	306	* "like_count" - Number of positive ratings of the comment
	307	* "dislike_count" - Number of negative ratings of the comment
	308	* "is_favorited" - Whether the comment is marked as
	309	favorite by the video uploader
	310	* "author_is_uploader" - Whether the comment is made by
	311	the video uploader
	312	age_limit: Age restriction for the video, as an integer (years)
	313	webpage_url: The URL to the video webpage, if given to yt-dlp it
	314	should allow to get the same result again. (It will be set
	315	by YoutubeDL if it's missing)
	316	categories: A list of categories that the video falls in, for example
	317	["Sports", "Berlin"]
	318	tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
	319	cast: A list of the video cast
	320	is_live: True, False, or None (=unknown). Whether this video is a
	321	live stream that goes on instead of a fixed-length video.
	322	was_live: True, False, or None (=unknown). Whether this video was
	323	originally a live stream.
	324	live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
	325	or 'post_live' (was live, but VOD is not yet processed)
	326	If absent, automatically set from is_live, was_live
	327	start_time: Time in seconds where the reproduction should start, as
	328	specified in the URL.
	329	end_time: Time in seconds where the reproduction should end, as
	330	specified in the URL.
	331	chapters: A list of dictionaries, with the following entries:
	332	* "start_time" - The start time of the chapter in seconds
	333	* "end_time" - The end time of the chapter in seconds
	334	* "title" (optional, string)
	335	playable_in_embed: Whether this video is allowed to play in embedded
	336	players on other sites. Can be True (=always allowed),
	337	False (=never allowed), None (=unknown), or a string
	338	specifying the criteria for embedability; e.g. 'whitelist'
	339	availability: Under what condition the video is available. One of
	340	'private', 'premium_only', 'subscriber_only', 'needs_auth',
	341	'unlisted' or 'public'. Use 'InfoExtractor._availability'
	342	to set it
	343	_old_archive_ids: A list of old archive ids needed for backward compatibility
	344	__post_extractor: A function to be called just before the metadata is
	345	written to either disk, logger or console. The function
	346	must return a dict which will be added to the info_dict.
	347	This is useful for additional information that is
	348	time-consuming to extract. Note that the fields thus
	349	extracted will not be available to output template and
	350	match_filter. So, only "comments" and "comment_count" are
	351	currently allowed to be extracted via this method.
	352
	353	The following fields should only be used when the video belongs to some logical
	354	chapter or section:
	355
	356	chapter: Name or title of the chapter the video belongs to.
	357	chapter_number: Number of the chapter the video belongs to, as an integer.
	358	chapter_id: Id of the chapter the video belongs to, as a unicode string.
	359
	360	The following fields should only be used when the video is an episode of some
	361	series, programme or podcast:
	362
	363	series: Title of the series or programme the video episode belongs to.
	364	series_id: Id of the series or programme the video episode belongs to, as a unicode string.
	365	season: Title of the season the video episode belongs to.
	366	season_number: Number of the season the video episode belongs to, as an integer.
	367	season_id: Id of the season the video episode belongs to, as a unicode string.
	368	episode: Title of the video episode. Unlike mandatory video title field,
	369	this field should denote the exact title of the video episode
	370	without any kind of decoration.
	371	episode_number: Number of the video episode within a season, as an integer.
	372	episode_id: Id of the video episode, as a unicode string.
	373
	374	The following fields should only be used when the media is a track or a part of
	375	a music album:
	376
	377	track: Title of the track.
	378	track_number: Number of the track within an album or a disc, as an integer.
	379	track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
	380	as a unicode string.
	381	artist: Artist(s) of the track.
	382	genre: Genre(s) of the track.
	383	album: Title of the album the track belongs to.
	384	album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
	385	album_artist: List of all artists that appeared on the album (e.g.
	386	"Ash Borer / Fell Voices" or "Various Artists", useful for splits
	387	and compilations).
	388	disc_number: Number of the disc or other physical medium the track belongs to,
	389	as an integer.
	390	release_year: Year (YYYY) when the album was released.
	391	composer: Composer of the piece
	392
	393	The following fields should only be set for clips that should be cut from the original video:
	394
	395	section_start: Start time of the section in seconds
	396	section_end: End time of the section in seconds
	397
	398	The following fields should only be set for storyboards:
	399	rows: Number of rows in each storyboard fragment, as an integer
	400	columns: Number of columns in each storyboard fragment, as an integer
	401
	402	Unless mentioned otherwise, the fields should be Unicode strings.
	403
	404	Unless mentioned otherwise, None is equivalent to absence of information.
	405
	406
	407	_type "playlist" indicates multiple videos.
	408	There must be a key "entries", which is a list, an iterable, or a PagedList
	409	object, each element of which is a valid dictionary by this specification.
	410
	411	Additionally, playlists can have "id", "title", and any other relevant
	412	attributes with the same semantics as videos (see above).
	413
	414	It can also have the following optional fields:
	415
	416	playlist_count: The total number of videos in a playlist. If not given,
	417	YoutubeDL tries to calculate it from "entries"
	418
	419
	420	_type "multi_video" indicates that there are multiple videos that
	421	form a single show, for example multiple acts of an opera or TV episode.
	422	It must have an entries key like a playlist and contain all the keys
	423	required for a video at the same time.
	424
	425
	426	_type "url" indicates that the video must be extracted from another
	427	location, possibly by a different extractor. Its only required key is:
	428	"url" - the next URL to extract.
	429	The key "ie_key" can be set to the class name (minus the trailing "IE",
	430	e.g. "Youtube") if the extractor class is known in advance.
	431	Additionally, the dictionary may have any properties of the resolved entity
	432	known in advance, for example "title" if the title of the referred video is
	433	known ahead of time.
	434
	435
	436	_type "url_transparent" entities have the same specification as "url", but
	437	indicate that the given additional information is more precise than the one
	438	associated with the resolved URL.
	439	This is useful when a site employs a video service that hosts the video and
	440	its technical metadata, but that video service does not embed a useful
	441	title, description etc.
	442
	443
	444	Subclasses of this should also be added to the list of extractors and
	445	should define a _VALID_URL regexp, and re-define the _real_extract() and
	446	(optionally) _real_initialize() methods.
	447
	448	Subclasses may also override suitable() if necessary, but ensure the function
	449	signature is preserved and that this function imports everything it needs
	450	(except other extractors), so that lazy_extractors works correctly.
	451
	452	Subclasses can define a list of _EMBED_REGEX, which will be searched for in
	453	the HTML of Generic webpages. It may also override _extract_embed_urls
	454	or _extract_from_webpage as necessary. While these are normally classmethods,
	455	_extract_from_webpage is allowed to be an instance method.
	456
	457	_extract_from_webpage may raise self.StopExtraction() to stop further
	458	processing of the webpage and obtain exclusive rights to it. This is useful
	459	when the extractor cannot reliably be matched using just the URL,
	460	e.g. invidious/peertube instances
	461
	462	Embed-only extractors can be defined by setting _VALID_URL = False.
	463
	464	To support username + password (or netrc) login, the extractor must define a
	465	_NETRC_MACHINE and re-define _perform_login(username, password) and
	466	(optionally) _initialize_pre_login() methods. The _perform_login method will
	467	be called between _initialize_pre_login and _real_initialize if credentials
	468	are passed by the user. In cases where it is necessary to have the login
	469	process as part of the extraction rather than initialization, _perform_login
	470	can be left undefined.
	471
	472	_GEO_BYPASS attribute may be set to False in order to disable
	473	geo restriction bypass mechanisms for a particular extractor.
	474	Though it won't disable explicit geo restriction bypass based on
	475	country code provided with geo_bypass_country.
	476
	477	_GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
	478	countries for this extractor. One of these countries will be used by
	479	geo restriction bypass mechanism right away in order to bypass
	480	geo restriction, of course, if the mechanism is not disabled.
	481
	482	_GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
	483	IP blocks in CIDR notation for this extractor. One of these IP blocks
	484	will be used by geo restriction bypass mechanism similarly
	485	to _GEO_COUNTRIES.
	486
	487	The _ENABLED attribute should be set to False for IEs that
	488	are disabled by default and must be explicitly enabled.
	489
	490	The _WORKING attribute should be set to False for broken IEs
	491	in order to warn the users and skip the tests.
	492	"""
	493
	494	_ready = False
	495	_downloader = None
	496	_x_forwarded_for_ip = None
	497	_GEO_BYPASS = True
	498	_GEO_COUNTRIES = None
	499	_GEO_IP_BLOCKS = None
	500	_WORKING = True

1

import base64

import collections

import getpass

import hashlib

import http.client

import http.cookiejar

import http.cookies

import inspect

import itertools

import json

import math

import netrc

import os

import random

import re

import sys

import time

import types

import urllib.parse

import urllib.request

21

import xml.etree.ElementTree

22

23

from ..compat import functools # isort: split

24

from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name

25

from ..cookies import LenientSimpleCookie

26

from ..downloader import FileDownloader

27

from ..downloader.f4m import get_base_url, remove_encrypted_media

28

from ..utils import (

IDENTITY,

JSON_LD_RE,

NO_DEFAULT,

ExtractorError,

GeoRestrictedError,

GeoUtils,

LenientJSONDecoder,

RegexNotFoundError,

RetryManager,

UnsupportedError,

age_restricted,

base_url,

bug_reports_message,

classproperty,

clean_html,

determine_ext,

determine_protocol,

dict_get,

encode_data_uri,

error_to_compat_str,

extract_attributes,

filter_dict,

fix_xml_ampersands,

float_or_none,

format_field,

int_or_none,

join_nonempty,

js_to_json,

mimetype2ext,

network_exceptions,

orderedSet,

parse_bitrate,

parse_codecs,

parse_duration,

parse_iso8601,

parse_m3u8_attributes,

parse_resolution,

sanitize_filename,

sanitize_url,

sanitized_Request,

smuggle_url,

str_or_none,

str_to_int,

strip_or_none,

traverse_obj,

try_call,

try_get,

unescapeHTML,

unified_strdate,

unified_timestamp,

update_Request,

update_url_query,

url_basename,

url_or_none,

urljoin,

variadic,

xpath_element,

xpath_text,

xpath_with_ns,

)

class InfoExtractor:

"""Information Extractor class.

93

94

Information extractors are the classes that, given a URL, extract

95

information about the video (or videos) the URL refers to. This

96

information includes the real video URL, the video title, author and

97

others. The information is stored in a dictionary which is then

98

passed to the YoutubeDL. The YoutubeDL processes this

99

information possibly downloading the video to the file system, among

100

other possible outcomes.

101

102

The type field determines the type of the result.

103

By far the most common value (and the default if _type is missing) is

104

"video", which indicates a single video.

105

106

For a video, the dictionaries must include the following fields:

107

108

id: Video identifier.

109

title: Video title, unescaped. Set to an empty string if video has

110

no title as opposed to "None" which signifies that the

111

extractor failed to obtain a title

112

113

Additionally, it must contain either a formats entry or a url one:

114

115

formats: A list of dictionaries for each format available, ordered

116

from worst to best quality.

117

118

Potential fields:

119

* url The mandatory URL representing the media:

120

for plain file media - HTTP URL of this file,

121

for RTMP - RTMP URL,

122

for HLS - URL of the M3U8 media playlist,

123

for HDS - URL of the F4M manifest,

124

for DASH

125

- HTTP URL to plain file media (in case of

126

unfragmented media)

127

- URL of the MPD manifest or base URL

128

representing the media if MPD manifest

129

is parsed from a string (in case of

130

fragmented media)

131

for MSS - URL of the ISM manifest.

132

* manifest_url

133

The URL of the manifest file in case of

134

fragmented media:

135

for HLS - URL of the M3U8 master playlist,

136

for HDS - URL of the F4M manifest,

137

for DASH - URL of the MPD manifest,

138

for MSS - URL of the ISM manifest.

139

* manifest_stream_number (For internal use only)

140

The index of the stream in the manifest file

141

* ext Will be calculated from URL if missing

142

* format A human-readable description of the format

143

("mp4 container with h264/opus").

144

Calculated from the format_id, width, height,

145

and format_note fields if missing.

146

* format_id A short description of the format

147

("mp4_h264_opus" or "19").

148

Technically optional, but strongly recommended.

149

* format_note Additional info about the format

150

("3D" or "DASH video")

151

* width Width of the video, if known

152

* height Height of the video, if known

153

* resolution Textual description of width and height

154

* dynamic_range The dynamic range of the video. One of:

155

"SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"

156

* tbr Average bitrate of audio and video in KBit/s

157

* abr Average audio bitrate in KBit/s

158

* acodec Name of the audio codec in use

159

* asr Audio sampling rate in Hertz

160

* audio_channels Number of audio channels

161

* vbr Average video bitrate in KBit/s

162

* fps Frame rate

163

* vcodec Name of the video codec in use

164

* container Name of the container format

165

* filesize The number of bytes, if known in advance

166

* filesize_approx An estimate for the number of bytes

167

* player_url SWF Player URL (used for rtmpdump).

168

* protocol The protocol that will be used for the actual

169

download, lower-case. One of "http", "https" or

170

one of the protocols defined in downloader.PROTOCOL_MAP

171

* fragment_base_url

172

Base URL for fragments. Each fragment's path

173

value (if present) will be relative to

174

this URL.

175

* fragments A list of fragments of a fragmented media.

176

Each fragment entry must contain either an url

177

or a path. If an url is present it should be

178

considered by a client. Otherwise both path and

179

fragment_base_url must be present. Here is

180

the list of all potential fields:

181

* "url" - fragment's URL

182

* "path" - fragment's path relative to

183

fragment_base_url

184

* "duration" (optional, int or float)

185

* "filesize" (optional, int)

186

* is_from_start Is a live format that can be downloaded

187

from the start. Boolean

188

* preference Order number of this format. If this field is

189

present and not None, the formats get sorted

190

by this field, regardless of all other values.

191

-1 for default (order by other properties),

192

-2 or smaller for less than default.

193

< -1000 to hide the format (if there is

194

another one which is strictly better)

195

* language Language code, e.g. "de" or "en-US".

196

* language_preference Is this in the language mentioned in

197

the URL?

198

10 if it's what the URL is about,

199

-1 for default (don't know),

200

-10 otherwise, other values reserved for now.

201

* quality Order number of the video quality of this

202

format, irrespective of the file format.

203

-1 for default (order by other properties),

204

-2 or smaller for less than default.

205

* source_preference Order number for this video source

206

(quality takes higher priority)

207

-1 for default (order by other properties),

208

-2 or smaller for less than default.

209

* http_headers A dictionary of additional HTTP headers

210

to add to the request.

211

* stretched_ratio If given and not 1, indicates that the

212

video's pixels are not square.

213

width : height ratio as float.

214

* no_resume The server does not support resuming the

215

(HTTP or RTMP) download. Boolean.

216

* has_drm The format has DRM and cannot be downloaded. Boolean

217

* downloader_options A dictionary of downloader options

218

(For internal use only)

219

* http_chunk_size Chunk size for HTTP downloads

220

* ffmpeg_args Extra arguments for ffmpeg downloader

221

RTMP formats can also have the additional fields: page_url,

222

app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,

223

rtmp_protocol, rtmp_real_time

224

225

url: Final video URL.

226

ext: Video filename extension.

227

format: The video format, defaults to ext (used for --get-format)

228

player_url: SWF Player URL (used for rtmpdump).

229

230

The following fields are optional:

231

232

direct: True if a direct video file was given (must only be set by GenericIE)

233

alt_title: A secondary title of the video.

234

display_id: An alternative identifier for the video, not necessarily

235

unique, but available before title. Typically, id is

236

something like "4234987", title "Dancing naked mole rats",

237

and display_id "dancing-naked-mole-rats"

238

thumbnails: A list of dictionaries, with the following entries:

239

* "id" (optional, string) - Thumbnail format ID

240

* "url"

241

* "preference" (optional, int) - quality of the image

242

* "width" (optional, int)

243

* "height" (optional, int)

244

* "resolution" (optional, string "{width}x{height}",

245

deprecated)

246

* "filesize" (optional, int)

247

* "http_headers" (dict) - HTTP headers for the request

248

thumbnail: Full URL to a video thumbnail image.

249

description: Full video description.

250

uploader: Full name of the video uploader.

251

license: License name the video is licensed under.

252

creator: The creator of the video.

253

timestamp: UNIX timestamp of the moment the video was uploaded

254

upload_date: Video upload date in UTC (YYYYMMDD).

255

If not explicitly set, calculated from timestamp

256

release_timestamp: UNIX timestamp of the moment the video was released.

257

If it is not clear whether to use timestamp or this, use the former

258

release_date: The date (YYYYMMDD) when the video was released in UTC.

259

If not explicitly set, calculated from release_timestamp

260

modified_timestamp: UNIX timestamp of the moment the video was last modified.

261

modified_date: The date (YYYYMMDD) when the video was last modified in UTC.

262

If not explicitly set, calculated from modified_timestamp

263

uploader_id: Nickname or id of the video uploader.

264

uploader_url: Full URL to a personal webpage of the video uploader.

265

channel: Full name of the channel the video is uploaded on.

266

Note that channel fields may or may not repeat uploader

267

fields. This depends on a particular extractor.

268

channel_id: Id of the channel.

269

channel_url: Full URL to a channel webpage.

270

channel_follower_count: Number of followers of the channel.

271

location: Physical location where the video was filmed.

272

subtitles: The available subtitles as a dictionary in the format

273

{tag: subformats}. "tag" is usually a language code, and

274

"subformats" is a list sorted from lower to higher

275

preference, each element is a dictionary with the "ext"

276

entry and one of:

277

* "data": The subtitles file contents

278

* "url": A URL pointing to the subtitles file

279

It can optionally also have:

280

* "name": Name or description of the subtitles

281

* "http_headers": A dictionary of additional HTTP headers

282

to add to the request.

283

"ext" will be calculated from URL if missing

284

automatic_captions: Like 'subtitles'; contains automatically generated

285

captions instead of normal subtitles

286

duration: Length of the video in seconds, as an integer or float.

287

view_count: How many users have watched the video on the platform.

288

concurrent_view_count: How many users are currently watching the video on the platform.

289

like_count: Number of positive ratings of the video

290

dislike_count: Number of negative ratings of the video

291

repost_count: Number of reposts of the video

292

average_rating: Average rating given by users, the scale used depends on the webpage

293

comment_count: Number of comments on the video

294

comments: A list of comments, each with one or more of the following

295

properties (all but one of text or html optional):

296

* "author" - human-readable name of the comment author

297

* "author_id" - user ID of the comment author

298

* "author_thumbnail" - The thumbnail of the comment author

299

* "id" - Comment ID

300

* "html" - Comment as HTML

301

* "text" - Plain text of the comment

302

* "timestamp" - UNIX timestamp of comment

303

* "parent" - ID of the comment this one is replying to.

304

Set to "root" to indicate that this is a

305

comment to the original video.

306

* "like_count" - Number of positive ratings of the comment

307

* "dislike_count" - Number of negative ratings of the comment

308

* "is_favorited" - Whether the comment is marked as

309

favorite by the video uploader

310

* "author_is_uploader" - Whether the comment is made by

311

the video uploader

312

age_limit: Age restriction for the video, as an integer (years)

313

webpage_url: The URL to the video webpage, if given to yt-dlp it

314

should allow to get the same result again. (It will be set

315

by YoutubeDL if it's missing)

316

categories: A list of categories that the video falls in, for example

317

["Sports", "Berlin"]

318

tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]

319

cast: A list of the video cast

320

is_live: True, False, or None (=unknown). Whether this video is a

321

live stream that goes on instead of a fixed-length video.

322

was_live: True, False, or None (=unknown). Whether this video was

323

originally a live stream.

324

live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',

325

or 'post_live' (was live, but VOD is not yet processed)

326

If absent, automatically set from is_live, was_live

327

start_time: Time in seconds where the reproduction should start, as

328

specified in the URL.

329

end_time: Time in seconds where the reproduction should end, as

330

specified in the URL.

331

chapters: A list of dictionaries, with the following entries:

332

* "start_time" - The start time of the chapter in seconds

333

* "end_time" - The end time of the chapter in seconds

334

* "title" (optional, string)

335

playable_in_embed: Whether this video is allowed to play in embedded

336

players on other sites. Can be True (=always allowed),

337

False (=never allowed), None (=unknown), or a string

338

specifying the criteria for embedability; e.g. 'whitelist'

339

availability: Under what condition the video is available. One of

340

'private', 'premium_only', 'subscriber_only', 'needs_auth',

341

'unlisted' or 'public'. Use 'InfoExtractor._availability'

342

to set it

343

_old_archive_ids: A list of old archive ids needed for backward compatibility

344

__post_extractor: A function to be called just before the metadata is

345

written to either disk, logger or console. The function

346

must return a dict which will be added to the info_dict.

347

This is useful for additional information that is

348

time-consuming to extract. Note that the fields thus

349

extracted will not be available to output template and

350

match_filter. So, only "comments" and "comment_count" are

351

currently allowed to be extracted via this method.

352

353

The following fields should only be used when the video belongs to some logical

354

chapter or section:

355

356

chapter: Name or title of the chapter the video belongs to.

357

chapter_number: Number of the chapter the video belongs to, as an integer.

358

chapter_id: Id of the chapter the video belongs to, as a unicode string.

359

360

The following fields should only be used when the video is an episode of some

361

series, programme or podcast:

362

363

series: Title of the series or programme the video episode belongs to.

364

series_id: Id of the series or programme the video episode belongs to, as a unicode string.

365

season: Title of the season the video episode belongs to.

366

season_number: Number of the season the video episode belongs to, as an integer.

367

season_id: Id of the season the video episode belongs to, as a unicode string.

368

episode: Title of the video episode. Unlike mandatory video title field,

369

this field should denote the exact title of the video episode

370

without any kind of decoration.

371

episode_number: Number of the video episode within a season, as an integer.

372

episode_id: Id of the video episode, as a unicode string.

373

374

The following fields should only be used when the media is a track or a part of

375

a music album:

376

377

track: Title of the track.

378

track_number: Number of the track within an album or a disc, as an integer.

379

track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),

380

as a unicode string.

381

artist: Artist(s) of the track.

382

genre: Genre(s) of the track.

383

album: Title of the album the track belongs to.

384

album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).

385

album_artist: List of all artists appeared on the album (e.g.

386

"Ash Borer / Fell Voices" or "Various Artists", useful for splits

387

and compilations).

388

disc_number: Number of the disc or other physical medium the track belongs to,

389

as an integer.

390

release_year: Year (YYYY) when the album was released.

391

composer: Composer of the piece

392

393

The following fields should only be set for clips that should be cut from the original video:

394

395

section_start: Start time of the section in seconds

396

section_end: End time of the section in seconds

397

398

The following fields should only be set for storyboards:

399

rows: Number of rows in each storyboard fragment, as an integer

400

columns: Number of columns in each storyboard fragment, as an integer

401

402

Unless mentioned otherwise, the fields should be Unicode strings.

403

404

Unless mentioned otherwise, None is equivalent to absence of information.

405

406

407

_type "playlist" indicates multiple videos.

408

There must be a key "entries", which is a list, an iterable, or a PagedList

409

object, each element of which is a valid dictionary by this specification.

410

411

Additionally, playlists can have "id", "title", and any other relevant

412

attributes with the same semantics as videos (see above).

413

414

It can also have the following optional fields:

415

416

playlist_count: The total number of videos in a playlist. If not given,

417

YoutubeDL tries to calculate it from "entries"

418

419

420

_type "multi_video" indicates that there are multiple videos that

421

form a single show, for example multiple acts of an opera or TV episode.

422

It must have an entries key like a playlist and contain all the keys

423

required for a video at the same time.

424

425

426

_type "url" indicates that the video must be extracted from another

427

location, possibly by a different extractor. Its only required key is:

428

"url" - the next URL to extract.

429

The key "ie_key" can be set to the class name (minus the trailing "IE",

430

e.g. "Youtube") if the extractor class is known in advance.

431

Additionally, the dictionary may have any properties of the resolved entity

432

known in advance, for example "title" if the title of the referred video is

known ahead of time.

_type "url_transparent" entities have the same specification as "url", but

437

indicate that the given additional information is more precise than the one

438

associated with the resolved URL.

439

This is useful when a site employs a video service that hosts the video and

440

its technical metadata, but that video service does not embed a useful

441

title, description etc.

442

443

444

Subclasses of this should also be added to the list of extractors and

445

should define a _VALID_URL regexp and, re-define the _real_extract() and

446

(optionally) _real_initialize() methods.

447

448

Subclasses may also override suitable() if necessary, but ensure the function

449

signature is preserved and that this function imports everything it needs

450

(except other extractors), so that lazy_extractors works correctly.

451

452

Subclasses can define a list of _EMBED_REGEX, which will be searched for in

453

the HTML of Generic webpages. It may also override _extract_embed_urls

454

or _extract_from_webpage as necessary. While these are normally classmethods,

455

_extract_from_webpage is allowed to be an instance method.

456

457

_extract_from_webpage may raise self.StopExtraction() to stop further

458

processing of the webpage and obtain exclusive rights to it. This is useful

459

when the extractor cannot reliably be matched using just the URL,

460

e.g. invidious/peertube instances

461

462

Embed-only extractors can be defined by setting _VALID_URL = False.

463

464

To support username + password (or netrc) login, the extractor must define a

465

_NETRC_MACHINE and re-define _perform_login(username, password) and

466

(optionally) _initialize_pre_login() methods. The _perform_login method will

467

be called between _initialize_pre_login and _real_initialize if credentials

468

are passed by the user. In cases where it is necessary to have the login

469

process as part of the extraction rather than initialization, _perform_login

470

can be left undefined.

471

472

_GEO_BYPASS attribute may be set to False in order to disable

473

geo restriction bypass mechanisms for a particular extractor.

474

Though it won't disable explicit geo restriction bypass based on

475

country code provided with geo_bypass_country.

476

477

_GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted

478

countries for this extractor. One of these countries will be used by

479

geo restriction bypass mechanism right away in order to bypass

480

geo restriction, of course, if the mechanism is not disabled.

481

482

_GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted

483

IP blocks in CIDR notation for this extractor. One of these IP blocks

484

will be used by geo restriction bypass mechanism similarly

485

to _GEO_COUNTRIES.

486

487

The _ENABLED attribute should be set to False for IEs that

488

are disabled by default and must be explicitly enabled.

489

490

The _WORKING attribute should be set to False for broken IEs

491

in order to warn the users and skip the tests.

"""

# Per-instance state; these are only the class-level defaults
_ready = False  # set to True at the end of a successful initialize()
_downloader = None  # assigned through set_downloader()
_x_forwarded_for_ip = None  # when set, sent as the X-Forwarded-For header

# Extractor configuration (see the class docstring for their meaning)
_GEO_BYPASS = True
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
_WORKING = True
_ENABLED = True
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None
_VALID_URL = None
_EMBED_REGEX = []

def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a short hint telling the user how to authenticate.

    method selects the hint: None (empty string), 'any', 'password' or
    'cookies'. When left at NO_DEFAULT, 'any' is used if the extractor
    supports login and 'cookies' otherwise.
    netrc is the machine name shown for --netrc; self._NETRC_MACHINE is
    shown when it is falsy.
    """
    password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]

518

519

def __init__(self, downloader=None):
    """Create the extractor, optionally bound to a downloader.

    downloader is a YoutubeDL instance. If it is not passed here, it must
    be set using "set_downloader()" before "extract()" is called.
    """
    self._printed_messages = set()
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)

527

528

@classmethod
def _match_valid_url(cls, url):
    """Match url against _VALID_URL and return the match object (or None).

    None is also returned when _VALID_URL is False. The compiled pattern is
    cached on the class as _VALID_URL_RE.
    """
    if cls._VALID_URL is False:
        return None
    # The class' own namespace is inspected instead of using has/getattr
    # intentionally - what matters is whether the regexp was cached for
    # *this* class, whereas getattr would also match the superclass
    if '_VALID_URL_RE' not in vars(cls):
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    compiled = cls._VALID_URL_RE
    return compiled.match(url)

538

539

@classmethod
def suitable(cls, url):
    """Receives a URL and tells whether this IE can handle it (True/False)."""
    # Apart from other extractors, everything needed here must be imported
    # by this function itself, so that lazy_extractors works correctly
    match = cls._match_valid_url(url)
    return match is not None

545

546

@classmethod
def _match_id(cls, url):
    """Return the "id" group that _VALID_URL captures from url."""
    mobj = cls._match_valid_url(url)
    return mobj.group('id')

549

550

@classmethod
def get_temp_id(cls, url):
    """Return the id matched from url, or None if it cannot be obtained."""
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        # No match at all, or the pattern has no "id" group
        temp_id = None
    return temp_id

@classmethod
def working(cls):
    """Report the value of the _WORKING flag."""
    flag = cls._WORKING
    return flag

@classmethod
def supports_login(cls):
    """Tell whether a _NETRC_MACHINE is defined for this extractor."""
    return True if cls._NETRC_MACHINE else False

565

566

def initialize(self):
    """Initializes an instance (authentication, etc).

    The printed-messages reset and the geo bypass setup run on every call;
    the login steps and _real_initialize() run only while the instance has
    not been marked ready yet.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)
    if self._ready:
        return

    self._initialize_pre_login()
    if not self.supports_login():
        # A username was given although this extractor cannot log in with it
        if self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
            self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    else:
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    self._real_initialize()
    self._ready = True

583

584

def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    HTTP requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    Nothing is returned; the chosen IP is stored in self._x_forwarded_for_ip.
    """
    # A fake IP has been chosen already, keep it
    if self._x_forwarded_for_ip:
        return

    # Geo bypass mechanism is explicitly disabled by user
    if not self.get_param('geo_bypass', True):
        return

    if not geo_bypass_context:
        geo_bypass_context = {}
    elif isinstance(geo_bypass_context, (list, tuple)):
        # Backward compatibility: previously _initialize_geo_bypass
        # expected a list of countries, some 3rd party code may still use
        # it this way
        geo_bypass_context = {
            'countries': geo_bypass_context,
        }

    # The whole point of geo bypass mechanism is to fake IP
    # as X-Forwarded-For HTTP header based on some IP block or
    # country code.

    # Path 1: bypassing based on IP block in CIDR notation

    # Explicit IP block specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    ip_block = self.get_param('geo_bypass_ip_block', None)

    # Otherwise use random IP block from geo bypass context but only
    # if extractor is known as geo bypassable
    if not ip_block:
        ip_blocks = geo_bypass_context.get('ip_blocks')
        if self._GEO_BYPASS and ip_blocks:
            ip_block = random.choice(ip_blocks)

    if ip_block:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
        self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
        return

    # Path 2: bypassing based on country code

    # Explicit country code specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    country = self.get_param('geo_bypass_country', None)

    # Otherwise use random country code from geo bypass context but
    # only if extractor is known as geo bypassable
    if not country:
        countries = geo_bypass_context.get('countries')
        if self._GEO_BYPASS and countries:
            country = random.choice(countries)

    if country:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
        # Logged through the extractor's own write_debug, same as in path 1
        self.write_debug(
            f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

666

667

def extract(self, url):
    """Extract the information for url and return the resulting info dict.

    The value returned by _real_extract() is passed through (None included).
    ExtractorErrors are re-raised with video id, extractor name and traceback
    filled in; IncompleteRead, KeyError and StopIteration are wrapped in an
    ExtractorError.
    """
    try:
        # Two attempts at most: a second one is made only after a geo
        # restriction error for which a fake IP could be set up
        for _attempt in range(2):
            try:
                self.initialize()
                self.write_debug('Extracting URL: %s' % url)
                info = self._real_extract(url)
                if info is None:
                    return None
                if self._x_forwarded_for_ip:
                    info['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subs = info.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for pseudo_lang in ('live_chat', 'comments', 'danmaku'):
                        subs.pop(pseudo_lang, None)
                return info
            except GeoRestrictedError as geo_err:
                if not self.__maybe_fake_ip_and_retry(geo_err.countries):
                    raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        kwargs = dict(
            video_id=e.video_id or self.get_temp_id(url),
            ie=self.IE_NAME,
            tb=e.traceback or sys.exc_info()[2],
            expected=e.expected,
            cause=e.cause,
        )
        if hasattr(e, 'countries'):
            kwargs['countries'] = e.countries
        raise type(e)(e.orig_msg, **kwargs)
    except http.client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

705

706

def __maybe_fake_ip_and_retry(self, countries):
    """Try to set up a fake X-Forwarded-For IP from one of countries.

    Returns True when a fake IP was chosen (a warning is reported in that
    case), False otherwise.
    """
    # Every condition below rules the automatic retry out
    if self.get_param('geo_bypass_country', None):
        return False
    if not self._GEO_BYPASS:
        return False
    if not self.get_param('geo_bypass', True):
        return False
    if self._x_forwarded_for_ip or not countries:
        return False

    country_code = random.choice(countries)
    self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    if not self._x_forwarded_for_ip:
        return False
    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (self._x_forwarded_for_ip, country_code.upper()))
    return True

def set_downloader(self, downloader):
    """Use downloader (a YoutubeDL instance) as the downloader of this IE."""
    self._downloader = downloader

@property
def cache(self):
    """The cache attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cache

@property
def cookiejar(self):
    """The cookiejar attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cookiejar

732

733

def _initialize_pre_login(self):
    """Hook for initialization before login; does nothing here.

    Redefine in subclasses.
    """

736

737

def _perform_login(self, username, password):
    """Hook for logging in with username and password; does nothing here.

    Redefine in subclasses.
    """

740

741

def _real_initialize(self):
    """Hook for the real initialization process; does nothing here.

    Redefine in subclasses.
    """

744

745

def _real_extract(self, url):
    """Real extraction process; raises NotImplementedError here.

    Redefine in subclasses.
    """
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)

@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor.

    It is the class name without its last two characters.
    """
    class_name = cls.__name__
    return class_name[:-2]

@classproperty
def IE_NAME(cls):
    """The class name without its last two characters."""
    class_name = cls.__name__
    return class_name[:-2]

757

758

@staticmethod
def __can_accept_status_code(err, expected_status):
    """Tell whether the HTTP error err is accepted by expected_status.

    expected_status may be None (nothing is accepted), a callable (accepted
    only if it returns exactly True for the status code), or one or several
    status codes.
    """
    assert isinstance(err, urllib.error.HTTPError)
    if expected_status is None:
        return False
    if callable(expected_status):
        return expected_status(err.code) is True
    return err.code in variadic(expected_status)

767

768

def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Build the request for url_or_request with data, headers and query.

    An existing urllib.request.Request is updated through update_Request();
    anything else has the query merged into it and is turned into a request
    by sanitized_Request().
    """
    if not isinstance(url_or_request, urllib.request.Request):
        url = update_url_query(url_or_request, query) if query else url_or_request
        return sanitized_Request(url, data, headers or {})
    return update_Request(url_or_request, data=data, headers=headers, query=query)

774

775

def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
    """
    Return the response handle.

    See _download_webpage docstring for arguments specification.

    On a network error, False is returned if errnote is False or the error
    is not fatal; otherwise ExtractorError is raised. For an HTTP error
    whose status is accepted by expected_status, the error's file object is
    returned instead.
    """
    if self._downloader._first_webpage_request:
        self._downloader._first_webpage_request = False
    else:
        # Every request but the first one may be delayed
        sleep_interval = self.get_param('sleep_interval_requests') or 0
        if sleep_interval > 0:
            self.to_screen('Sleeping %s seconds ...' % sleep_interval)
            time.sleep(sleep_interval)

    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(str(note) if video_id is None else f'{video_id}: {note}')

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        # Work on a copy so that the caller's dict is left untouched
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    try:
        return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
    except network_exceptions as err:
        if isinstance(err, urllib.error.HTTPError) and self.__can_accept_status_code(err, expected_status):
            # Retain reference to error to prevent file object from
            # being closed before it can be read. Works around the
            # effects of <https://bugs.python.org/issue15002>
            # introduced in Python 3.4.1.
            err.fp._error = err
            return err.fp

        if errnote is False:
            return False
        prefix = 'Unable to download webpage' if errnote is None else errnote
        errmsg = f'{prefix}: {error_to_compat_str(err)}'
        if not fatal:
            self.report_warning(errmsg)
            return False
        raise ExtractorError(errmsg, cause=err)

829

830

def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                             encoding=None, data=None, headers={}, query={}, expected_status=None):
    """
    Return a tuple (page content as string, URL handle).

    False is returned instead when the request failed and the failure is
    not fatal.

    Arguments:
    url_or_request -- plain text URL as a string or
        a urllib.request.Request object
    video_id -- Video/playlist/item identifier (string)

    Keyword arguments:
    note -- note printed before downloading (string)
    errnote -- note printed in case of an error (string)
    fatal -- flag denoting whether error should be considered fatal,
        i.e. whether it should cause ExtractorError to be raised,
        otherwise a warning will be reported and extraction continued
    encoding -- encoding for a page content decoding, guessed automatically
        when not explicitly specified
    data -- POST data (bytes)
    headers -- HTTP headers (dict)
    query -- URL query (dict)
    expected_status -- allows to accept failed HTTP requests (non 2xx
        status code) by explicitly specifying a set of accepted status
        codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
        Note that this argument does not affect success status codes (2xx)
        which are always accepted.
    """
    # NOTE: the shared default dicts for headers/query are only passed along
    # here and are not modified by this method

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, str):
        url_or_request, _, _ = url_or_request.partition('#')

    urlh = self._request_webpage(
        url_or_request, video_id, note, errnote, fatal,
        data=data, headers=headers, query=query, expected_status=expected_status)
    if urlh is False:
        assert not fatal
        return False
    content = self._webpage_read_content(
        urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)

874

875

@staticmethod
def _guess_encoding_from_content(content_type, webpage_bytes):
    """Guess the text encoding of a downloaded page.

    Checked in this order: the charset parameter of the Content-Type header
    value (content_type, str), a <meta ... charset=...> declaration within
    the first 1024 bytes of webpage_bytes, a UTF-16 little-endian byte order
    mark, and finally the default 'utf-8'.

    Returns the encoding name as a string. It is not validated here.
    """
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        # The charset may be followed by further parameters and may be
        # quoted, e.g. 'text/html; charset="utf-8"; boundary=x'
        charset = m.group(1).partition(';')[0].strip().strip('\'"')
        if charset:
            return charset

    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    if m:
        try:
            return m.group(1).decode('ascii')
        except UnicodeDecodeError:
            # Not a usable encoding name; continue with the other guesses
            # instead of failing the whole download
            pass

    if webpage_bytes.startswith(b'\xff\xfe'):
        return 'utf-16'
    return 'utf-8'

def __check_blocked(self, content):
    """Raise an expected ExtractorError if content is a known block page.

    Three kinds of block pages are recognised: Websense filtering, the
    Indian censorship notice and the Russian (TTK / rkn.gov.ru) notice.
    Nothing is returned for any other content.
    """
    head = content[:512]

    if ('<title>Access to this site is blocked</title>' in content
            and 'Websense' in head):
        websense_msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        info_url = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if info_url:
            websense_msg += ' Visit %s for more details' % info_url
        raise ExtractorError(websense_msg, expected=True)

    if '<title>The URL you requested has been blocked</title>' in head:
        censorship_msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        reason = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if reason:
            censorship_msg += ' (Message: "%s")' % reason.replace('\n', ' ')
        raise ExtractorError(censorship_msg, expected=True)

    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
            and 'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)

919

920

def _request_dump_filename(self, url, video_id):
    """
    Build the name of the dump file for a request.

    The name is '<video_id>_<url>.dump' passed through sanitize_filename.
    Names longer than the 'trim_file_name' param (240 if unset) are cut and
    suffixed with an MD5 digest of the untrimmed name.
    """
    stem = f'{video_id}_{url}'
    limit = self.get_param('trim_file_name') or 240
    if len(stem) > limit:
        digest = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:limit - len(digest)] + digest
    filename = sanitize_filename(f'{stem}.dump', restricted=True)
    if compat_os_name != 'nt':
        return filename
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    full_path = os.path.abspath(filename)
    return fR'\\?\{full_path}' if len(full_path) > 259 else filename

934

935

def __decode_webpage(self, webpage_bytes, encoding, headers):

936

if not encoding:

937

encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)

938

try:

939

return webpage_bytes.decode(encoding, 'replace')

940

except LookupError:

941

return webpage_bytes.decode('utf-8', 'replace')

942

943

def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the whole response from urlh and return it decoded as a str.

    prefix, if given, is prepended to the bytes read. Depending on the
    'dump_intermediate_pages' and 'write_pages' params the raw bytes are
    also printed base64-encoded and/or written to a dump file. The decoded
    page is checked for known block pages before being returned.
    url_or_request, note, errnote and fatal are not used by this body.
    """
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))

    if self.get_param('write_pages'):
        dump_path = self._request_dump_filename(urlh.geturl(), video_id)
        self.to_screen(f'Saving request to {dump_path}')
        with open(dump_path, 'wb') as dump_file:
            dump_file.write(raw)

    page = self.__decode_webpage(raw, encoding, urlh.headers)
    self.__check_blocked(page)
    return page

def __print_error(self, errnote, fatal, video_id, err):
    """
    Raise an ExtractorError caused by err when fatal; otherwise report a
    warning, but only if errnote is truthy.
    """
    if fatal:
        raise ExtractorError(f'{video_id}: {errnote}', cause=err)
    if errnote:
        self.report_warning(f'{video_id}: {errnote}: {err}')

967

968

def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """
    Parse xml_string (after applying transform_source, if any) and return
    the result of compat_etree_fromstring. Parse errors are raised or
    reported as a warning depending on fatal.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = 'Failed to parse XML' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, ve)

975

976

def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """
    Parse json_string with LenientJSONDecoder (strict=False) and return the
    decoded value. parser_kwargs are forwarded to json.loads. A ValueError
    is raised or reported as a warning depending on fatal.
    """
    try:
        parsed = json.loads(
            json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
    except ValueError as ve:
        message = 'Failed to parse JSON' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, ve)
    else:
        return parsed

982

983

def _parse_socket_response_as_json(self, data, *args, **kwargs):
    """Parse the part of data between its first '{' and its last '}' with _parse_json"""
    start = data.find('{')
    end = data.rfind('}') + 1
    return self._parse_json(data[start:end], *args, **kwargs)

985

986

def __create_download_methods(name, parser, note, errnote, return_value):
    """
    Build a pair of download methods for one kind of content.

    @param name             Used to name the generated methods:
                            _download_<name>_handle and _download_<name>
    @param parser           Name of the method used to parse the downloaded
                            content, or None to return the content as-is
    @param note             Default value of the methods' note argument
    @param errnote          Default value of the methods' errnote argument
    @param return_value     Description of the parsed value, inserted into
                            the generated docstrings
    @returns                (download_handle, download_content)
    """

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        if parser is None:
            return content
        # errnote is only forwarded to the parser when it is exactly False
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    # Downloads through _download_webpage_handle and returns (parsed content, URL handle),
    # or False when _download_webpage_handle returned False
    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    # Returns only the parsed content (or False), without the URL handle
    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With the 'load_pages' param, first try to read the page from its dump file;
        # if that file cannot be read, warn and fall through to a real download
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.full_url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        # Without a parser transform_source is not passed on to the handle method
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    # Renames func and replaces its docstring so it presents itself as InfoExtractor.<name>
    def impersonate(func, name, return_value):
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
            @param transform_source Apply this transformation before parsing
            @returns {return_value}

            See _download_webpage_handle docstring for other arguments specification
        '''

    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content

1049

1050

# Each call generates a (_download_<name>_handle, _download_<name>) pair of methods
# that download a page and parse it with the named parser method
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# No parser here: the page is returned as-is. Only the content method ([1]) of the pair is kept
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

1057

1058

def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 if not given)

    The download is retried only after http.client.IncompleteRead; once
    `tries` attempts have failed that exception is re-raised.

    See _download_webpage_handle docstring for other arguments specification.
    """
    # NB: tries and timeout are otherwise unused; should they be deprecated?
    # The default must be resolved here: the NO_DEFAULT sentinel is not a
    # valid sleep interval and would make the retry path fail
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except http.client.IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)

1089

1090

def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """
    Report a warning through the downloader, prefixed with '[ie_name]' and
    the video id (if given). With only_once, a message that was already
    recorded in _printed_messages is not reported again.
    """
    id_prefix = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {id_prefix}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)

1098

1099

def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    tagged = f'[{self.IE_NAME}] {msg}'
    self._downloader.to_screen(tagged, *args, **kwargs)

1102

1103

def write_debug(self, msg, *args, **kwargs):
    """Pass msg, prefixed with '[ie_name]', to the downloader's write_debug"""
    tagged = f'[{self.IE_NAME}] {msg}'
    self._downloader.write_debug(tagged, *args, **kwargs)

1105

1106

def get_param(self, name, default=None, *args, **kwargs):

1107

if self._downloader:

1108

return self._downloader.params.get(name, default, *args, **kwargs)

1109

return default

1110

1111

def report_drm(self, video_id, partial=False):
    """Report the video as DRM protected via raise_no_formats. partial is not used here"""
    message = 'This video is DRM protected'
    self.raise_no_formats(message, expected=True, video_id=video_id)

1113

1114

def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)

1117

1118

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)

1121

1122

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)

1125

1126

def report_login(self):
    """Report attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)

1129

1130

def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """
    Raise an expected ExtractorError with msg followed by the login hint
    for method. If metadata_available and either of the params
    'ignore_no_formats_error' / 'wait_for_video' is set, only warn.
    """
    if metadata_available:
        tolerated = self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')
        if tolerated:
            self.report_warning(msg)
            return
    hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + hint, expected=True)

1139

1140

def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """
    Raise GeoRestrictedError for countries. If metadata_available and
    either of the params 'ignore_no_formats_error' / 'wait_for_video' is
    set, only warn.
    """
    if metadata_available:
        tolerated = self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')
        if tolerated:
            self.report_warning(msg)
            return
    raise GeoRestrictedError(msg, countries=countries)

1148

1149

def raise_no_formats(self, msg, expected=False, video_id=None):
    """
    Raise an error for a video without formats. msg is re-raised as-is if
    it already is an ExtractorError. If expected and either of the params
    'ignore_no_formats_error' / 'wait_for_video' is set, only warn.
    """
    if expected:
        tolerated = self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')
        if tolerated:
            self.report_warning(msg, video_id)
            return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)

1157

1158

# Methods for following #608

1159

@staticmethod

1160

def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):

1161

"""Returns a URL that points to a page that should be processed"""

1162

if ie is not None:

1163

kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()

1164

if video_id is not None:

1165

kwargs['id'] = video_id

1166

if video_title is not None:

1167

kwargs['title'] = video_title

1168

return {

1169

**kwargs,

1170

'_type': 'url_transparent' if url_transparent else 'url',

'url': url,

}

@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """
    Build a playlist whose entries are url_result()s of the values that
    getter yields for matches, passed through orderedSet. video_kwargs are
    passed to every url_result, kwargs to playlist_result.
    """
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(m, ie, **(video_kwargs or {})) for m in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)

1180

1181

@staticmethod

1182

def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):

1183

"""Returns a playlist"""

1184

if playlist_id:

1185

kwargs['id'] = playlist_id

1186

if playlist_title:

1187

kwargs['title'] = playlist_title

1188

if playlist_description is not None:

1189

kwargs['description'] = playlist_description

1190

return {

1191

**kwargs,

1192

'_type': 'multi_video' if multi_video else 'playlist',

'entries': entries,

}

def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    @param pattern  A pattern (str or compiled), or an iterable of patterns
                    that are tried in order
    @param string   The text to search; None is treated as "no match"
    @param group    None for the first group that is not None, a group
                    name/index, or a list/tuple of them (returns a tuple)
    """
    # Start from "no match" so that a None string and an empty
    # iterable of patterns both end up in the failure handling below
    mobj = None
    if string is not None:
        if isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None

1230

1231

def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """Searches string for the JSON object specified by start_pattern"""
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        # An explicit default makes the search non-fatal
        fatal = False
    else:
        default = {}

    pattern = rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})'
    json_string = self._search_regex(
        pattern, string, name, group='json', fatal=fatal,
        default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default

1257

1258

def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    # Falsy results (None, '', a falsy default) are handed back untouched
    return clean_html(found).strip() if found else found

def _get_netrc_login_info(self, netrc_machine=None):
    """
    Get (username, password) for netrc_machine (_NETRC_MACHINE if not
    given) from the netrc file. Returns (None, None) when the 'usenetrc'
    param is not set or when the lookup fails; failures are reported as
    a warning.
    """
    netrc_machine = netrc_machine or self._NETRC_MACHINE
    if not self.get_param('usenetrc', False):
        return None, None

    username = password = None
    try:
        location = compat_expanduser(self.get_param('netrc_location') or '~')
        # A directory means "the .netrc inside it"
        if os.path.isdir(location):
            location = os.path.join(location, '.netrc')
        info = netrc.netrc(file=location).authenticators(netrc_machine)
        if info is None:
            raise netrc.NetrcParseError(
                'No authenticators for %s' % netrc_machine)
        username = info[0]
        password = info[2]
    except (OSError, netrc.NetrcParseError) as err:
        self.report_warning(
            'parsing .netrc: %s' % error_to_compat_str(err))
    return username, password

1290

1291

def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):

1292

"""

1293

Get the login info as (username, password)

1294

First look for the manually specified credentials using username_option

1295

and password_option as keys in params dictionary. If no such credentials

1296

available look in the netrc file using the netrc_machine or _NETRC_MACHINE

1297

value.

1298

If there's no info available, return (None, None)

1299

"""

1300

1301

# Attempt to use provided username and password or .netrc data

1302

username = self.get_param(username_option)

1303

if username is not None:

1304

password = self.get_param(password_option)

1305

else:

1306

username, password = self._get_netrc_login_info(netrc_machine)

1307

1308

return username, password

1309

1310

def _get_tfa_info(self, note='two-factor verification code'):

1311

"""

1312

Get the two-factor authentication info

1313

TODO - asking the user will be required for sms/phone verify

1314

currently just uses the command line option

1315

If there's no info available, return None

1316

"""

1317

1318

tfa = self.get_param('twofactor')

if tfa is not None:

return tfa

return getpass.getpass('Type %s and press [Return]: ' % note)

1323

1324

# Helper functions for extracting OpenGraph info

1325

@staticmethod

1326

def _og_regexes(prop):

1327

content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'

1328

property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'

1329

% {'prop': re.escape(prop), 'sep': '(?::|[:-])'})

1330

template = r'<meta[^>]+?%s[^>]+?%s'

1331

return [

1332

template % (property_re, content_re),

1333

template % (content_re, property_re),

]

@staticmethod

def _meta_regex(prop):

1338

return r'''(?isx)<meta

1339

(?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)

1340

[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

1341

1342

def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for the OpenGraph property prop (a single name or several,
    tried in order) and return its value with HTML entities unescaped.
    name defaults to 'OpenGraph <first prop>'; kargs go to _search_regex.
    """
    props = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % props[0]
    og_regexes = [regex for p in props for regex in self._og_regexes(p)]
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)

1353

1354

def _og_search_thumbnail(self, html, **kargs):
    """Search html for the OpenGraph 'image' property (non-fatal)"""
    return self._og_search_property(
        'image', html, 'thumbnail URL', fatal=False, **kargs)

1356

1357

def _og_search_description(self, html, **kargs):
    """Search html for the OpenGraph 'description' property (non-fatal)"""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)

1359

1360

def _og_search_title(self, html, *, fatal=False, **kargs):
    """Search html for the OpenGraph 'title' property (non-fatal by default)"""
    return self._og_search_property(
        'title', html, fatal=fatal, **kargs)

1362

1363

def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """
    Search html for the OpenGraph video URL. 'video' and 'video:url' are
    tried, preceded by 'video:secure_url' when secure is set.
    """
    props = ['video', 'video:url']
    if secure:
        props.insert(0, 'video:secure_url')
    regexes = [regex for prop in props for regex in self._og_regexes(prop)]
    return self._html_search_regex(regexes, html, name, **kargs)

1368

1369

def _og_search_url(self, html, **kargs):
    """Search html for the OpenGraph 'url' property"""
    prop = 'url'
    return self._og_search_property(prop, html, **kargs)

1371

1372

def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
    """Extract the text of the <title> element of html (non-fatal by default)"""
    title_re = r'(?s)<title\b[^>]*>([^<]+)</title>'
    return self._html_search_regex(title_re, html, name, fatal=fatal, **kwargs)

1374

1375

def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Search html for the content of a <meta> tag identified by name (a
    single name or several, tried in order). display_name defaults to the
    first name.
    """
    names = variadic(name)
    if display_name is None:
        display_name = names[0]
    meta_regexes = [self._meta_regex(n) for n in names]
    return self._html_search_regex(
        meta_regexes, html, display_name, fatal=fatal, group='content', **kwargs)

1382

1383

def _dc_search_uploader(self, html):
    """Search html for the 'dc.creator' meta tag, reported as 'uploader'"""
    meta_name = 'dc.creator'
    return self._html_search_meta(meta_name, html, 'uploader')

1385

1386

@staticmethod

1387

def _rta_search(html):

1388

# See http://www.rtalabel.org/index.php?content=howtofaq#single

1389

if re.search(r'(?ix)<meta\s+name="rating"\s+'

1390

r' content="RTA-5042-1996-1400-1577-RTA"',

html):

return 18

# And then there are the jokers who advertise that they use RTA, but actually don't.

1395

AGE_LIMIT_MARKERS = [

1396

r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',

1397

]

1398

if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):

return 18

return 0

def _media_rating_search(self, html):
    """
    Map the 'rating' meta tag of html to an age limit. Returns None if
    the tag is missing or its value is not in the table.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    age_limits = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    rating = self._html_search_meta('rating', html)
    if rating:
        return age_limits.get(rating.lower())
    return None

1417

1418

def _family_friendly_search(self, html):
    """
    Map the 'isFamilyFriendly' meta tag of html to an age limit: 0 for
    '1'/'true', 18 for '0'/'false'. Returns None if the tag is missing or
    has another value.
    """
    # See http://schema.org/VideoObject
    age_limits = {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }
    family_friendly = self._html_search_meta(
        'isFamilyFriendly', html, default=None)
    if family_friendly:
        return age_limits.get(family_friendly.lower())
    return None

1433

1434

def _twitter_search_player(self, html):
    """Search html for the 'twitter:player' meta tag"""
    meta_name = 'twitter:player'
    return self._html_search_meta(meta_name, html, 'twitter card player')

1437

1438

def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html"""
    # Passing any default makes JSON parsing non-fatal
    fatal = fatal if default is NO_DEFAULT else False
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=fatal)
        # Only dicts are yielded; anything else in the parsed data is dropped
        yield from (item for item in variadic(parsed) if isinstance(item, dict))

1447

1448

def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html"""
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    candidates = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(candidates, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    # Nothing usable was found
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}

1464

1465

def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build an info dict from JSON-LD data.

    @param json_ld          A JSON string (parsed with _parse_json), or the
                            already parsed data: a dict or several dicts
    @param fatal            Passed to _parse_json when json_ld is a string
    @param expected_type    If given, entries whose '@type' does not
                            contain it are skipped
    @returns                The collected fields passed through filter_dict,
                            or {} if json_ld is empty
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    # Filled in by the nested helpers below
    info = {}

    # Last path component of an interaction type -> prefix of the '<kind>_count' field
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # True if any of expected_types is found in the '@type' of e
        type = variadic(traverse_obj(e, '@type'))
        return any(x in type for x in expected_types)

    def extract_interaction_type(e):
        # 'interactionType' is either the type itself or a dict carrying it as '@type'
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Stores '<kind>_count' fields in info; a count that is already set is kept
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            # Only the part after the last '/' of the type is looked up
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # Chapters are built from the 'hasPart' entries of type 'Clip'
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
        # Fill missing times from the neighbouring chapters. zip() stops at the
        # shortest input, so the last chapter is not visited by this loop
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # Reads info['duration'], which extract_video_object sets before calling this
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        # Copies the fields of a VideoObject/AudioObject entry into info
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        extract_interaction_statistic(e)
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            # Top-level entries without '@context' are ignored
            if at_top_level and '@context' not in e:
                continue
            # A top-level entry with only '@context' and '@graph' is a container:
            # its '@graph' is traversed and the remaining entries are not visited
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                break
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                # The episode name only serves as title if no title is set yet
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                # Only the first element of 'video' / 'subjectOf' is considered here
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                # Without an expected_type all entries are visited;
                # with one, traversal stops after the first handled entry
                if expected_type is None:
                    continue
                else:
                    break
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)

1634

1635

def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
    """
    Search webpage for the JSON inside the <script id="__NEXT_DATA__">
    element and return it parsed.

    @param transform_source Passed to _parse_json
    @param fatal            Passed to both the regex search and the parsing
    @param kw               Passed to _search_regex (e.g. default)
    @returns                The parsed data; {} if the script element was
                            not found and the search was not fatal
    """
    next_data = self._search_regex(
        r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
        webpage, 'next.js data', fatal=fatal, **kw)
    if next_data is None:
        # A non-fatal miss must not reach the JSON parser: parsing None
        # raises TypeError, which the parser's error handling does not catch
        return {}
    return self._parse_json(next_data, video_id, transform_source=transform_source, fatal=fatal)

1641

1642

def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """
    Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    @param context_name     Name of the JS variable/function holding the data
    @param fatal            Passed to both the regex search and the parsing
    @param traverse         Path applied to the parsed data with traverse_obj
    @returns                The traversed data; {} if it is empty or if the
                            data was not found and the search was not fatal
    """
    rectx = re.escape(context_name)
    # Matches "(function(a,b){return {...}}(1,2)"; the parentheses are literal
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    found = self._search_regex(
        (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
    if found is None:
        # Non-fatal miss: there is nothing to unpack
        return {}
    js, arg_keys, arg_vals = found

    # Map the function's parameter names to the values it is called with;
    # JS "undefined" values become JSON null
    args = {
        key: 'null' if val in ('undefined', 'void 0') else val
        for key, val in zip(arg_keys.split(','), arg_vals.split(','))
    }

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
    return traverse_obj(ret, traverse) or {}

1658

1659

@staticmethod
def _hidden_inputs(html):
    """
    Collect the <input> elements of type 'hidden' or 'submit' in html.

    @returns    A dict mapping the name (or, without a name, the id) of
                each such input to its value. Inputs without a name/id or
                without a value attribute are skipped
    """
    # Remove HTML comments first so that commented-out inputs are ignored
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs

1674

1675

def _form_hidden_inputs(self, form_id, html):
    """Return the _hidden_inputs of the <form> in html whose id is form_id"""
    # NB: form_id is inserted into the pattern as-is, not escaped
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)

1680

1681

class FormatSort:
    """Builds the sort key used to order format dicts.

    The sort order is assembled from forced/priority default fields, the
    user's 'format_sort' param, the extractor-supplied preference and the
    remaining defaults (see evaluate_params). Each entry follows `regex`:
    an optional '+' (reverse), a field name and an optional ':' or '~'
    separated limit.

    `settings` describes every known field. Each instance works on its own
    copy so that limits and conversions resolved for one sorter never leak
    into another one through the shared class attribute.
    """
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
        'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ie, field_preference):
        """
        @param ie                Extractor providing _downloader and get_param
        @param field_preference  Sort strings supplied by the extractor
        """
        self._order = []
        self.ydl = ie._downloader
        # The methods below write limits, cached defaults and conversions into
        # the per-field dicts; copy them so the class-level table stays pristine
        self.settings = {name: dict(conf) for name, conf in type(self).settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ie.get_param('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting `key` of `field`, computing and caching a default if unset"""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw value/limit of `field` according to its 'convert' setting

        None is returned unchanged unless convertNone is set.
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return FileDownloader.parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # Non-numeric limit: treat the whole field as a string from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order and the per-field limits from params and extractor preference"""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                # The first occurrence of a field wins
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user, extractor and effective sort order through write_debug"""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn the value of `field` into a comparable tuple honouring reverse/closest/limit"""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value(s) backing `field` from the format and compute its preference tuple"""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key of a format dict; missing derived fields are filled in on the dict"""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        # Determine missing bitrates
        if format.get('tbr') is None:
            if format.get('vbr') is not None and format.get('abr') is not None:
                format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
        else:
            # `or 0` rather than a .get() default: the key may be present with
            # an explicit None, which must not take part in the subtraction
            if format.get('vcodec') != 'none' and format.get('vbr') is None:
                format['vbr'] = format.get('tbr') - (format.get('abr') or 0)
            if format.get('acodec') != 'none' and format.get('abr') is None:
                format['abr'] = format.get('tbr') - (format.get('vbr') or 0)

        return tuple(self._calculate_field_preference(format, field) for field in self._order)

1965

1966

def _sort_formats(self, formats, field_preference=[]):

1967

if not formats:

1968

return

1969

formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)

1970

1971

def _check_formats(self, formats, video_id):

1972

if formats:

1973

formats[:] = filter(

1974

lambda f: self._is_valid_url(

1975

f['url'], video_id,

1976

item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),

formats)

@staticmethod

def _remove_duplicate_formats(formats):

format_urls = set()

unique_formats = []

for f in formats:

if f['url'] not in format_urls:

1985

format_urls.add(f['url'])

1986

unique_formats.append(f)

1987

formats[:] = unique_formats

1988

1989

def _is_valid_url(self, url, video_id, item='video', headers={}):

1990

url = self._proto_relative_url(url, scheme='http:')

1991

# For now assume non HTTP(S) URLs always valid

1992

if not (url.startswith('http://') or url.startswith('https://')):

1993

return True

1994

try:

1995

self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)

1996

return True

1997

except ExtractorError as e:

1998

self.to_screen(

1999

'%s: %s URL is invalid, skipping: %s'

2000

% (video_id, item, error_to_compat_str(e.cause)))

2001

return False

2002

2003

def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'

2009

2010

def _proto_relative_url(self, url, scheme=None):
    """Run `url` through sanitize_url using `scheme` (default: http_scheme()).

    `scheme` must include the trailing colon, e.g. "https:"; it is handed
    to sanitize_url without it.
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    bare_scheme = scheme[:-1]
    return sanitize_url(url, scheme=bare_scheme)

2014

2015

def _sleep(self, timeout, video_id, msg_template=None):

2016

if msg_template is None:

2017

msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'

2018

msg = msg_template % {'video_id': video_id, 'timeout': timeout}

self.to_screen(msg)

time.sleep(timeout)

def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,

2023

transform_source=lambda s: fix_xml_ampersands(s).strip(),

2024

fatal=True, m3u8_id=None, data=None, headers={}, query={}):

2025

res = self._download_xml_handle(

2026

manifest_url, video_id, 'Downloading f4m manifest',

2027

'Unable to download f4m manifest',

2028

# Some manifests may be malformed, e.g. prosiebensat1 generated manifests

2029

# (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)

2030

transform_source=transform_source,

2031

fatal=fatal, data=data, headers=headers, query=query)

if res is False:

return []

manifest, urlh = res

manifest_url = urlh.geturl()

2037

2038

return self._parse_f4m_formats(

2039

manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,

2040

transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

2041

2042

def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Build format dicts from a parsed f4m manifest.

    @param manifest      Root element of the manifest
    @param manifest_url  URL the manifest was fetched from; relative media
                         URLs are resolved against it when the manifest
                         provides no base URL
    @returns             List of format dicts. Media entries that point to
                         another f4m or an m3u8 manifest are expanded
                         through the respective extraction method.
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An empty element has text None, which cannot be searched
    akamai_pv_text = (akamai_pv.text or '') if akamai_pv is not None else ''
    if ';' in akamai_pv_text:
        playerVerificationChallenge = akamai_pv_text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'mime type', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL of this entry. Kept separate from `manifest_url` so that every
        # relative media URL is resolved against the manifest itself and not
        # against the URL computed for the previous entry
        format_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            format_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(format_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    format_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': format_url,
            'manifest_url': format_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats

def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """Return a format dict that stands for the m3u8 URL itself ("meta" format).

    Its preference is 100 below the given one, or -100 when none is given.
    """
    meta_preference = preference - 100 if preference else -100
    meta_format = {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    return meta_format

2155

2156

def _report_ignoring_subs(self, name):
    """Warn, once, that subtitle tracks of the `name` manifest are being ignored."""
    message = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,')
    self.report_warning(message, only_once=True)

2161

2162

def _extract_m3u8_formats(self, *args, **kwargs):

2163

fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)

2164

if subs:

2165

self._report_ignoring_subs('HLS')

2166

return fmts

2167

2168

def _extract_m3u8_formats_and_subtitles(

2169

self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',

2170

preference=None, quality=None, m3u8_id=None, note=None,

2171

errnote=None, fatal=True, live=False, data=None, headers={},

2172

query={}):

2173

2174

res = self._download_webpage_handle(

2175

m3u8_url, video_id,

2176

note='Downloading m3u8 information' if note is None else note,

2177

errnote='Failed to download m3u8 information' if errnote is None else errnote,

2178

fatal=fatal, data=data, headers=headers, query=query)

if res is False:

return [], {}

m3u8_doc, urlh = res

m3u8_url = urlh.geturl()

2185

2186

return self._parse_m3u8_formats_and_subtitles(

2187

m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,

2188

preference=preference, quality=quality, m3u8_id=m3u8_id,

2189

note=note, errnote=errnote, fatal=fatal, live=live, data=data,

2190

headers=headers, query=query, video_id=video_id)

2191

2192

def _parse_m3u8_formats_and_subtitles(

2193

self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',

2194

preference=None, quality=None, m3u8_id=None, live=False, note=None,

2195

errnote=None, fatal=True, data=None, headers={}, query={},

2196

video_id=None):

2197

formats, subtitles = [], {}

2198

2199

has_drm = re.search('|'.join([

2200

r'#EXT-X-FAXS-CM:', # Adobe Flash Access

2201

r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay

]), m3u8_doc)

def format_url(url):

return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

2206

2207

if self.get_param('hls_split_discontinuity', False):

2208

def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):

if not m3u8_doc:

if not manifest_url:

return []

m3u8_doc = self._download_webpage(

2213

manifest_url, video_id, fatal=fatal, data=data, headers=headers,

2214

note=False, errnote='Failed to download m3u8 playlist information')

2215

if m3u8_doc is False:

2216

return []

2217

return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

2218

2219

else:

2220

def _extract_m3u8_playlist_indices(*args, **kwargs):

return [None]

# References:

# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21

2225

# 2. https://github.com/ytdl-org/youtube-dl/issues/12211

2226

# 3. https://github.com/ytdl-org/youtube-dl/issues/18923

2227

2228

# We should try extracting formats only from master playlists [1, 4.3.4],

2229

# i.e. playlists that describe available qualities. On the other hand

2230

# media playlists [1, 4.3.3] should be returned as is since they contain

2231

# just the media without qualities renditions.

2232

# Fortunately, master playlist can be easily distinguished from media

2233

# playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]

2234

# master playlist tags MUST NOT appear in a media playlist and vice versa.

2235

# As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every

2236

# media playlist and MUST NOT appear in master playlist thus we can

2237

# clearly detect media playlist with this criterion.

2238

2239

if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is

2240

formats = [{

2241

'format_id': join_nonempty(m3u8_id, idx),

2242

'format_index': idx,

2243

'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),

2244

'ext': ext,

2245

'protocol': entry_protocol,

2246

'preference': preference,

2247

'quality': quality,

2248

'has_drm': has_drm,

2249

} for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

2250

2251

return formats, subtitles

groups = {}

last_stream_inf = {}

def extract_media(x_media_line):

2257

media = parse_m3u8_attributes(x_media_line)

2258

# As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED

2259

media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')

2260

if not (media_type and group_id and name):

2261

return

2262

groups.setdefault(group_id, []).append(media)

2263

# <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>

2264

if media_type == 'SUBTITLES':

2265

# According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the

2266

# EXT-X-MEDIA tag if the media type is SUBTITLES.

2267

# However, lack of URI has been spotted in the wild.

2268

# e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339

2269

if not media.get('URI'):

2270

return

2271

url = format_url(media['URI'])

2272

sub_info = {

2273

'url': url,

2274

'ext': determine_ext(url),

2275

}

2276

if sub_info['ext'] == 'm3u8':

2277

# Per RFC 8216 §3.1, the only possible subtitle format m3u8

2278

# files may contain is WebVTT:

2279

# <https://tools.ietf.org/html/rfc8216#section-3.1>

2280

sub_info['ext'] = 'vtt'

2281

sub_info['protocol'] = 'm3u8_native'

2282

lang = media.get('LANGUAGE') or 'und'

2283

subtitles.setdefault(lang, []).append(sub_info)

2284

if media_type not in ('VIDEO', 'AUDIO'):

2285

return

2286

media_url = media.get('URI')

2287

if media_url:

2288

manifest_url = format_url(media_url)

2289

formats.extend({

2290

'format_id': join_nonempty(m3u8_id, group_id, name, idx),

'format_note': name,

'format_index': idx,

'url': manifest_url,

'manifest_url': m3u8_url,

2295

'language': media.get('LANGUAGE'),

2296

'ext': ext,

2297

'protocol': entry_protocol,

2298

'preference': preference,

2299

'quality': quality,

2300

'vcodec': 'none' if media_type == 'AUDIO' else None,

2301

} for idx in _extract_m3u8_playlist_indices(manifest_url))

2302

2303

def build_stream_name():

2304

# Despite specification does not mention NAME attribute for

2305

# EXT-X-STREAM-INF tag it still sometimes may be present (see [1]

2306

# or vidio test in TestInfoExtractor.test_parse_m3u8_formats)

2307

# 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015

2308

stream_name = last_stream_inf.get('NAME')

2309

if stream_name:

2310

return stream_name

2311

# If there is no NAME in EXT-X-STREAM-INF it will be obtained

2312

# from corresponding rendition group

2313

stream_group_id = last_stream_inf.get('VIDEO')

2314

if not stream_group_id:

2315

return

2316

stream_group = groups.get(stream_group_id)

2317

if not stream_group:

2318

return stream_group_id

2319

rendition = stream_group[0]

2320

return rendition.get('NAME') or stream_group_id

2321

2322

# parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the

2323

# chance to detect video only formats when EXT-X-STREAM-INF tags

2324

# precede EXT-X-MEDIA tags in HLS manifest such as [3].

2325

for line in m3u8_doc.splitlines():

2326

if line.startswith('#EXT-X-MEDIA:'):

2327

extract_media(line)

2328

2329

for line in m3u8_doc.splitlines():

2330

if line.startswith('#EXT-X-STREAM-INF:'):

2331

last_stream_inf = parse_m3u8_attributes(line)

2332

elif line.startswith('#') or not line.strip():

continue

else:

tbr = float_or_none(

last_stream_inf.get('AVERAGE-BANDWIDTH')

2337

or last_stream_inf.get('BANDWIDTH'), scale=1000)

2338

manifest_url = format_url(line.strip())

2339

2340

for idx in _extract_m3u8_playlist_indices(manifest_url):

2341

format_id = [m3u8_id, None, idx]

2342

# Bandwidth of live streams may differ over time thus making

2343

# format_id unpredictable. So it's better to keep provided

2344

# format_id intact.

2345

if not live:

2346

stream_name = build_stream_name()

2347

format_id[1] = stream_name or '%d' % (tbr or len(formats))

2348

f = {

2349

'format_id': join_nonempty(*format_id),

2350

'format_index': idx,

2351

'url': manifest_url,

2352

'manifest_url': m3u8_url,

2353

'tbr': tbr,

2354

'ext': ext,

2355

'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),

2356

'protocol': entry_protocol,

2357

'preference': preference,

2358

'quality': quality,

2359

}

2360

resolution = last_stream_inf.get('RESOLUTION')

2361

if resolution:

2362

mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)

2363

if mobj:

2364

f['width'] = int(mobj.group('width'))

2365

f['height'] = int(mobj.group('height'))

2366

# Unified Streaming Platform

2367

mobj = re.search(

2368

r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])

2369

if mobj:

2370

abr, vbr = mobj.groups()

2371

abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)

f.update({

'vbr': vbr,

'abr': abr,

})

codecs = parse_codecs(last_stream_inf.get('CODECS'))

2377

f.update(codecs)

2378

audio_group_id = last_stream_inf.get('AUDIO')

2379

# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which

2380

# references a rendition group MUST have a CODECS attribute.

2381

# However, this is not always respected. E.g. [2]

2382

# contains EXT-X-STREAM-INF tag which references AUDIO

2383

# rendition group but does not have CODECS and despite

2384

# referencing an audio group it represents a complete

2385

# (with audio and video) format. So, for such cases we will

2386

# ignore references to rendition groups and treat them

2387

# as complete formats.

2388

if audio_group_id and codecs and f.get('vcodec') != 'none':

2389

audio_group = groups.get(audio_group_id)

2390

if audio_group and audio_group[0].get('URI'):

2391

# TODO: update acodec for audio only formats with

# the same GROUP-ID

f['acodec'] = 'none'

if not f.get('ext'):

f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'

formats.append(f)

# for DailyMotion

progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')

2400

if progressive_uri:

2401

http_f = f.copy()

2402

del http_f['manifest_url']

2403

http_f.update({

2404

'format_id': f['format_id'].replace('hls-', 'http-'),

2405

'protocol': 'http',

2406

'url': progressive_uri,

2407

})

2408

formats.append(http_f)

2409

2410

last_stream_inf = {}

2411

return formats, subtitles

2412

2413

def _extract_m3u8_vod_duration(
        self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """Download an m3u8 media playlist and return its VOD duration.

    The download is always non-fatal; a failed download is treated as an
    empty playlist.  The result is whatever _parse_m3u8_vod_duration
    returns for the downloaded text.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if note is None:
        note = 'Downloading m3u8 VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    playlist = self._download_webpage(
        m3u8_vod_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)

    # Fall back to an empty string so the parser can do its substring checks
    return self._parse_m3u8_vod_duration(playlist or '', video_id)

2423

2424

def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):

2425

if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:

return None

return int(sum(

float(line[len('#EXTINF:'):].split(',')[0])

2430

for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None

2431

2432

@staticmethod

2433

def _xpath_ns(path, namespace=None):

if not namespace:

return path

out = []

for c in path.split('/'):

2438

if not c or c == '.':

2439

out.append(c)

2440

else:

2441

out.append('{%s}%s' % (namespace, c))

2442

return '/'.join(out)

2443

2444

def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download a SMIL document and return its ``(formats, subtitles)``.

    Returns ``([], {})`` when the download reports failure (False), which
    is only expected when *fatal* is false.
    """
    download = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if download is False:
        assert not fatal
        return [], {}

    smil_doc, handle = download
    # Continue with the URL reported by the response handle
    final_url = handle.geturl()

    ns = self._parse_smil_namespace(smil_doc)

    formats = self._parse_smil_formats(
        smil_doc, final_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil_doc, namespace=ns)
    return formats, subtitles

def _extract_smil_formats(self, *args, **kwargs):
    """Formats-only variant of _extract_smil_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('SMIL')
    return formats

2467

2468

def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the info dict built by _parse_smil.

    Returns an empty dict when the download reports failure (False).
    """
    download = self._download_smil(smil_url, video_id, fatal=fatal)
    if download is False:
        return {}

    smil_doc, handle = download
    # Parse relative to the URL reported by the response handle
    return self._parse_smil(smil_doc, handle.geturl(), video_id, f4m_params=f4m_params)

2477

2478

def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch a SMIL file through _download_xml_handle and return its result.

    Callers in this class unpack the result as ``(document, url_handle)``
    and treat False as a failed download.
    """
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml_handle(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)

2482

2483

def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    The returned dict carries 'id', 'title', 'description', 'upload_date',
    'thumbnails', 'formats' and 'subtitles'.  Note that the reported id is
    derived from the file name of *smil_url*; the passed *video_id* is only
    used while extracting formats.
    """
    ns = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=ns)

    video_id, _ = os.path.splitext(url_basename(smil_url))

    # The first non-empty value wins for each metadata field
    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', ns)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', ns)):
        image_url = image.get('src')
        if not image_url:
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image_url,
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        # Fall back to the id when the document has no title metadata
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }

2522

2523

def _parse_smil_namespace(self, smil):
    """Return the XML namespace of the SMIL root element, or None.

    The namespace is taken from a root tag of the form ``{namespace}smil``
    (matched case-insensitively).
    """
    namespace_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(namespace_re, smil.tag, 'namespace', default=None)

2526

2527

def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Extract the list of formats described by a parsed SMIL document.

    Every distinct ``src`` of the <video> and <audio> elements is turned
    into one or more formats depending on its protocol/extension (RTMP,
    HLS, HDS, DASH, ISM or plain HTTP); <imagestream> elements are added
    as storyboard formats.

    @param smil                Parsed SMIL root element
    @param smil_url            URL of the SMIL document; used as base URL
                               unless a <head>/<meta> element overrides it
                               through its 'base' or 'httpBase' attribute
    @param video_id            Id passed on to the nested extractions
    @param namespace           XML namespace of the document, if any
    @param f4m_params          Query parameters appended to f4m manifest
                               URLs; a built-in default is used when empty
    @param transform_rtmp_url  Optional callable ``(streamer, src)``
                               returning the replacement pair for RTMP formats
    @returns                   List of format dicts
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    # Per-protocol counters, used for format ids when no bitrate is known
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            if len(m3u8_formats) == 1:
                # A single-format playlist inherits the metadata of the SMIL entry
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Probe the resolved URL: the raw src may be relative to base
        # and may carry surrounding whitespace
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats

def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect the subtitles referenced by <textstream> elements.

    Returns a dict mapping a language to a list of ``{'url', 'ext'}``
    dicts; every distinct ``src`` is listed once.  *subtitles_lang* is the
    language used for streams that do not declare one.
    """
    seen_srcs = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        # Explicit extension first, then the MIME type, then the URL itself
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)

        lang = subtitles_lang
        for attr in ('systemLanguage', 'systemLanguageName', 'lang'):
            declared = textstream.get(attr)
            if declared:
                lang = declared
                break

        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles

def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return its entries.

    Returns the list built by _parse_xspf, or an empty list when the
    download reports failure (False).
    """
    res = self._download_xml_handle(
        # The note previously read 'xpsf', a misspelling of the format name
        xspf_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if res is False:
        return []

    xspf, urlh = res
    # Continue with the URL reported by the response handle
    xspf_url = urlh.geturl()

    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))

2670

2671

def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn the tracks of a parsed XSPF document into a list of entries.

    Each <track> yields one entry whose formats come from its <location>
    elements, resolved against *xspf_base_url*.  Every entry uses
    *playlist_id* as its id, and as its title when the track has none.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the 'xspf:'/'s1:' prefixes used below
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration = float_or_none(
            xpath_text(track, ns('./xspf:duration'), 'duration'), 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries

def _extract_mpd_formats(self, *args, **kwargs):
    """Formats-only variant of _extract_mpd_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats

2717

2718

def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """Download an MPD manifest and return its ``(formats, subtitles)``.

    Returns ``([], {})`` when the download reports failure (False) or
    yields no document.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    response = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if response is False:
        return [], {}

    manifest, handle = response
    if manifest is None:
        return [], {}

    # We could have been redirected to a new url when we retrieved our mpd file.
    final_url = handle.geturl()
    return self._parse_mpd_formats_and_subtitles(
        manifest, mpd_id, base_url(final_url), final_url)

2738

2739

def _parse_mpd_formats(self, *args, **kwargs):
    """Formats-only variant of _parse_mpd_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats

2744

2745

def _parse_mpd_formats_and_subtitles(
        self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.
    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

    @param mpd_doc       Parsed root element of the manifest
    @param mpd_id        Optional prefix for the generated format ids
    @param mpd_base_url  Base URL that relative BaseURL values are resolved against
    @param mpd_url       URL of the manifest; stored as 'manifest_url'
    @returns             (formats, subtitles); both are empty for a manifest of
                         type 'dynamic' when the 'dynamic_mpd' param is false
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            return [], {}

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent's values and override with this element's
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats, subtitles = [], {}
    # Counts how many formats share one URL, to number them per manifest
    stream_numbers = collections.defaultdict(int)
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Build the base URL from the innermost BaseURL outwards,
                # stopping as soon as it becomes absolute
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First of, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    t = ''
                    in_template = False
                    for c in tmpl:
                        t += c
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            t += c
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally, unescape $$ to a literal $.  str.replace returns
                    # a new string, so its result has to be kept
                    return t.replace('$$', '$')

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            # Reads the current segment_time/segment_d/segment_number
                            # of the enclosing loop
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for s in representation_ms_info['s']:
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for _ in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for _ in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    formats.append(f)
                elif content_type == 'text':
                    subtitles.setdefault(lang or 'und', []).append(f)

    return formats, subtitles

3082

3083

def _extract_ism_formats(self, *args, **kwargs):
    """Formats-only variant of _extract_ism_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('ISM')
    return formats

3088

3089

def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and return its ``(formats, subtitles)``.

    Returns ``([], {})`` when the download reports failure (False) or
    yields no document.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    response = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if response is False:
        return [], {}

    manifest, handle = response
    if manifest is None:
        return [], {}

    # Parse relative to the URL reported by the response handle
    return self._parse_ism_formats_and_subtitles(manifest, handle.geturl(), ism_id)

3102

3103

def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.
    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx

    @param ism_doc  Parsed root element of the manifest
    @param ism_url  URL of the manifest; fragment URLs are resolved against it
    @param ism_id   Optional prefix for the generated format ids
    @returns        (formats, subtitles); both are empty when the manifest
                    is marked IsLive="TRUE"
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        for track in stream.findall('QualityLevel'):
            KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # Without an explicit duration the fragment lasts until the
                    # next fragment starts.  The lookup has to go through the
                    # list of fragments, not the children of the current <c>
                    # element.  Fall back to the total duration for the last
                    # fragment or when the next one carries no start time
                    next_fragment_time = None
                    if stream_fragment_index + 1 < len(stream_fragments):
                        next_fragment_time = int_or_none(
                            stream_fragments[stream_fragment_index + 1].get('t'))
                    if next_fragment_time is None:
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles

3213

3214

def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from <video>/<audio>-like tags found in a webpage.

    The amp-* and dl8-* prefixed variants of the tags are handled as well.
    Returns a list of dicts carrying 'formats', 'subtitles' and 'thumbnail';
    tags that yield neither formats nor subtitles are left out.
    """
    src_attributes = ('src', 'data-video-src', 'data-src', 'data-source')

    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Turn a "type" attribute into codec info plus the extension
        # implied by its mimetype; anything unparsable gives {}
        ctr = content_type and re.search(
            r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not ctr:
            return {}
        mimetype, codecs = ctr.groups()
        info = parse_codecs(codecs)
        info['ext'] = mimetype2ext(mimetype)
        return info

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats); manifests are expanded through
        # the matching extractor, everything else is a single direct URL
        full_url = absolute_url(src)
        ext = (type_info or {}).get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            return False, self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        if ext == 'mpd':
            return False, self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        return True, [{
            'url': full_url,
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            'ext': ext,
        }]

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    tag_name_re = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags carry no inner content, hence the trailing ''
    media_tags = [
        (tag, tag_name, tag_media_type, '')
        for tag, tag_name, tag_media_type
        in re.findall(r'(?s)(<(%s)[^>]*/>)' % tag_name_re, webpage)]
    # We only allow video|audio followed by a whitespace or '>'.
    # Allowing more characters may end up in significant slow down (see
    # https://github.com/ytdl-org/youtube-dl/issues/11979,
    # e.g. http://www.porntrex.com/maps/videositemap.xml).
    media_tags.extend(re.findall(
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % tag_name_re, webpage))

    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(dict_get(media_attributes, src_attributes))
        if src:
            type_info = parse_content_type(media_attributes.get('type'))
            _, tag_formats = _media_formats(src, media_type, type_info)
            media_info['formats'].extend(tag_formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, src_attributes))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, source_formats = _media_formats(src, media_type, f)
                if not is_plain_url:
                    media_info['formats'].extend(source_formats)
                    continue

                # width, height, res, label and title attributes are
                # all not standard but seen several times in the wild
                labels = [
                    s_attr.get(attr_name)
                    for attr_name in ('label', 'title')
                    if str_or_none(s_attr.get(attr_name))
                ]
                width = int_or_none(s_attr.get('width'))
                height = int_or_none(s_attr.get('height')) or int_or_none(s_attr.get('res'))
                if not (width and height):
                    for label in labels:
                        resolution = parse_resolution(label)
                        if resolution:
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                # First label that parses to a truthy bitrate wins
                tbr = next(filter(None, map(parse_bitrate, labels)), None)
                f.update({
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'format_id': s_attr.get('label') or s_attr.get('title'),
                })
                f.update(source_formats[0])
                media_info['formats'].append(f)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are treated like subtitles/captions
                if kind and kind not in ('subtitles', 'captions'):
                    continue
                src = strip_or_none(track_attributes.get('src'))
                if not src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                media_info['subtitles'].setdefault(lang, []).append({
                    'url': absolute_url(src),
                })

        for fmt in media_info['formats']:
            fmt.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries

3339

3340

def _extract_akamai_formats(self, *args, **kwargs):

3341

fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)

3342

if subs:

3343

self._report_ignoring_subs('akamai')

3344

return fmts

3345

3346

def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):

3347

signed = 'hdnea=' in manifest_url

3348

if not signed:

3349

# https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html

3350

manifest_url = re.sub(

3351

r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',

3352

'', manifest_url).strip('?')

formats = []

subtitles = {}

hdcore_sign = 'hdcore=3.7.0'

3358

f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')

3359

hds_host = hosts.get('hds')

3360

if hds_host:

3361

f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)

3362

if 'hdcore=' not in f4m_url:

3363

f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign

3364

f4m_formats = self._extract_f4m_formats(

3365

f4m_url, video_id, f4m_id='hds', fatal=False)

3366

for entry in f4m_formats:

3367

entry.update({'extra_param_to_segment_url': hdcore_sign})

3368

formats.extend(f4m_formats)

3369

3370

m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')

3371

hls_host = hosts.get('hls')

3372

if hls_host:

3373

m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)

3374

m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(

3375

m3u8_url, video_id, 'mp4', 'm3u8_native',

3376

m3u8_id='hls', fatal=False)

3377

formats.extend(m3u8_formats)

3378

subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

3379

3380

http_host = hosts.get('http')

3381

if http_host and m3u8_formats and not signed:

3382

REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'

3383

qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')

3384

qualities_length = len(qualities)

3385

if len(m3u8_formats) in (qualities_length, qualities_length + 1):

3386

i = 0

3387

for f in m3u8_formats:

3388

if f['vcodec'] != 'none':

3389

for protocol in ('http', 'https'):

3390

http_f = f.copy()

3391

del http_f['manifest_url']

3392

http_url = re.sub(

3393

REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])

3394

http_f.update({

3395

'format_id': http_f['format_id'].replace('hls-', protocol + '-'),

3396

'url': http_url,

3397

'protocol': protocol,

3398

})

3399

formats.append(http_f)

3400

i += 1

3401

3402

return formats, subtitles

3403

3404

def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):

3405

query = urllib.parse.urlparse(url).query

3406

3407

mobj = re.search(

3408

r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)

3409

url_base = mobj.group('url')

3410

http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

3411

formats = []

3412

3413

def manifest_url(manifest):

3414

m_url = f'{http_base_url}/{manifest}'

3415

if query:

3416

m_url += '?%s' % query

3417

return m_url

3418

3419

if 'm3u8' not in skip_protocols:

3420

formats.extend(self._extract_m3u8_formats(

3421

manifest_url('playlist.m3u8'), video_id, 'mp4',

3422

m3u8_entry_protocol, m3u8_id='hls', fatal=False))

3423

if 'f4m' not in skip_protocols:

3424

formats.extend(self._extract_f4m_formats(

3425

manifest_url('manifest.f4m'),

3426

video_id, f4m_id='hds', fatal=False))

3427

if 'dash' not in skip_protocols:

3428

formats.extend(self._extract_mpd_formats(

3429

manifest_url('manifest.mpd'),

3430

video_id, mpd_id='dash', fatal=False))

3431

if re.search(r'(?:/smil:|\.smil)', url_base):

3432

if 'smil' not in skip_protocols:

3433

rtmp_formats = self._extract_smil_formats(

3434

manifest_url('jwplayer.smil'),

3435

video_id, fatal=False)

3436

for rtmp_format in rtmp_formats:

3437

rtsp_format = rtmp_format.copy()

3438

rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])

3439

del rtsp_format['play_path']

3440

del rtsp_format['ext']

3441

rtsp_format.update({

3442

'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),

3443

'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),

3444

'protocol': 'rtsp',

3445

})

3446

formats.extend([rtmp_format, rtsp_format])

3447

else:

3448

for protocol in ('rtmp', 'rtsp'):

3449

if protocol not in skip_protocols:

3450

formats.append({

3451

'url': f'{protocol}:{url_base}',

3452

'format_id': protocol,

3453

'protocol': protocol,

})

return formats

def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate the options object of a jwplayer("...").setup({...}) call.

    @param webpage           HTML to search
    @param video_id          id handed to _parse_json
    @param transform_source  callable turning the JS object into JSON text
    @returns                 the parsed options dict, or None when no
                             setup call is found, parsing fails, or the
                             parsed value is not a dict
    """
    # The parentheses of the jwplayer(...) and .setup(...) calls must be
    # matched literally, hence the escaped \( and \)
    mobj = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if mobj:
        try:
            jwplayer_data = self._parse_json(mobj.group('options'),
                                             video_id=video_id,
                                             transform_source=transform_source)
        except ExtractorError:
            # Best effort: unparsable options are treated as "not found"
            pass
        else:
            if isinstance(jwplayer_data, dict):
                return jwplayer_data

3471

3472

def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in a webpage and parse it into info dict(s).

    Extra positional/keyword arguments are forwarded to _parse_jwplayer_data.
    """
    setup_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(setup_data, video_id, *args, **kwargs)

3477

3478

def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer setup data into an info dict, or a playlist result
    when the data describes more than one item.

    Note that jwplayer_data (and its playlist items) may be modified in
    place while the legacy layouts are normalised.
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    if not isinstance(jwplayer_data['playlist'], list):
        jwplayer_data['playlist'] = [jwplayer_data['playlist']]

    entries = []
    for video_data in jwplayer_data['playlist']:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        for track in tracks if isinstance(tracks, list) else ():
            if not isinstance(track, dict):
                continue
            track_kind = track.get('kind')
            # Only caption/subtitle tracks with a usable URL are kept
            if not (isinstance(track_kind, str)
                    and track_kind.lower() in ('captions', 'subtitles')):
                continue
            track_url = urljoin(base_url, track.get('file'))
            if not track_url:
                continue
            subtitles.setdefault(track.get('label') or 'en', []).append({
                'url': self._proto_relative_url(track_url)
            })

        raw_title = video_data['title'] if require_title else video_data.get('title')
        entry = {
            'id': this_video_id,
            'title': unescapeHTML(raw_title),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube:
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            self._sort_formats(formats)
            entry['formats'] = formats
        entries.append(entry)

    if len(entries) == 1:
        return entries[0]
    return self.playlist_result(entries)

3545

3546

def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert a list of JWPlayer "sources" into a list of format dicts.

    Non-dict sources, sources without a URL and repeated URLs are skipped.
    """
    seen_urls = []
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in seen_urls:
            continue
        seen_urls.append(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if source_type == 'dash' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        height = int_or_none(source.get('height'))
        if height is None:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            height = int_or_none(self._search_regex(
                r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
                'height', default=None))
        a_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate'), scale=1000),
            'filesize': int_or_none(source.get('filesize')),
            'ext': ext,
        }
        if source_url.startswith('rtmp'):
            a_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            rtmp_url_parts = re.split(
                r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
            if len(rtmp_url_parts) == 3:
                rtmp_url, prefix, play_path = rtmp_url_parts
                a_format.update({
                    'url': rtmp_url,
                    'play_path': prefix + play_path,
                })
            if rtmp_params:
                a_format.update(rtmp_params)
        formats.append(a_format)
    return formats

3610

3611

def _live_title(self, name):

3612

self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')

3613

return name

3614

3615

def _int(self, v, name, fatal=False, **kwargs):
    """Parse v with int_or_none, reporting a failure under the given name.

    An unparsable value raises ExtractorError when fatal is true and
    only produces a warning (returning None) otherwise.
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return parsed

3624

3625

def _float(self, v, name, fatal=False, **kwargs):
    """Parse v with float_or_none, reporting a failure under the given name.

    An unparsable value raises ExtractorError when fatal is true and
    only produces a warning (returning None) otherwise.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return parsed

3634

3635

def _set_cookie(self, domain, name, value, expire_time=None, port=None,

3636

path='/', secure=False, discard=False, rest={}, **kwargs):

3637

cookie = http.cookiejar.Cookie(

3638

0, name, value, port, port is not None, domain, True,

3639

domain.startswith('.'), path, True, secure, expire_time,

3640

discard, None, None, rest)

3641

self.cookiejar.set_cookie(cookie)

3642

3643

def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    cookie_header = self._downloader._calc_cookies(url)
    return LenientSimpleCookie(cookie_header)

3646

3647

def _apply_first_set_cookie_header(self, url_handle, cookie):

3648

"""

3649

Apply first Set-Cookie header instead of the last. Experimental.

3650

3651

Some sites (e.g. [1-3]) may serve two cookies under the same name

3652

in Set-Cookie header and expect the first (old) one to be set rather

3653

than second (new). However, as of RFC6265 the newer one cookie

3654

should be set into cookie store what actually happens.

3655

We will workaround this issue by resetting the cookie to

3656

the first one manually.

3657

1. https://new.vk.com/

3658

2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201

3659

3. https://learning.oreilly.com/

3660

"""

3661

for header, cookies in url_handle.headers.items():

3662

if header.lower() != 'set-cookie':

3663

continue

3664

cookies = cookies.encode('iso-8859-1').decode('utf-8')

3665

cookie_value = re.search(

3666

r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)

3667

if cookie_value:

3668

value, domain = cookie_value.groups()

3669

self._set_cookie(domain, cookie, value)

break

@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the extractor's test cases, each tagged with the IE key as 'name'.

    Tests marked 'only_matching' are skipped unless include_onlymatching
    is true. The test dicts are modified in place.
    """
    single_test = getattr(cls, '_TEST', None)
    if single_test:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        tests = [single_test]
    else:
        tests = getattr(cls, '_TESTS', [])
    for test in tests:
        if test.get('only_matching', False) and not include_onlymatching:
            continue
        test['name'] = cls.ie_key()
        yield test

@classmethod
def get_webpage_testcases(cls):
    """Return the _WEBPAGE_TESTS list with each test tagged with the IE key as 'name'."""
    webpage_tests = getattr(cls, '_WEBPAGE_TESTS', [])
    for test in webpage_tests:
        test['name'] = cls.ie_key()
    return webpage_tests

@classproperty
def age_limit(cls):
    """Get age limit from the testcases"""
    all_tests = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # Look at each test's own info_dict and at that of its first playlist item
    age_limits = traverse_obj(
        all_tests, (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
    return max(age_limits or [0])

3699

3700

@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted

3704

3705

@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    parts = []
    if cls._NETRC_MACHINE:
        parts.append(
            f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' if markdown
            else f' [{cls._NETRC_MACHINE}]')
    if cls.IE_DESC is False:
        parts.append(' [HIDDEN]')
    elif cls.IE_DESC:
        parts.append(f' {cls.IE_DESC}')
    if cls.SEARCH_KEY:
        parts.append(f'; "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            parts.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
    if not cls.working():
        parts.append(' (**Currently broken**)' if markdown else ' (Currently broken)')

    desc = ''.join(parts)
    name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
    if not desc:
        return name
    return f'{name}:{desc}'

3728

3729

def extract_subtitles(self, *args, **kwargs):
    """Return subtitles via _get_subtitles, but only when subtitles are
    to be written or listed; otherwise return an empty dict."""
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)

3734

3735

def _get_subtitles(self, *args, **kwargs):

3736

raise NotImplementedError('This method must be implemented by subclasses')

3737

3738

def extract_comments(self, *args, **kwargs):
    """Return a callable that collects comments from _get_comments.

    Returns None when the 'getcomments' param is not set. The returned
    callable yields a dict with 'comments' and 'comment_count';
    'comment_count' is None unless the generator ran to completion.
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        finished = False
        try:
            while True:
                collected.append(next(generator))
        except StopIteration:
            finished = True
        except KeyboardInterrupt:
            self.to_screen('Interrupted by user')
        except Exception as e:
            # Errors are only swallowed when ignoreerrors is exactly True
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        comment_count = len(collected)
        self.to_screen(f'Extracted {comment_count} comments')
        return {
            'comments': collected,
            'comment_count': comment_count if finished else None
        }
    return extractor

def _get_comments(self, *args, **kwargs):

3766

raise NotImplementedError('This method must be implemented by subclasses')

3767

3768

@staticmethod

3769

def _merge_subtitle_items(subtitle_list1, subtitle_list2):

3770

""" Merge subtitle items for one language. Items with duplicated URLs/data

3771

will be dropped. """

3772

list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}

3773

ret = list(subtitle_list1)

3774

ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)

return ret

@classmethod

def _merge_subtitles(cls, *dicts, target=None):

3779

""" Merge subtitle dictionaries, language by language. """

if target is None:

target = {}

for d in dicts:

for lang, subs in d.items():

3784

target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)

3785

return target

3786

3787

def extract_automatic_captions(self, *args, **kwargs):
    """Return automatic captions via _get_automatic_captions, but only
    when they are to be written or subtitles are being listed;
    otherwise return an empty dict."""
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)

3792

3793

def _get_automatic_captions(self, *args, **kwargs):

3794

raise NotImplementedError('This method must be implemented by subclasses')

3795

3796

@functools.cached_property

3797

def _cookies_passed(self):

3798

"""Whether cookies have been passed to YoutubeDL"""

3799

return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None

3800

3801

def mark_watched(self, *args, **kwargs):
    """Call _mark_watched when the 'mark_watched' param is set and either
    login info is available or cookies have been passed."""
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)

3806

3807

def _mark_watched(self, *args, **kwargs):

3808

raise NotImplementedError('This method must be implemented by subclasses')

3809

3810

def geo_verification_headers(self):
    """Return a headers dict carrying the 'geo_verification_proxy' param as
    'Ytdl-request-proxy', or an empty dict when the param is not set."""
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}

@staticmethod

def _generic_id(url):

3819

return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

3820

3821

@staticmethod
def _generic_title(url):
    """Derive a title from a URL: its basename without extension, percent-decoded."""
    stem, _ = os.path.splitext(url_basename(url))
    return urllib.parse.unquote(stem)

3824

3825

@staticmethod

3826

def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):

3827

all_known = all(map(

3828

lambda x: x is not None,

3829

(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))

3830

return (

3831

'private' if is_private

3832

else 'premium_only' if needs_premium

3833

else 'subscriber_only' if needs_subscription

3834

else 'needs_auth' if needs_auth

3835

else 'unlisted' if is_unlisted

3836

else 'public' if all_known

3837

else None)

3838

3839

def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       IE key (string), or an object providing ie_key(), whose
                        arguments are looked up; defaults to this extractor
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]

3851

3852

def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):

3853

if not playlist_id or not video_id:

3854

return not video_id

3855

3856

no_playlist = (smuggled_data or {}).get('force_noplaylist')

3857

if no_playlist is not None:

3858

return not no_playlist

3859

3860

video_id = '' if video_id is True else f' {video_id}'

3861

playlist_id = '' if playlist_id is True else f' {playlist_id}'

3862

if self.get_param('noplaylist'):

3863

self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')

3864

return False

3865

self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')

3866

return True

3867

3868

def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Report err through RetryManager.report_retry using this extractor's
    output methods; non-fatal errors are reported as warnings."""
    error_reporter = None if fatal else self.report_warning
    sleep_functions = self.get_param('retry_sleep_functions', {})
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen, warn=self.report_warning, error=error_reporter,
        sleep_func=sleep_functions.get('extractor'))

3873

3874

def RetryManager(self, **kwargs):
    """Create a RetryManager using the 'extractor_retries' param (default 3)
    and this extractor's error reporting."""
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)

3876

3877

def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Run the Generic extractor's embed extraction on url, with this
    extractor's IE key smuggled along under 'block_ies'."""
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    smuggled_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return generic_ie._extract_embeds(smuggled_url, *args, **kwargs)

3882

3883

@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the info dicts extracted from a webpage, after adding the default extra info."""
    # A bound _extract_from_webpage can be called on the class itself;
    # otherwise an extractor instance is obtained from ydl
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    for info in ie._extract_from_webpage(url, webpage) or []:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info

@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for every distinct embed URL found in the webpage."""
    # Extractors with _VALID_URL set to False do not claim their results
    result_ie = None if cls._VALID_URL is False else cls
    embed_urls = orderedSet(
        cls._extract_embed_urls(url, webpage) or [], lazy=True)
    for embed_url in embed_urls:
        yield cls.url_result(embed_url, result_ie)

3897

3898

@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile the embed regexes once per class and cache them on the class
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(regex) for regex in cls._EMBED_REGEX)

    accept_any = cls._VALID_URL is False
    for embed_re in cls._EMBED_URL_RE:
        for mobj in embed_re.finditer(webpage):
            embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
            if accept_any or cls.suitable(embed_url):
                yield embed_url

3913

3914

class StopExtraction(Exception):
    """Exception type defined alongside the extractor; it adds nothing to Exception."""

@classmethod
def _extract_url(cls, webpage):  # TODO: Remove
    """Only for compatibility with some older extractors"""
    embed_urls = cls._extract_embed_urls(None, webpage) or []
    # First embed URL, or None when there is none
    return next(iter(embed_urls), None)

3921

3922

@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Register plugin subclasses in place of the extractor they extend.

    When plugin_name is given, the new class takes its IE name and key
    from the next class in its MRO and replaces the innermost wrapped
    class in that class's module namespace.
    """
    if plugin_name:
        mro = inspect.getmro(cls)
        wrapped = mro[mro.index(cls) + 1]
        cls.__wrapped__ = wrapped
        cls.IE_NAME = f'{wrapped.IE_NAME}+{plugin_name}'
        cls.ie_key = wrapped.ie_key
        # Follow the chain of wrappers down to the innermost class
        innermost = wrapped
        while getattr(innermost, '__wrapped__', None):
            innermost = innermost.__wrapped__
        setattr(sys.modules[innermost.__module__], innermost.__name__, cls)

    return super().__init_subclass__(**kwargs)

3933

3934

3935

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Dispatch on the count prefix: empty means 1 result, 'all' means
        _MAX_RESULTS, and a number is capped at _MAX_RESULTS."""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite n means "no upper bound" for islice
        stop = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, stop)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

3977

3978

3979

class UnsupportedURLIE(InfoExtractor):
    """Extractor matching any URL whose extraction always raises UnsupportedError."""

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        raise UnsupportedError(url)