import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import subprocess
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_os_name,
    urllib_req_to_req,
)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..downloader.hls import HlsFD
from ..networking import HEADRequest, Request
from ..networking.exceptions import (
    HTTPError,
    IncompleteRead,
    network_exceptions,
)
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    Popen,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    netrc_from_content,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present, it should be
                                 considered by a client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    True if the format has DRM and cannot be downloaded.
                                 'maybe' if the format may have DRM and has to be tested before download.
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. Only applied by the native HLS/DASH downloaders.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args  Extra arguments for ffmpeg downloader
                    * is_dash_periods  Whether the format is a result of merging
                                 multiple DASH periods.
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    release_year:   Year (YYYY) as integer when the video or album was released.
                    To be used if no exact release date is known.
                    If not explicitly set, calculated from release_date.
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    channel_is_verified: Whether the channel is verified on the platform.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users; the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of
                    "text" or "html" must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "author_url" - The url to the comment author's page
                        * "author_is_verified" - Whether the author is verified
                          on the platform
                        * "author_is_uploader" - Whether the comment is made by
                          the video uploader
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                          Set to "root" to indicate that this is a
                          comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                          favorite by the video uploader
                        * "is_pinned" - Whether the comment is pinned to
                          the top of the comments
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp, it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    heatmap:        A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the data point in seconds
                        * "end_time" - The end time of the data point in seconds
                        * "value" - The normalized value of the data point (float between 0 and 1)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc.).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
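
    For illustration only (this dict is a sketch, not the output of any
    particular extractor), a minimal "video" result could look like:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/video.mp4',
            'ext': 'mp4',
        }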


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
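
    For illustration, a minimal "playlist" result could look like (the
    values are made up):

        {
            '_type': 'playlist',
            'id': 'album-4321',
            'title': 'Some album',
            'entries': [video_result, another_video_result],
        }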


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
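
    For illustration, a "url" result for a hypothetical site could look like:

        {
            '_type': 'url',
            'url': 'https://example.com/watch/4234987',
            'ie_key': 'Example',
            'title': 'Dancing naked mole rats',
        }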


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this should also be added to the list of extractors and
    should define _VALID_URL as a regexp or a Sequence of regexps, and
    re-define the _real_extract() and (optionally) _real_initialize() methods.
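
    A minimal subclass could look like the following sketch (the site, the
    URL pattern and the helper choices are illustrative, not a real extractor):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }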

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.
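
    For illustration, assuming _NETRC_MACHINE = 'example', a matching
    ~/.netrc entry would look like:

        machine example login myuser password mypass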

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, provided the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.
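
    For example (illustrative values):

        _GEO_COUNTRIES = ['US', 'GB']
        _GEO_IP_BLOCKS = ['203.0.113.0/24']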

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize the geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
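
        Example (hypothetical values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })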
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.status) is True
        else:
            return err.status in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            self._downloader.deprecation_warning(
                'Passing a urllib.request.Request to _create_request() is deprecated. '
                'Use yt_dlp.networking.common.Request instead.')
            url_or_request = urllib_req_to_req(url_or_request)
        elif not isinstance(url_or_request, Request):
            url_or_request = Request(url_or_request)

        url_or_request.update(data=data, headers=headers, query=query)
        return url_or_request

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non-2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx),
            which are always accepted.
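
        For illustration, accepting 404 responses while still failing on
        other HTTP errors could look like:

            page, urlh = self._download_webpage_handle(
                url, video_id, expected_status=404)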
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, str):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.url, video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        elif errnote:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
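
    # For illustration, the generated methods are used like this
    # (hypothetical URL and video_id):
    #   data = self._download_json(
    #       'https://example.com/api/video/123', '123',
    #       note='Downloading video metadata')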

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 5
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=NO_DEFAULT):
        if partial is not NO_DEFAULT:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure, return a default value, report a warning, or raise a
        RegexNotFoundError, depending on fatal, specifying the field name.
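
        For illustration (hypothetical pattern and webpage):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)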
        """
        if string is None:
            mobj = None
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
        return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if isinstance(res, tuple):
            return tuple(map(clean_html, res))
        return clean_html(res)

    def _get_netrc_login_info(self, netrc_machine=None):
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        cmd = self.get_param('netrc_cmd')
        if cmd:
            cmd = cmd.replace('{}', netrc_machine)
            self.to_screen(f'Executing command: {cmd}')
            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
            if ret != 0:
                raise OSError(f'Command returned error code {ret}')
            info = netrc_from_content(stdout).authenticators(netrc_machine)

        elif self.get_param('usenetrc', False):
            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
            if os.path.isdir(netrc_file):
                netrc_file = os.path.join(netrc_file, '.netrc')
            info = netrc.netrc(netrc_file).authenticators(netrc_machine)

        else:
            return None, None
        if not info:
            raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
        return info[0], info[2]

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, try the netrc_cmd if it is defined, or look
        in the netrc file using the netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
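
        For illustration, a typical use inside an extractor:

            username, password = self._get_login_info()
            if not username:
                self.raise_login_required()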
        """

        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            try:
                username, password = self._get_netrc_login_info(netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(f'Failed to parse .netrc: {err}')
                return None, None
        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently this just uses the command line option.
        If there's no info available, return None
        """
1361
1362 tfa = self.get_param('twofactor')
1363 if tfa is not None:
1364 return tfa
1365
1366 return getpass.getpass('Type %s and press [Return]: ' % note)
1367
1368 # Helper functions for extracting OpenGraph info
1369 @staticmethod
1370 def _og_regexes(prop):
1371 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1372 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1373 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1374 template = r'<meta[^>]+?%s[^>]+?%s'
1375 return [
1376 template % (property_re, content_re),
1377 template % (content_re, property_re),
1378 ]
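# For illustration (not part of the original source), _og_regexes('title')
# yields patterns that match either attribute order, e.g. both
#     <meta property="og:title" content="Some title">
#     <meta content="Some title" name="og:title">
# capturing the value of the content attribute in either case.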
1379
1380 @staticmethod
1381 def _meta_regex(prop):
1382 return r'''(?isx)<meta
1383 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1384 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1385
1386 def _og_search_property(self, prop, html, name=None, **kargs):
1387 prop = variadic(prop)
1388 if name is None:
1389 name = 'OpenGraph %s' % prop[0]
1390 og_regexes = []
1391 for p in prop:
1392 og_regexes.extend(self._og_regexes(p))
1393 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1394 if escaped is None:
1395 return None
1396 return unescapeHTML(escaped)
1397
1398 def _og_search_thumbnail(self, html, **kargs):
1399 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1400
1401 def _og_search_description(self, html, **kargs):
1402 return self._og_search_property('description', html, fatal=False, **kargs)
1403
1404 def _og_search_title(self, html, *, fatal=False, **kargs):
1405 return self._og_search_property('title', html, fatal=fatal, **kargs)
1406
1407 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1408 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1409 if secure:
1410 regexes = self._og_regexes('video:secure_url') + regexes
1411 return self._html_search_regex(regexes, html, name, **kargs)
1412
1413 def _og_search_url(self, html, **kargs):
1414 return self._og_search_property('url', html, **kargs)
1415
1416 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1417 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1418
1419 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1420 name = variadic(name)
1421 if display_name is None:
1422 display_name = name[0]
1423 return self._html_search_regex(
1424 [self._meta_regex(n) for n in name],
1425 html, display_name, fatal=fatal, group='content', **kwargs)
1426
1427 def _dc_search_uploader(self, html):
1428 return self._html_search_meta('dc.creator', html, 'uploader')
1429
1430 @staticmethod
1431 def _rta_search(html):
1432 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1433 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1434 r' content="RTA-5042-1996-1400-1577-RTA"',
1435 html):
1436 return 18
1437
1438 # And then there are the jokers who advertise that they use RTA, but actually don't.
1439 AGE_LIMIT_MARKERS = [
1440 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1441 r'>[^<]*you acknowledge you are at least (\d+) years old',
1442 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1443 ]
1444
1445 age_limit = 0
1446 for marker in AGE_LIMIT_MARKERS:
1447 mobj = re.search(marker, html)
1448 if mobj:
1449 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1450 return age_limit
1451
1452 def _media_rating_search(self, html):
1453 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1454 rating = self._html_search_meta('rating', html)
1455
1456 if not rating:
1457 return None
1458
1459 RATING_TABLE = {
1460 'safe for kids': 0,
1461 'general': 8,
1462 '14 years': 14,
1463 'mature': 17,
1464 'restricted': 19,
1465 }
1466 return RATING_TABLE.get(rating.lower())
1467
1468 def _family_friendly_search(self, html):
1469 # See http://schema.org/VideoObject
1470 family_friendly = self._html_search_meta(
1471 'isFamilyFriendly', html, default=None)
1472
1473 if not family_friendly:
1474 return None
1475
1476 RATING_TABLE = {
1477 '1': 0,
1478 'true': 0,
1479 '0': 18,
1480 'false': 18,
1481 }
1482 return RATING_TABLE.get(family_friendly.lower())
1483
1484 def _twitter_search_player(self, html):
1485 return self._html_search_meta('twitter:player', html,
1486 'twitter card player')
1487
1488 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1489 """Yield all json ld objects in the html"""
1490 if default is not NO_DEFAULT:
1491 fatal = False
1492 for mobj in re.finditer(JSON_LD_RE, html):
1493 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1494 for json_ld in variadic(json_ld_item):
1495 if isinstance(json_ld, dict):
1496 yield json_ld
1497
1498 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1499 """Search for a video in any json ld in the html"""
1500 if default is not NO_DEFAULT:
1501 fatal = False
1502 info = self._json_ld(
1503 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1504 video_id, fatal=fatal, expected_type=expected_type)
1505 if info:
1506 return info
1507 if default is not NO_DEFAULT:
1508 return default
1509 elif fatal:
1510 raise RegexNotFoundError('Unable to extract JSON-LD')
1511 else:
1512 self.report_warning('Unable to extract JSON-LD %s' % bug_reports_message())
1513 return {}
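# Example (an illustrative sketch, not part of the original source):
# given a page containing
#     <script type="application/ld+json">
#     {"@context": "https://schema.org", "@type": "VideoObject",
#      "name": "Clip", "duration": "PT1M30S"}
#     </script>
# a call such as
#     info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
# would yield {'title': 'Clip', 'duration': 90.0, ...} via _json_ld() below.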
1514
1515 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1516 if isinstance(json_ld, str):
1517 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1518 if not json_ld:
1519 return {}
1520 info = {}
1521
1522 INTERACTION_TYPE_MAP = {
1523 'CommentAction': 'comment',
1524 'AgreeAction': 'like',
1525 'DisagreeAction': 'dislike',
1526 'LikeAction': 'like',
1527 'DislikeAction': 'dislike',
1528 'ListenAction': 'view',
1529 'WatchAction': 'view',
1530 'ViewAction': 'view',
1531 }
1532
1533 def is_type(e, *expected_types):
1534 type = variadic(traverse_obj(e, '@type'))
1535 return any(x in type for x in expected_types)
1536
1537 def extract_interaction_type(e):
1538 interaction_type = e.get('interactionType')
1539 if isinstance(interaction_type, dict):
1540 interaction_type = interaction_type.get('@type')
1541 return str_or_none(interaction_type)
1542
1543 def extract_interaction_statistic(e):
1544 interaction_statistic = e.get('interactionStatistic')
1545 if isinstance(interaction_statistic, dict):
1546 interaction_statistic = [interaction_statistic]
1547 if not isinstance(interaction_statistic, list):
1548 return
1549 for is_e in interaction_statistic:
1550 if not is_type(is_e, 'InteractionCounter'):
1551 continue
1552 interaction_type = extract_interaction_type(is_e)
1553 if not interaction_type:
1554 continue
1555 # Some sites provide the interaction count as a string with
1556 # non-digit characters (e.g. ",") instead of an integer (as per
1557 # spec), so extract the count with the more relaxed str_to_int
1558 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1559 if interaction_count is None:
1560 continue
1561 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1562 if not count_kind:
1563 continue
1564 count_key = '%s_count' % count_kind
1565 if info.get(count_key) is not None:
1566 continue
1567 info[count_key] = interaction_count
1568
1569 def extract_chapter_information(e):
1570 chapters = [{
1571 'title': part.get('name'),
1572 'start_time': part.get('startOffset'),
1573 'end_time': part.get('endOffset'),
1574 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1575 for idx, (last_c, current_c, next_c) in enumerate(zip(
1576 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1577 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1578 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1579 if None in current_c.values():
1580 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1581 return
1582 if chapters:
1583 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1584 info['chapters'] = chapters
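# Illustrative example (not part of the original source): for hasPart
# clips [{'name': 'Intro', 'startOffset': 0}, {'name': 'Main', 'startOffset': 60}],
# the zip above fills each missing end_time from the next chapter's
# start_time (and the last chapter's end_time from info['duration']).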
1585
1586 def extract_video_object(e):
1587 author = e.get('author')
1588 info.update({
1589 'url': url_or_none(e.get('contentUrl')),
1590 'ext': mimetype2ext(e.get('encodingFormat')),
1591 'title': unescapeHTML(e.get('name')),
1592 'description': unescapeHTML(e.get('description')),
1593 'thumbnails': [{'url': unescapeHTML(url)}
1594 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1595 if url_or_none(url)],
1596 'duration': parse_duration(e.get('duration')),
1597 'timestamp': unified_timestamp(e.get('uploadDate')),
1598 # author can be an instance of the 'Organization' or 'Person' types;
1599 # both have a 'name' property (inherited from the 'Thing' type). [1]
1600 # However, some websites use a plain 'Text' value instead.
1601 # 1. https://schema.org/VideoObject
1602 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1603 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1604 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1605 'tbr': int_or_none(e.get('bitrate')),
1606 'width': int_or_none(e.get('width')),
1607 'height': int_or_none(e.get('height')),
1608 'view_count': int_or_none(e.get('interactionCount')),
1609 'tags': try_call(lambda: e.get('keywords').split(',')),
1610 })
1611 if is_type(e, 'AudioObject'):
1612 info.update({
1613 'vcodec': 'none',
1614 'abr': int_or_none(e.get('bitrate')),
1615 })
1616 extract_interaction_statistic(e)
1617 extract_chapter_information(e)
1618
1619 def traverse_json_ld(json_ld, at_top_level=True):
1620 for e in variadic(json_ld):
1621 if not isinstance(e, dict):
1622 continue
1623 if at_top_level and '@context' not in e:
1624 continue
1625 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1626 traverse_json_ld(e['@graph'], at_top_level=False)
1627 continue
1628 if expected_type is not None and not is_type(e, expected_type):
1629 continue
1630 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1631 if rating is not None:
1632 info['average_rating'] = rating
1633 if is_type(e, 'TVEpisode', 'Episode'):
1634 episode_name = unescapeHTML(e.get('name'))
1635 info.update({
1636 'episode': episode_name,
1637 'episode_number': int_or_none(e.get('episodeNumber')),
1638 'description': unescapeHTML(e.get('description')),
1639 })
1640 if not info.get('title') and episode_name:
1641 info['title'] = episode_name
1642 part_of_season = e.get('partOfSeason')
1643 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1644 info.update({
1645 'season': unescapeHTML(part_of_season.get('name')),
1646 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1647 })
1648 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1649 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1650 info['series'] = unescapeHTML(part_of_series.get('name'))
1651 elif is_type(e, 'Movie'):
1652 info.update({
1653 'title': unescapeHTML(e.get('name')),
1654 'description': unescapeHTML(e.get('description')),
1655 'duration': parse_duration(e.get('duration')),
1656 'timestamp': unified_timestamp(e.get('dateCreated')),
1657 })
1658 elif is_type(e, 'Article', 'NewsArticle'):
1659 info.update({
1660 'timestamp': parse_iso8601(e.get('datePublished')),
1661 'title': unescapeHTML(e.get('headline')),
1662 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1663 })
1664 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1665 extract_video_object(e['video'][0])
1666 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1667 extract_video_object(e['subjectOf'][0])
1668 elif is_type(e, 'VideoObject', 'AudioObject'):
1669 extract_video_object(e)
1670 if expected_type is None:
1671 continue
1672 else:
1673 break
1674 video = e.get('video')
1675 if is_type(video, 'VideoObject'):
1676 extract_video_object(video)
1677 if expected_type is None:
1678 continue
1679 else:
1680 break
1681
1682 traverse_json_ld(json_ld)
1683 return filter_dict(info)
1684
1685 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1686 return self._parse_json(
1687 self._search_regex(
1688 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1689 webpage, 'next.js data', fatal=fatal, **kw),
1690 video_id, transform_source=transform_source, fatal=fatal)
1691
1692 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1693 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1694 rectx = re.escape(context_name)
1695 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1696 js, arg_keys, arg_vals = self._search_regex(
1697 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1698 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1699 default=NO_DEFAULT if fatal else (None, None, None))
1700 if js is None:
1701 return {}
1702
1703 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1704 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1705
1706 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1707 return traverse_obj(ret, traverse) or {}
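# Example (an illustrative sketch, not part of the original source): for a
# page embedding
#     <script>window.__NUXT__=(function(a,b){return {data:[{video:{url:a,id:b}}]}}("https://cdn.example.com/v.mp4","123"));</script>
# the regex captures the function body and its arguments, the argument
# values are substituted into the returned object literal via
# js_to_json(vars=...), and the default traverse=('data', 0) yields
# {'video': {'url': 'https://cdn.example.com/v.mp4', 'id': '123'}}.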
1708
1709 @staticmethod
1710 def _hidden_inputs(html):
1711 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1712 hidden_inputs = {}
1713 for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1714 attrs = extract_attributes(input_el)
1715 if not attrs:
1716 continue
1717 if attrs.get('type') not in ('hidden', 'submit'):
1718 continue
1719 name = attrs.get('name') or attrs.get('id')
1720 value = attrs.get('value')
1721 if name and value is not None:
1722 hidden_inputs[name] = value
1723 return hidden_inputs
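# For illustration (not part of the original source):
#     _hidden_inputs('<input type="hidden" name="csrf" value="abc123">')
# returns {'csrf': 'abc123'}, while visible inputs and inputs lacking a
# name/id or value are skipped.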
1724
1725 def _form_hidden_inputs(self, form_id, html):
1726 form = self._search_regex(
1727 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1728 html, '%s form' % form_id, group='form')
1729 return self._hidden_inputs(form)
1730
1731 @classproperty(cache=True)
1732 def FormatSort(cls):
1733 class FormatSort(FormatSorter):
1734 def __init__(ie, *args, **kwargs):
1735 super().__init__(ie._downloader, *args, **kwargs)
1736
1737 deprecation_warning(
1738 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1739 'Use yt_dlp.utils.FormatSorter instead')
1740 return FormatSort
1741
1742 def _sort_formats(self, formats, field_preference=[]):
1743 if not field_preference:
1744 self._downloader.deprecation_warning(
1745 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1746 return
1747 self._downloader.deprecation_warning(
1748 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1749 'Return _format_sort_fields in the info_dict instead')
1750 if formats:
1751 formats[0]['__sort_fields'] = field_preference
1752
1753 def _check_formats(self, formats, video_id):
1754 if formats:
1755 formats[:] = filter(
1756 lambda f: self._is_valid_url(
1757 f['url'], video_id,
1758 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1759 formats)
1760
1761 @staticmethod
1762 def _remove_duplicate_formats(formats):
1763 format_urls = set()
1764 unique_formats = []
1765 for f in formats:
1766 if f['url'] not in format_urls:
1767 format_urls.add(f['url'])
1768 unique_formats.append(f)
1769 formats[:] = unique_formats
1770
1771 def _is_valid_url(self, url, video_id, item='video', headers={}):
1772 url = self._proto_relative_url(url, scheme='http:')
1773 # For now assume non HTTP(S) URLs always valid
1774 if not (url.startswith('http://') or url.startswith('https://')):
1775 return True
1776 try:
1777 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1778 return True
1779 except ExtractorError as e:
1780 self.to_screen(
1781 '%s: %s URL is invalid, skipping: %s'
1782 % (video_id, item, error_to_compat_str(e.cause)))
1783 return False
1784
1785 def http_scheme(self):
1786 """ Either "http:" or "https:", depending on the user's preferences """
1787 return (
1788 'http:'
1789 if self.get_param('prefer_insecure', False)
1790 else 'https:')
1791
1792 def _proto_relative_url(self, url, scheme=None):
1793 scheme = scheme or self.http_scheme()
1794 assert scheme.endswith(':')
1795 return sanitize_url(url, scheme=scheme[:-1])
1796
1797 def _sleep(self, timeout, video_id, msg_template=None):
1798 if msg_template is None:
1799 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1800 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1801 self.to_screen(msg)
1802 time.sleep(timeout)
1803
1804 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1805 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1806 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1807 if self.get_param('ignore_no_formats_error'):
1808 fatal = False
1809
1810 res = self._download_xml_handle(
1811 manifest_url, video_id, 'Downloading f4m manifest',
1812 'Unable to download f4m manifest',
1813 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1814 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1815 transform_source=transform_source,
1816 fatal=fatal, data=data, headers=headers, query=query)
1817 if res is False:
1818 return []
1819
1820 manifest, urlh = res
1821 manifest_url = urlh.url
1822
1823 return self._parse_f4m_formats(
1824 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1825 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1826
1827 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1828 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1829 fatal=True, m3u8_id=None):
1830 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1831 return []
1832
1833 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1834 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1835 if akamai_pv is not None and ';' in akamai_pv.text:
1836 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1837 if playerVerificationChallenge.strip() != '':
1838 return []
1839
1840 formats = []
1841 manifest_version = '1.0'
1842 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1843 if not media_nodes:
1844 manifest_version = '2.0'
1845 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1846 # Remove unsupported DRM protected media from final formats
1847 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1848 media_nodes = remove_encrypted_media(media_nodes)
1849 if not media_nodes:
1850 return formats
1851
1852 manifest_base_url = get_base_url(manifest)
1853
1854 bootstrap_info = xpath_element(
1855 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1856 'bootstrap info', default=None)
1857
1858 vcodec = None
1859 mime_type = xpath_text(
1860 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1861 'base URL', default=None)
1862 if mime_type and mime_type.startswith('audio/'):
1863 vcodec = 'none'
1864
1865 for i, media_el in enumerate(media_nodes):
1866 tbr = int_or_none(media_el.attrib.get('bitrate'))
1867 width = int_or_none(media_el.attrib.get('width'))
1868 height = int_or_none(media_el.attrib.get('height'))
1869 format_id = join_nonempty(f4m_id, tbr or i)
1870 # If <bootstrapInfo> is present, the specified f4m is a
1871 # stream-level manifest, and only set-level manifests may refer to
1872 # external resources. See section 11.4 and section 4 of F4M spec
1873 if bootstrap_info is None:
1874 media_url = None
1875 # @href is introduced in 2.0, see section 11.6 of F4M spec
1876 if manifest_version == '2.0':
1877 media_url = media_el.attrib.get('href')
1878 if media_url is None:
1879 media_url = media_el.attrib.get('url')
1880 if not media_url:
1881 continue
1882 manifest_url = (
1883 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1884 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1885 # If media_url is itself an f4m manifest, do the recursive extraction,
1886 # since the bitrates in the parent manifest (this one) and the media_url
1887 # manifest may differ, making it impossible to resolve the format by the
1888 # requested bitrate in the f4m downloader
1889 ext = determine_ext(manifest_url)
1890 if ext == 'f4m':
1891 f4m_formats = self._extract_f4m_formats(
1892 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1893 transform_source=transform_source, fatal=fatal)
1894 # Sometimes a stream-level manifest contains a single media entry that
1895 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player),
1896 # while the parent's media entry in the set-level manifest may
1897 # contain it. We copy it from the parent in such cases.
1898 if len(f4m_formats) == 1:
1899 f = f4m_formats[0]
1900 f.update({
1901 'tbr': f.get('tbr') or tbr,
1902 'width': f.get('width') or width,
1903 'height': f.get('height') or height,
1904 'format_id': f.get('format_id') if not tbr else format_id,
1905 'vcodec': vcodec,
1906 })
1907 formats.extend(f4m_formats)
1908 continue
1909 elif ext == 'm3u8':
1910 formats.extend(self._extract_m3u8_formats(
1911 manifest_url, video_id, 'mp4', preference=preference,
1912 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1913 continue
1914 formats.append({
1915 'format_id': format_id,
1916 'url': manifest_url,
1917 'manifest_url': manifest_url,
1918 'ext': 'flv' if bootstrap_info is not None else None,
1919 'protocol': 'f4m',
1920 'tbr': tbr,
1921 'width': width,
1922 'height': height,
1923 'vcodec': vcodec,
1924 'preference': preference,
1925 'quality': quality,
1926 })
1927 return formats
1928
1929 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1930 return {
1931 'format_id': join_nonempty(m3u8_id, 'meta'),
1932 'url': m3u8_url,
1933 'ext': ext,
1934 'protocol': 'm3u8',
1935 'preference': preference - 100 if preference else -100,
1936 'quality': quality,
1937 'resolution': 'multiple',
1938 'format_note': 'Quality selection URL',
1939 }
1940
1941 def _report_ignoring_subs(self, name):
1942 self.report_warning(bug_reports_message(
1943 f'Ignoring subtitle tracks found in the {name} manifest; '
1944 'if any subtitle tracks are missing,'
1945 ), only_once=True)
1946
1947 def _extract_m3u8_formats(self, *args, **kwargs):
1948 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1949 if subs:
1950 self._report_ignoring_subs('HLS')
1951 return fmts
1952
1953 def _extract_m3u8_formats_and_subtitles(
1954 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1955 preference=None, quality=None, m3u8_id=None, note=None,
1956 errnote=None, fatal=True, live=False, data=None, headers={},
1957 query={}):
1958
1959 if self.get_param('ignore_no_formats_error'):
1960 fatal = False
1961
1962 if not m3u8_url:
1963 if errnote is not False:
1964 errnote = errnote or 'Failed to obtain m3u8 URL'
1965 if fatal:
1966 raise ExtractorError(errnote, video_id=video_id)
1967 self.report_warning(f'{errnote}{bug_reports_message()}')
1968 return [], {}
1969
1970 res = self._download_webpage_handle(
1971 m3u8_url, video_id,
1972 note='Downloading m3u8 information' if note is None else note,
1973 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1974 fatal=fatal, data=data, headers=headers, query=query)
1975
1976 if res is False:
1977 return [], {}
1978
1979 m3u8_doc, urlh = res
1980 m3u8_url = urlh.url
1981
1982 return self._parse_m3u8_formats_and_subtitles(
1983 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1984 preference=preference, quality=quality, m3u8_id=m3u8_id,
1985 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1986 headers=headers, query=query, video_id=video_id)
1987
1988 def _parse_m3u8_formats_and_subtitles(
1989 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1990 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1991 errnote=None, fatal=True, data=None, headers={}, query={},
1992 video_id=None):
1993 formats, subtitles = [], {}
1994 has_drm = HlsFD._has_drm(m3u8_doc)
1995
1996 def format_url(url):
1997 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1998
1999 if self.get_param('hls_split_discontinuity', False):
2000 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2001 if not m3u8_doc:
2002 if not manifest_url:
2003 return []
2004 m3u8_doc = self._download_webpage(
2005 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2006 note=False, errnote='Failed to download m3u8 playlist information')
2007 if m3u8_doc is False:
2008 return []
2009 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2010
2011 else:
2012 def _extract_m3u8_playlist_indices(*args, **kwargs):
2013 return [None]
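# For illustration (not part of the original source): with
# --hls-split-discontinuity, a media playlist containing two
# #EXT-X-DISCONTINUITY tags yields range(3), i.e. three formats with
# format_index 0, 1 and 2, one per discontinuity-delimited section;
# otherwise a single format with format_index=None is produced.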
2014
2015 # References:
2016 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2017 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2018 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2019
2020 # We should try extracting formats only from master playlists [1, 4.3.4],
2021 # i.e. playlists that describe the available qualities. On the other hand,
2022 # media playlists [1, 4.3.3] should be returned as is, since they contain
2023 # just the media without quality renditions.
2024 # Fortunately, a master playlist can easily be distinguished from a media
2025 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2026 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2027 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2028 # media playlist and MUST NOT appear in a master playlist, so we can
2029 # reliably detect a media playlist with this criterion.
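# For illustration (not part of the original source), a media playlist
# looks like
#     #EXTM3U
#     #EXT-X-TARGETDURATION:10
#     #EXTINF:9.009,
#     segment0.ts
# whereas a master playlist enumerates variants instead:
#     #EXTM3U
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#     low/index.m3u8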
2030
2031 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2032 formats = [{
2033 'format_id': join_nonempty(m3u8_id, idx),
2034 'format_index': idx,
2035 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2036 'ext': ext,
2037 'protocol': entry_protocol,
2038 'preference': preference,
2039 'quality': quality,
2040 'has_drm': has_drm,
2041 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2042
2043 return formats, subtitles
2044
2045 groups = {}
2046 last_stream_inf = {}
2047
2048 def extract_media(x_media_line):
2049 media = parse_m3u8_attributes(x_media_line)
2050 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2051 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2052 if not (media_type and group_id and name):
2053 return
2054 groups.setdefault(group_id, []).append(media)
2055 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2056 if media_type == 'SUBTITLES':
2057 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2058 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2059 # However, lack of URI has been spotted in the wild.
2060 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2061 if not media.get('URI'):
2062 return
2063 url = format_url(media['URI'])
2064 sub_info = {
2065 'url': url,
2066 'ext': determine_ext(url),
2067 }
2068 if sub_info['ext'] == 'm3u8':
2069 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2070 # files may contain is WebVTT:
2071 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2072 sub_info['ext'] = 'vtt'
2073 sub_info['protocol'] = 'm3u8_native'
2074 lang = media.get('LANGUAGE') or 'und'
2075 subtitles.setdefault(lang, []).append(sub_info)
2076 if media_type not in ('VIDEO', 'AUDIO'):
2077 return
2078 media_url = media.get('URI')
2079 if media_url:
2080 manifest_url = format_url(media_url)
2081 formats.extend({
2082 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2083 'format_note': name,
2084 'format_index': idx,
2085 'url': manifest_url,
2086 'manifest_url': m3u8_url,
2087 'language': media.get('LANGUAGE'),
2088 'ext': ext,
2089 'protocol': entry_protocol,
2090 'preference': preference,
2091 'quality': quality,
2092 'has_drm': has_drm,
2093 'vcodec': 'none' if media_type == 'AUDIO' else None,
2094 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2095
2096 def build_stream_name():
2097 # Although the specification does not mention a NAME attribute for
2098 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2099 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2100 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2101 stream_name = last_stream_inf.get('NAME')
2102 if stream_name:
2103 return stream_name
2104 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2105 # from the corresponding rendition group
2106 stream_group_id = last_stream_inf.get('VIDEO')
2107 if not stream_group_id:
2108 return
2109 stream_group = groups.get(stream_group_id)
2110 if not stream_group:
2111 return stream_group_id
2112 rendition = stream_group[0]
2113 return rendition.get('NAME') or stream_group_id
2114
2115 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
2116 # formats can be detected even when EXT-X-STREAM-INF tags
2117 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2118 for line in m3u8_doc.splitlines():
2119 if line.startswith('#EXT-X-MEDIA:'):
2120 extract_media(line)
2121
2122 for line in m3u8_doc.splitlines():
2123 if line.startswith('#EXT-X-STREAM-INF:'):
2124 last_stream_inf = parse_m3u8_attributes(line)
2125 elif line.startswith('#') or not line.strip():
2126 continue
2127 else:
2128 tbr = float_or_none(
2129 last_stream_inf.get('AVERAGE-BANDWIDTH')
2130 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2131 manifest_url = format_url(line.strip())
2132
2133 for idx in _extract_m3u8_playlist_indices(manifest_url):
2134 format_id = [m3u8_id, None, idx]
2135 # The bandwidth of live streams may vary over time, making the
2136 # format_id unpredictable, so it's better to keep the provided
2137 # format_id intact.
2138 if not live:
2139 stream_name = build_stream_name()
2140 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2141 f = {
2142 'format_id': join_nonempty(*format_id),
2143 'format_index': idx,
2144 'url': manifest_url,
2145 'manifest_url': m3u8_url,
2146 'tbr': tbr,
2147 'ext': ext,
2148 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2149 'protocol': entry_protocol,
2150 'preference': preference,
2151 'quality': quality,
2152 'has_drm': has_drm,
2153 }
2154 resolution = last_stream_inf.get('RESOLUTION')
2155 if resolution:
2156 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2157 if mobj:
2158 f['width'] = int(mobj.group('width'))
2159 f['height'] = int(mobj.group('height'))
2160 # Unified Streaming Platform
2161 mobj = re.search(
2162 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2163 if mobj:
2164 abr, vbr = mobj.groups()
2165 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2166 f.update({
2167 'vbr': vbr,
2168 'abr': abr,
2169 })
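# For illustration (not part of the original source), a USP URL such as
# ...-audio%3D128000-video%3D1500000.m3u8 yields abr=128.0 and
# vbr=1500.0 (kbps) from the regex above.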
2170 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2171 f.update(codecs)
2172 audio_group_id = last_stream_inf.get('AUDIO')
2173 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2174 # references a rendition group MUST have a CODECS attribute.
2175 # However, this is not always respected. E.g. [2]
2176 # contains EXT-X-STREAM-INF tag which references AUDIO
2177 # rendition group but does not have CODECS and despite
2178 # referencing an audio group it represents a complete
2179 # (with audio and video) format. So, for such cases we will
2180 # ignore references to rendition groups and treat them
2181 # as complete formats.
2182 if audio_group_id and codecs and f.get('vcodec') != 'none':
2183 audio_group = groups.get(audio_group_id)
2184 if audio_group and audio_group[0].get('URI'):
2185 # TODO: update acodec for audio only formats with
2186 # the same GROUP-ID
2187 f['acodec'] = 'none'
2188 if not f.get('ext'):
2189 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2190 formats.append(f)
2191
2192 # for DailyMotion
2193 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2194 if progressive_uri:
2195 http_f = f.copy()
2196 del http_f['manifest_url']
2197 http_f.update({
2198 'format_id': f['format_id'].replace('hls-', 'http-'),
2199 'protocol': 'http',
2200 'url': progressive_uri,
2201 })
2202 formats.append(http_f)
2203
2204 last_stream_inf = {}
2205 return formats, subtitles
2206
2207 def _extract_m3u8_vod_duration(
2208 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2209
2210 m3u8_vod = self._download_webpage(
2211 m3u8_vod_url, video_id,
2212 note='Downloading m3u8 VOD manifest' if note is None else note,
2213 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2214 fatal=False, data=data, headers=headers, query=query)
2215
2216 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2217
2218 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2219 if '#EXT-X-ENDLIST' not in m3u8_vod:
2220 return None
2221
2222 return int(sum(
2223 float(line[len('#EXTINF:'):].split(',')[0])
2224 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
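# For illustration (not part of the original source), a VOD playlist with
#     #EXTINF:10.0,
#     #EXTINF:9.5,
#     #EXT-X-ENDLIST
# yields int(10.0 + 9.5) == 19, while live playlists (no #EXT-X-ENDLIST)
# return None.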
2225
2226 def _extract_mpd_vod_duration(
2227 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2228
2229 mpd_doc = self._download_xml(
2230 mpd_url, video_id,
2231 note='Downloading MPD VOD manifest' if note is None else note,
2232 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2233 fatal=False, data=data, headers=headers, query=query)
2234 if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2235 return None
2236 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2237
2238 @staticmethod
2239 def _xpath_ns(path, namespace=None):
2240 if not namespace:
2241 return path
2242 out = []
2243 for c in path.split('/'):
2244 if not c or c == '.':
2245 out.append(c)
2246 else:
2247 out.append('{%s}%s' % (namespace, c))
2248 return '/'.join(out)
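# For illustration (not part of the original source):
#     _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta',
# the Clark notation that ElementTree expects for namespaced lookups.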
2249
2250 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2251 if self.get_param('ignore_no_formats_error'):
2252 fatal = False
2253
2254 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2255 if res is False:
2256 assert not fatal
2257 return [], {}
2258 smil, urlh = res
2259
2260 return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2261 namespace=self._parse_smil_namespace(smil))
2262
2263 def _extract_smil_formats(self, *args, **kwargs):
2264 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2265 if subs:
2266 self._report_ignoring_subs('SMIL')
2267 return fmts
2268
2269 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2270 res = self._download_smil(smil_url, video_id, fatal=fatal)
2271 if res is False:
2272 return {}
2273
2274 smil, urlh = res
2275 smil_url = urlh.url
2276
2277 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2278
2279 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2280 return self._download_xml_handle(
2281 smil_url, video_id, 'Downloading SMIL file',
2282 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2283
2284 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2285 namespace = self._parse_smil_namespace(smil)
2286
2287 formats, subtitles = self._parse_smil_formats_and_subtitles(
2288 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2289
2290 video_id = os.path.splitext(url_basename(smil_url))[0]
2291 title = None
2292 description = None
2293 upload_date = None
2294 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2295 name = meta.attrib.get('name')
2296 content = meta.attrib.get('content')
2297 if not name or not content:
2298 continue
2299 if not title and name == 'title':
2300 title = content
2301 elif not description and name in ('description', 'abstract'):
2302 description = content
2303 elif not upload_date and name == 'date':
2304 upload_date = unified_strdate(content)
2305
2306 thumbnails = [{
2307 'id': image.get('type'),
2308 'url': image.get('src'),
2309 'width': int_or_none(image.get('width')),
2310 'height': int_or_none(image.get('height')),
2311 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2312
2313 return {
2314 'id': video_id,
2315 'title': title or video_id,
2316 'description': description,
2317 'upload_date': upload_date,
2318 'thumbnails': thumbnails,
2319 'formats': formats,
2320 'subtitles': subtitles,
2321 }
2322
2323 def _parse_smil_namespace(self, smil):
2324 return self._search_regex(
2325 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2326
2327 def _parse_smil_formats(self, *args, **kwargs):
2328 fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2329 if subs:
2330 self._report_ignoring_subs('SMIL')
2331 return fmts
2332
2333 def _parse_smil_formats_and_subtitles(
2334 self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2335 base = smil_url
2336 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2337 b = meta.get('base') or meta.get('httpBase')
2338 if b:
2339 base = b
2340 break
2341
2342 formats, subtitles = [], {}
2343 rtmp_count = 0
2344 http_count = 0
2345 m3u8_count = 0
2346 imgs_count = 0
2347
2348 srcs = set()
2349 media = itertools.chain.from_iterable(
2350 smil.findall(self._xpath_ns(arg, namespace))
2351 for arg in ['.//video', './/audio', './/media'])
2352 for medium in media:
2353 src = medium.get('src')
2354 if not src or src in srcs:
2355 continue
2356 srcs.add(src)
2357
2358 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2359 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2360 width = int_or_none(medium.get('width'))
2361 height = int_or_none(medium.get('height'))
2362 proto = medium.get('proto')
2363 ext = medium.get('ext')
2364 src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2365 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2366 streamer = medium.get('streamer') or base
2367
2368 if proto == 'rtmp' or streamer.startswith('rtmp'):
2369 rtmp_count += 1
2370 formats.append({
2371 'url': streamer,
2372 'play_path': src,
2373 'ext': 'flv',
2374 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2375 'tbr': bitrate,
2376 'filesize': filesize,
2377 'width': width,
2378 'height': height,
2379 })
2380 if transform_rtmp_url:
2381 streamer, src = transform_rtmp_url(streamer, src)
2382 formats[-1].update({
2383 'url': streamer,
2384 'play_path': src,
2385 })
2386 continue
2387
2388 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2389 src_url = src_url.strip()
2390
2391 if proto == 'm3u8' or src_ext == 'm3u8':
2392 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2393 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2394 self._merge_subtitles(m3u8_subs, target=subtitles)
2395 if len(m3u8_formats) == 1:
2396 m3u8_count += 1
2397 m3u8_formats[0].update({
2398 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2399 'tbr': bitrate,
2400 'width': width,
2401 'height': height,
2402 })
2403 formats.extend(m3u8_formats)
2404 elif src_ext == 'f4m':
2405 f4m_url = src_url
2406 if not f4m_params:
2407 f4m_params = {
2408 'hdcore': '3.2.0',
2409 'plugin': 'flowplayer-3.2.0.1',
2410 }
2411 f4m_url += '&' if '?' in f4m_url else '?'
2412 f4m_url += urllib.parse.urlencode(f4m_params)
2413 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2414 elif src_ext == 'mpd':
2415 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2416 src_url, video_id, mpd_id='dash', fatal=False)
2417 formats.extend(mpd_formats)
2418 self._merge_subtitles(mpd_subs, target=subtitles)
2419 elif re.search(r'\.ism/[Mm]anifest', src_url):
2420 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2421 src_url, video_id, ism_id='mss', fatal=False)
2422 formats.extend(ism_formats)
2423 self._merge_subtitles(ism_subs, target=subtitles)
2424 elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):  # validate the resolved URL, not the possibly relative src
2425 http_count += 1
2426 formats.append({
2427 'url': src_url,
2428 'ext': ext or src_ext or 'flv',
2429 'format_id': 'http-%d' % (bitrate or http_count),
2430 'tbr': bitrate,
2431 'filesize': filesize,
2432 'width': width,
2433 'height': height,
2434 })
2435
2436 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2437 src = medium.get('src')
2438 if not src or src in srcs:
2439 continue
2440 srcs.add(src)
2441
2442 imgs_count += 1
2443 formats.append({
2444 'format_id': 'imagestream-%d' % (imgs_count),
2445 'url': src,
2446 'ext': mimetype2ext(medium.get('type')),
2447 'acodec': 'none',
2448 'vcodec': 'none',
2449 'width': int_or_none(medium.get('width')),
2450 'height': int_or_none(medium.get('height')),
2451 'format_note': 'SMIL storyboards',
2452 })
2453
2454 smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2455 self._merge_subtitles(smil_subs, target=subtitles)
2456
2457 return formats, subtitles
2458
2459 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2460 urls = []
2461 subtitles = {}
2462 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2463 src = textstream.get('src')
2464 if not src or src in urls:
2465 continue
2466 urls.append(src)
2467 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2468 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2469 subtitles.setdefault(lang, []).append({
2470 'url': src,
2471 'ext': ext,
2472 })
2473 return subtitles
2474
2475 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2476 res = self._download_xml_handle(
2477 xspf_url, playlist_id, 'Downloading xspf playlist',
2478 'Unable to download xspf manifest', fatal=fatal)
2479 if res is False:
2480 return []
2481
2482 xspf, urlh = res
2483 xspf_url = urlh.url
2484
2485 return self._parse_xspf(
2486 xspf, playlist_id, xspf_url=xspf_url,
2487 xspf_base_url=base_url(xspf_url))
2488
2489 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2490 NS_MAP = {
2491 'xspf': 'http://xspf.org/ns/0/',
2492 's1': 'http://static.streamone.nl/player/ns/0',
2493 }
2494
2495 entries = []
2496 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2497 title = xpath_text(
2498 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2499 description = xpath_text(
2500 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2501 thumbnail = xpath_text(
2502 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2503 duration = float_or_none(
2504 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2505
2506 formats = []
2507 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2508 format_url = urljoin(xspf_base_url, location.text)
2509 if not format_url:
2510 continue
2511 formats.append({
2512 'url': format_url,
2513 'manifest_url': xspf_url,
2514 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2515 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2516 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2517 })
2518
2519 entries.append({
2520 'id': playlist_id,
2521 'title': title,
2522 'description': description,
2523 'thumbnail': thumbnail,
2524 'duration': duration,
2525 'formats': formats,
2526 })
2527 return entries
2528
2529 def _extract_mpd_formats(self, *args, **kwargs):
2530 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2531 if subs:
2532 self._report_ignoring_subs('DASH')
2533 return fmts
2534
2535 def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2536 periods = self._extract_mpd_periods(*args, **kwargs)
2537 return self._merge_mpd_periods(periods)
2538
2539 def _extract_mpd_periods(
2540 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2541 fatal=True, data=None, headers={}, query={}):
2542
2543 if self.get_param('ignore_no_formats_error'):
2544 fatal = False
2545
2546 res = self._download_xml_handle(
2547 mpd_url, video_id,
2548 note='Downloading MPD manifest' if note is None else note,
2549 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2550 fatal=fatal, data=data, headers=headers, query=query)
2551 if res is False:
2552 return []
2553 mpd_doc, urlh = res
2554 if mpd_doc is None:
2555 return []
2556
2557 # We may have been redirected to a new URL when we retrieved our MPD file.
2558 mpd_url = urlh.url
2559 mpd_base_url = base_url(mpd_url)
2560
2561 return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2562
2563 def _parse_mpd_formats(self, *args, **kwargs):
2564 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2565 if subs:
2566 self._report_ignoring_subs('DASH')
2567 return fmts
2568
2569 def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2570 periods = self._parse_mpd_periods(*args, **kwargs)
2571 return self._merge_mpd_periods(periods)
2572
2573 def _merge_mpd_periods(self, periods):
2574 """
2575 Combine all formats and subtitles from an MPD manifest into a single list,
2576 by concatenating streams with similar formats.
2577 """
2578 formats, subtitles = {}, {}
2579 for period in periods:
2580 for f in period['formats']:
2581 assert 'is_dash_periods' not in f, 'format already processed'
2582 f['is_dash_periods'] = True
2583 format_key = tuple(v for k, v in f.items() if k not in (
2584 'format_id', 'fragments', 'manifest_stream_number'))
2585 if format_key not in formats:
2586 formats[format_key] = f
2587 elif 'fragments' in f:
2588 formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2589
2590 if subtitles and period['subtitles']:
2591 self.report_warning(bug_reports_message(
2592 'Found subtitles in multiple periods in the DASH manifest; '
2593 'if some of the subtitles are missing,'
2594 ), only_once=True)
2595
2596 for sub_lang, sub_info in period['subtitles'].items():
2597 subtitles.setdefault(sub_lang, []).extend(sub_info)
2598
2599 return list(formats.values()), subtitles
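# For illustration (not part of the original source): two periods that each
# carry a 720p video stream with identical fields (apart from format_id,
# fragments and manifest_stream_number) collapse into one format whose
# 'fragments' list is the concatenation of both periods' fragments.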
2600
2601 def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2602 """
2603 Parse formats from MPD manifest.
2604 References:
2605 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2606 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2607 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2608 """
2609 if not self.get_param('dynamic_mpd', True):
2610 if mpd_doc.get('type') == 'dynamic':
2611 return [], {}
2612
2613 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2614
2615 def _add_ns(path):
2616 return self._xpath_ns(path, namespace)
2617
2618 def is_drm_protected(element):
2619 return element.find(_add_ns('ContentProtection')) is not None
2620
2621 def extract_multisegment_info(element, ms_parent_info):
2622 ms_info = ms_parent_info.copy()
2623
2624 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2625 # common attributes and elements; we only extract the ones relevant
2626 # for us.
2627 def extract_common(source):
2628 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2629 if segment_timeline is not None:
2630 s_e = segment_timeline.findall(_add_ns('S'))
2631 if s_e:
2632 ms_info['total_number'] = 0
2633 ms_info['s'] = []
2634 for s in s_e:
2635 r = int(s.get('r', 0))
2636 ms_info['total_number'] += 1 + r
2637 ms_info['s'].append({
2638 't': int(s.get('t', 0)),
2639 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2640 'd': int(s.attrib['d']),
2641 'r': r,
2642 })
2643 start_number = source.get('startNumber')
2644 if start_number:
2645 ms_info['start_number'] = int(start_number)
2646 timescale = source.get('timescale')
2647 if timescale:
2648 ms_info['timescale'] = int(timescale)
2649 segment_duration = source.get('duration')
2650 if segment_duration:
2651 ms_info['segment_duration'] = float(segment_duration)
2652
2653 def extract_Initialization(source):
2654 initialization = source.find(_add_ns('Initialization'))
2655 if initialization is not None:
2656 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2657
2658 segment_list = element.find(_add_ns('SegmentList'))
2659 if segment_list is not None:
2660 extract_common(segment_list)
2661 extract_Initialization(segment_list)
2662 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2663 if segment_urls_e:
2664 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2665 else:
2666 segment_template = element.find(_add_ns('SegmentTemplate'))
2667 if segment_template is not None:
2668 extract_common(segment_template)
2669 media = segment_template.get('media')
2670 if media:
2671 ms_info['media'] = media
2672 initialization = segment_template.get('initialization')
2673 if initialization:
2674 ms_info['initialization'] = initialization
2675 else:
2676 extract_Initialization(segment_template)
2677 return ms_info
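# Illustrative example (not part of the original source): a
#     <SegmentTemplate timescale="90000" media="seg-$Number$.m4s" startNumber="1">
#       <SegmentTimeline><S t="0" d="180000" r="1"/></SegmentTimeline>
#     </SegmentTemplate>
# produces ms_info with timescale=90000, media='seg-$Number$.m4s',
# start_number=1, total_number=2 and s=[{'t': 0, 'd': 180000, 'r': 1}].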
2678
2679 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2680 stream_numbers = collections.defaultdict(int)
2681 for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2682 period_entry = {
2683 'id': period.get('id', f'period-{period_idx}'),
2684 'formats': [],
2685 'subtitles': collections.defaultdict(list),
2686 }
2687 period_duration = parse_duration(period.get('duration')) or mpd_duration
2688 period_ms_info = extract_multisegment_info(period, {
2689 'start_number': 1,
2690 'timescale': 1,
2691 })
2692 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2693 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2694 for representation in adaptation_set.findall(_add_ns('Representation')):
2695 representation_attrib = adaptation_set.attrib.copy()
2696 representation_attrib.update(representation.attrib)
2697 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2698 mime_type = representation_attrib['mimeType']
2699 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2700
2701 codec_str = representation_attrib.get('codecs', '')
2702 # Some kind of binary subtitle found in some youtube livestreams
2703 if mime_type == 'application/x-rawcc':
2704 codecs = {'scodec': codec_str}
2705 else:
2706 codecs = parse_codecs(codec_str)
2707 if content_type not in ('video', 'audio', 'text'):
2708 if mime_type == 'image/jpeg':
2709 content_type = mime_type
2710 elif codecs.get('vcodec', 'none') != 'none':
2711 content_type = 'video'
2712 elif codecs.get('acodec', 'none') != 'none':
2713 content_type = 'audio'
2714 elif codecs.get('scodec', 'none') != 'none':
2715 content_type = 'text'
2716 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2717 content_type = 'text'
2718 else:
2719 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2720 continue
2721
2722 base_url = ''
2723 for element in (representation, adaptation_set, period, mpd_doc):
2724 base_url_e = element.find(_add_ns('BaseURL'))
2725 if try_call(lambda: base_url_e.text) is not None:
2726 base_url = base_url_e.text + base_url
2727 if re.match(r'^https?://', base_url):
2728 break
2729 if mpd_base_url and base_url.startswith('/'):
2730 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2731 elif mpd_base_url and not re.match(r'^https?://', base_url):
2732 if not mpd_base_url.endswith('/'):
2733 mpd_base_url += '/'
2734 base_url = mpd_base_url + base_url
2735 representation_id = representation_attrib.get('id')
2736 lang = representation_attrib.get('lang')
2737 url_el = representation.find(_add_ns('BaseURL'))
2738 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2739 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2740 if representation_id is not None:
2741 format_id = representation_id
2742 else:
2743 format_id = content_type
2744 if mpd_id:
2745 format_id = mpd_id + '-' + format_id
2746 if content_type in ('video', 'audio'):
2747 f = {
2748 'format_id': format_id,
2749 'manifest_url': mpd_url,
2750 'ext': mimetype2ext(mime_type),
2751 'width': int_or_none(representation_attrib.get('width')),
2752 'height': int_or_none(representation_attrib.get('height')),
2753 'tbr': float_or_none(bandwidth, 1000),
2754 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2755 'fps': int_or_none(representation_attrib.get('frameRate')),
2756 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2757 'format_note': 'DASH %s' % content_type,
2758 'filesize': filesize,
2759 'container': mimetype2ext(mime_type) + '_dash',
2760 **codecs
2761 }
2762 elif content_type == 'text':
2763 f = {
2764 'ext': mimetype2ext(mime_type),
2765 'manifest_url': mpd_url,
2766 'filesize': filesize,
2767 }
2768 elif content_type == 'image/jpeg':
2769 # See test case in VikiIE
2770 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2771 f = {
2772 'format_id': format_id,
2773 'ext': 'mhtml',
2774 'manifest_url': mpd_url,
2775 'format_note': 'DASH storyboards (jpeg)',
2776 'acodec': 'none',
2777 'vcodec': 'none',
2778 }
2779 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2780 f['has_drm'] = True
2781 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2782
2783 def prepare_template(template_name, identifiers):
2784 tmpl = representation_ms_info[template_name]
2785 if representation_id is not None:
2786 tmpl = tmpl.replace('$RepresentationID$', representation_id)
2787 # First off, % characters outside $...$ templates
2788 # must be escaped by doubling for proper processing
2789 # by the %-operator string formatting used below (see
2790 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2791 t = ''
2792 in_template = False
2793 for c in tmpl:
2794 t += c
2795 if c == '$':
2796 in_template = not in_template
2797 elif c == '%' and not in_template:
2798 t += c
2799 # Next, $...$ templates are translated to their
2800 # %(...) counterparts to be used with % operator
2801 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2802 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2803 t = t.replace('$$', '$')  # assign the result; str.replace does not modify in place
2804 return t
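# For illustration (not part of the original source), assuming
# representation_id == 'video1',
#     prepare_template('media', ('Number', 'Bandwidth', 'Time'))
# turns 'seg-$RepresentationID$-$Number%05d$.m4s' into
# 'seg-video1-%(Number)05d.m4s', ready for % substitution per fragment.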
2805
2806 # @initialization is a regular template like @media one
2807 # so it should be handled just the same way (see
2808 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2809 if 'initialization' in representation_ms_info:
2810 initialization_template = prepare_template(
2811 'initialization',
2812 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2813 # $Time$ shall not be included for @initialization thus
2814 # only $Bandwidth$ remains
2815 ('Bandwidth', ))
2816 representation_ms_info['initialization_url'] = initialization_template % {
2817 'Bandwidth': bandwidth,
2818 }
2819
2820 def location_key(location):
2821 return 'url' if re.match(r'^https?://', location) else 'path'
2822
2823 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2824
2825 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2826 media_location_key = location_key(media_template)
2827
2828 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2829 # can't be used at the same time
2830 if '%(Number' in media_template and 's' not in representation_ms_info:
2831 segment_duration = None
2832 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2833 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2834 representation_ms_info['total_number'] = int(math.ceil(
2835 float_or_none(period_duration, segment_duration, default=0)))
2836 representation_ms_info['fragments'] = [{
2837 media_location_key: media_template % {
2838 'Number': segment_number,
2839 'Bandwidth': bandwidth,
2840 },
2841 'duration': segment_duration,
2842 } for segment_number in range(
2843 representation_ms_info['start_number'],
2844 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2845 else:
2846 # $Number*$ or $Time$ in media template with S list available
2847 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2848 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2849 representation_ms_info['fragments'] = []
2850 segment_time = 0
2851 segment_d = None
2852 segment_number = representation_ms_info['start_number']
2853
2854 def add_segment_url():
2855 segment_url = media_template % {
2856 'Time': segment_time,
2857 'Bandwidth': bandwidth,
2858 'Number': segment_number,
2859 }
2860 representation_ms_info['fragments'].append({
2861 media_location_key: segment_url,
2862 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2863 })
2864
2865 for num, s in enumerate(representation_ms_info['s']):
2866 segment_time = s.get('t') or segment_time
2867 segment_d = s['d']
2868 add_segment_url()
2869 segment_number += 1
2870 for r in range(s.get('r', 0)):
2871 segment_time += segment_d
2872 add_segment_url()
2873 segment_number += 1
2874 segment_time += segment_d
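# Editor's note (hypothetical values, timescale 90000): an S element
#   <S t="0" d="180000" r="1"/>
# produces fragments at Time=0 and Time=180000, each lasting
# 180000/90000 = 2.0 s; @r counts *additional* repeats beyond the first.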
2875 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2876 # No media template,
2877 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2878 # or any YouTube dashsegments video
2879 fragments = []
2880 segment_index = 0
2881 timescale = representation_ms_info['timescale']
2882 for s in representation_ms_info['s']:
2883 duration = float_or_none(s['d'], timescale)
2884 for r in range(s.get('r', 0) + 1):
2885 segment_uri = representation_ms_info['segment_urls'][segment_index]
2886 fragments.append({
2887 location_key(segment_uri): segment_uri,
2888 'duration': duration,
2889 })
2890 segment_index += 1
2891 representation_ms_info['fragments'] = fragments
2892 elif 'segment_urls' in representation_ms_info:
2893 # Segment URLs with no SegmentTimeline
2894 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2895 # https://github.com/ytdl-org/youtube-dl/pull/14844
2896 fragments = []
2897 segment_duration = float_or_none(
2898 representation_ms_info['segment_duration'],
2899 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2900 for segment_url in representation_ms_info['segment_urls']:
2901 fragment = {
2902 location_key(segment_url): segment_url,
2903 }
2904 if segment_duration:
2905 fragment['duration'] = segment_duration
2906 fragments.append(fragment)
2907 representation_ms_info['fragments'] = fragments
2908 # If a fragments key is present, we correctly recognized fragmented media.
2909 # Otherwise we assume unfragmented media with direct access. Technically, this
2910 # assumption is not necessarily correct, since we may simply not yet support
2911 # some forms of fragmented media renditions; for now we'll use this fallback.
2912 if 'fragments' in representation_ms_info:
2913 f.update({
2914 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2915 'url': mpd_url or base_url,
2916 'fragment_base_url': base_url,
2917 'fragments': [],
2918 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2919 })
2920 if 'initialization_url' in representation_ms_info:
2921 initialization_url = representation_ms_info['initialization_url']
2922 if not f.get('url'):
2923 f['url'] = initialization_url
2924 f['fragments'].append({location_key(initialization_url): initialization_url})
2925 f['fragments'].extend(representation_ms_info['fragments'])
2926 if not period_duration:
2927 period_duration = try_get(
2928 representation_ms_info,
2929 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2930 else:
2931 # Assuming direct URL to unfragmented media.
2932 f['url'] = base_url
2933 if content_type in ('video', 'audio', 'image/jpeg'):
2934 f['manifest_stream_number'] = stream_numbers[f['url']]
2935 stream_numbers[f['url']] += 1
2936 period_entry['formats'].append(f)
2937 elif content_type == 'text':
2938 period_entry['subtitles'][lang or 'und'].append(f)
2939 yield period_entry
2940
2941 def _extract_ism_formats(self, *args, **kwargs):
2942 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2943 if subs:
2944 self._report_ignoring_subs('ISM')
2945 return fmts
2946
2947 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2948 if self.get_param('ignore_no_formats_error'):
2949 fatal = False
2950
2951 res = self._download_xml_handle(
2952 ism_url, video_id,
2953 note='Downloading ISM manifest' if note is None else note,
2954 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2955 fatal=fatal, data=data, headers=headers, query=query)
2956 if res is False:
2957 return [], {}
2958 ism_doc, urlh = res
2959 if ism_doc is None:
2960 return [], {}
2961
2962 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2963
2964 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2965 """
2966 Parse formats from ISM manifest.
2967 References:
2968 1. [MS-SSTR]: Smooth Streaming Protocol,
2969 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2970 """
2971 if ism_doc.get('IsLive') == 'TRUE':
2972 return [], {}
2973
2974 duration = int(ism_doc.attrib['Duration'])
2975 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2976
2977 formats = []
2978 subtitles = {}
2979 for stream in ism_doc.findall('StreamIndex'):
2980 stream_type = stream.get('Type')
2981 if stream_type not in ('video', 'audio', 'text'):
2982 continue
2983 url_pattern = stream.attrib['Url']
2984 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2985 stream_name = stream.get('Name')
2986 stream_language = stream.get('Language', 'und')
2987 for track in stream.findall('QualityLevel'):
2988 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2989 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2990 # TODO: add support for WVC1 and WMAP
2991 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2992 self.report_warning('%s is not a supported codec' % fourcc)
2993 continue
2994 tbr = int(track.attrib['Bitrate']) // 1000
2995 # [1] does not mention Width and Height attributes. However,
2996 # they're often present while MaxWidth and MaxHeight are
2997 # missing, so they should be used as fallbacks
2998 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2999 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3000 sampling_rate = int_or_none(track.get('SamplingRate'))
3001
3002 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3003 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3004
3005 fragments = []
3006 fragment_ctx = {
3007 'time': 0,
3008 }
3009 stream_fragments = stream.findall('c')
3010 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3011 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3012 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3013 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3014 if not fragment_ctx['duration']:
3015 try:
3016 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # index the fragment list, not the element's children
3017 except IndexError:
3018 next_fragment_time = duration
3019 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3020 for _ in range(fragment_repeat):
3021 fragments.append({
3022 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3023 'duration': fragment_ctx['duration'] / stream_timescale,
3024 })
3025 fragment_ctx['time'] += fragment_ctx['duration']
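# Editor's note (hypothetical values, stream_timescale 10000000): a
#   <c t="0" d="20000000" r="2"/>
# element produces two 2-second fragments whose {start_time}/{start time}
# placeholders are filled with 0 and 20000000; here @r is the total
# fragment count (defaulting to 1), unlike DASH's "additional repeats".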
3026
3027 if stream_type == 'text':
3028 subtitles.setdefault(stream_language, []).append({
3029 'ext': 'ismt',
3030 'protocol': 'ism',
3031 'url': ism_url,
3032 'manifest_url': ism_url,
3033 'fragments': fragments,
3034 '_download_params': {
3035 'stream_type': stream_type,
3036 'duration': duration,
3037 'timescale': stream_timescale,
3038 'fourcc': fourcc,
3039 'language': stream_language,
3040 'codec_private_data': track.get('CodecPrivateData'),
3041 }
3042 })
3043 elif stream_type in ('video', 'audio'):
3044 formats.append({
3045 'format_id': join_nonempty(ism_id, stream_name, tbr),
3046 'url': ism_url,
3047 'manifest_url': ism_url,
3048 'ext': 'ismv' if stream_type == 'video' else 'isma',
3049 'width': width,
3050 'height': height,
3051 'tbr': tbr,
3052 'asr': sampling_rate,
3053 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3054 'acodec': 'none' if stream_type == 'video' else fourcc,
3055 'protocol': 'ism',
3056 'fragments': fragments,
3057 'has_drm': ism_doc.find('Protection') is not None,
3058 'language': stream_language,
3059 'audio_channels': int_or_none(track.get('Channels')),
3060 '_download_params': {
3061 'stream_type': stream_type,
3062 'duration': duration,
3063 'timescale': stream_timescale,
3064 'width': width or 0,
3065 'height': height or 0,
3066 'fourcc': fourcc,
3067 'language': stream_language,
3068 'codec_private_data': track.get('CodecPrivateData'),
3069 'sampling_rate': sampling_rate,
3070 'channels': int_or_none(track.get('Channels', 2)),
3071 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3072 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3073 },
3074 })
3075 return formats, subtitles
3076
3077 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3078 def absolute_url(item_url):
3079 return urljoin(base_url, item_url)
3080
3081 def parse_content_type(content_type):
3082 if not content_type:
3083 return {}
3084 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3085 if ctr:
3086 mimetype, codecs = ctr.groups()
3087 f = parse_codecs(codecs)
3088 f['ext'] = mimetype2ext(mimetype)
3089 return f
3090 return {}
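# Editor's illustration (hypothetical input; output abridged): for
#   parse_content_type('video/mp4; codecs="avc1.64001F, mp4a.40.2"')
# parse_codecs() yields roughly {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', ...}
# and mimetype2ext() adds 'ext': 'mp4'.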
3091
3092 def _media_formats(src, cur_media_type, type_info=None):
3093 type_info = type_info or {}
3094 full_url = absolute_url(src)
3095 ext = type_info.get('ext') or determine_ext(full_url)
3096 if ext == 'm3u8':
3097 is_plain_url = False
3098 formats = self._extract_m3u8_formats(
3099 full_url, video_id, ext='mp4',
3100 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3101 preference=preference, quality=quality, fatal=False)
3102 elif ext == 'mpd':
3103 is_plain_url = False
3104 formats = self._extract_mpd_formats(
3105 full_url, video_id, mpd_id=mpd_id, fatal=False)
3106 else:
3107 is_plain_url = True
3108 formats = [{
3109 'url': full_url,
3110 'vcodec': 'none' if cur_media_type == 'audio' else None,
3111 'ext': ext,
3112 }]
3113 return is_plain_url, formats
3114
3115 entries = []
3116 # amp-video and amp-audio are very similar to their HTML5 counterparts
3117 # so we will include them right here (see
3118 # https://www.ampproject.org/docs/reference/components/amp-video)
3119 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3120 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3121 media_tags = [(media_tag, media_tag_name, media_type, '')
3122 for media_tag, media_tag_name, media_type
3123 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3124 media_tags.extend(re.findall(
3125 # We only allow video|audio followed by a whitespace or '>'.
3126 # Allowing more characters may result in a significant slowdown (see
3127 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3128 # e.g. http://www.porntrex.com/maps/videositemap.xml).
3129 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
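# Editor's note: the two patterns above match, respectively, self-closing
# tags such as '<video src="a.mp4"/>' and paired tags such as
# '<amp-video width="640">...</amp-video>' (examples are hypothetical).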
3130 for media_tag, _, media_type, media_content in media_tags:
3131 media_info = {
3132 'formats': [],
3133 'subtitles': {},
3134 }
3135 media_attributes = extract_attributes(media_tag)
3136 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3137 if src:
3138 f = parse_content_type(media_attributes.get('type'))
3139 _, formats = _media_formats(src, media_type, f)
3140 media_info['formats'].extend(formats)
3141 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3142 if media_content:
3143 for source_tag in re.findall(r'<source[^>]+>', media_content):
3144 s_attr = extract_attributes(source_tag)
3145 # data-video-src and data-src are non-standard but seen
3146 # several times in the wild
3147 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3148 if not src:
3149 continue
3150 f = parse_content_type(s_attr.get('type'))
3151 is_plain_url, formats = _media_formats(src, media_type, f)
3152 if is_plain_url:
3153 # width, height, res, label and title attributes are
3154 # all non-standard but seen several times in the wild
3155 labels = [
3156 s_attr.get(lbl)
3157 for lbl in ('label', 'title')
3158 if str_or_none(s_attr.get(lbl))
3159 ]
3160 width = int_or_none(s_attr.get('width'))
3161 height = (int_or_none(s_attr.get('height'))
3162 or int_or_none(s_attr.get('res')))
3163 if not width or not height:
3164 for lbl in labels:
3165 resolution = parse_resolution(lbl)
3166 if not resolution:
3167 continue
3168 width = width or resolution.get('width')
3169 height = height or resolution.get('height')
3170 for lbl in labels:
3171 tbr = parse_bitrate(lbl)
3172 if tbr:
3173 break
3174 else:
3175 tbr = None
3176 f.update({
3177 'width': width,
3178 'height': height,
3179 'tbr': tbr,
3180 'format_id': s_attr.get('label') or s_attr.get('title'),
3181 })
3182 f.update(formats[0])
3183 media_info['formats'].append(f)
3184 else:
3185 media_info['formats'].extend(formats)
3186 for track_tag in re.findall(r'<track[^>]+>', media_content):
3187 track_attributes = extract_attributes(track_tag)
3188 kind = track_attributes.get('kind')
3189 if not kind or kind in ('subtitles', 'captions'):
3190 src = strip_or_none(track_attributes.get('src'))
3191 if not src:
3192 continue
3193 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3194 media_info['subtitles'].setdefault(lang, []).append({
3195 'url': absolute_url(src),
3196 })
3197 for f in media_info['formats']:
3198 f.setdefault('http_headers', {})['Referer'] = base_url
3199 if media_info['formats'] or media_info['subtitles']:
3200 entries.append(media_info)
3201 return entries
3202
3203 def _extract_akamai_formats(self, *args, **kwargs):
3204 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3205 if subs:
3206 self._report_ignoring_subs('akamai')
3207 return fmts
3208
3209 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3210 signed = 'hdnea=' in manifest_url
3211 if not signed:
3212 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3213 manifest_url = re.sub(
3214 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3215 '', manifest_url).strip('?')
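# Editor's illustration (hypothetical URL): the substitution above strips
# Akamai stream-packaging parameters, turning
#   'https://example.akamaihd.net/i/foo/master.m3u8?b=100-1000&__b__=450'
# into 'https://example.akamaihd.net/i/foo/master.m3u8' (the trailing '?'
# is removed by strip('?')).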
3216
3217 formats = []
3218 subtitles = {}
3219
3220 hdcore_sign = 'hdcore=3.7.0'
3221 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3222 hds_host = hosts.get('hds')
3223 if hds_host:
3224 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3225 if 'hdcore=' not in f4m_url:
3226 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3227 f4m_formats = self._extract_f4m_formats(
3228 f4m_url, video_id, f4m_id='hds', fatal=False)
3229 for entry in f4m_formats:
3230 entry.update({'extra_param_to_segment_url': hdcore_sign})
3231 formats.extend(f4m_formats)
3232
3233 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3234 hls_host = hosts.get('hls')
3235 if hls_host:
3236 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3237 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3238 m3u8_url, video_id, 'mp4', 'm3u8_native',
3239 m3u8_id='hls', fatal=False)
3240 formats.extend(m3u8_formats)
3241 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3242
3243 http_host = hosts.get('http')
3244 if http_host and m3u8_formats and not signed:
3245 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3246 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3247 qualities_length = len(qualities)
3248 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3249 i = 0
3250 for f in m3u8_formats:
3251 if f['vcodec'] != 'none':
3252 for protocol in ('http', 'https'):
3253 http_f = f.copy()
3254 del http_f['manifest_url']
3255 http_url = re.sub(
3256 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3257 http_f.update({
3258 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3259 'url': http_url,
3260 'protocol': protocol,
3261 })
3262 formats.append(http_f)
3263 i += 1
3264
3265 return formats, subtitles
3266
3267 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3268 query = urllib.parse.urlparse(url).query
3269 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3270 mobj = re.search(
3271 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3272 url_base = mobj.group('url')
3273 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3274 formats = []
3275
3276 def manifest_url(manifest):
3277 m_url = f'{http_base_url}/{manifest}'
3278 if query:
3279 m_url += '?%s' % query
3280 return m_url
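# Editor's illustration (hypothetical URL): for
#   'http://example.com/vod/mp4:clip.mp4/playlist.m3u8?token=x'
# url_base is '//example.com/vod/mp4:clip.mp4' and manifest_url('manifest.mpd')
# yields 'http://example.com/vod/mp4:clip.mp4/manifest.mpd?token=x'.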
3281
3282 if 'm3u8' not in skip_protocols:
3283 formats.extend(self._extract_m3u8_formats(
3284 manifest_url('playlist.m3u8'), video_id, 'mp4',
3285 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3286 if 'f4m' not in skip_protocols:
3287 formats.extend(self._extract_f4m_formats(
3288 manifest_url('manifest.f4m'),
3289 video_id, f4m_id='hds', fatal=False))
3290 if 'dash' not in skip_protocols:
3291 formats.extend(self._extract_mpd_formats(
3292 manifest_url('manifest.mpd'),
3293 video_id, mpd_id='dash', fatal=False))
3294 if re.search(r'(?:/smil:|\.smil)', url_base):
3295 if 'smil' not in skip_protocols:
3296 rtmp_formats = self._extract_smil_formats(
3297 manifest_url('jwplayer.smil'),
3298 video_id, fatal=False)
3299 for rtmp_format in rtmp_formats:
3300 rtsp_format = rtmp_format.copy()
3301 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3302 del rtsp_format['play_path']
3303 del rtsp_format['ext']
3304 rtsp_format.update({
3305 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3306 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3307 'protocol': 'rtsp',
3308 })
3309 formats.extend([rtmp_format, rtsp_format])
3310 else:
3311 for protocol in ('rtmp', 'rtsp'):
3312 if protocol not in skip_protocols:
3313 formats.append({
3314 'url': f'{protocol}:{url_base}',
3315 'format_id': protocol,
3316 'protocol': protocol,
3317 })
3318 return formats
3319
3320 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3321 mobj = re.search(
3322 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3323 webpage)
3324 if mobj:
3325 try:
3326 jwplayer_data = self._parse_json(mobj.group('options'),
3327 video_id=video_id,
3328 transform_source=transform_source)
3329 except ExtractorError:
3330 pass
3331 else:
3332 if isinstance(jwplayer_data, dict):
3333 return jwplayer_data
3334
3335 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3336 jwplayer_data = self._find_jwplayer_data(
3337 webpage, video_id, transform_source=js_to_json)
3338 return self._parse_jwplayer_data(
3339 jwplayer_data, video_id, *args, **kwargs)
3340
3341 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3342 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3343 entries = []
3344 if not isinstance(jwplayer_data, dict):
3345 return entries
3346
3347 playlist_items = jwplayer_data.get('playlist')
3348 # JWPlayer backward compatibility: single playlist item/flattened playlists
3349 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3350 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3351 if not isinstance(playlist_items, list):
3352 playlist_items = (playlist_items or jwplayer_data, )
3353
3354 for video_data in playlist_items:
3355 if not isinstance(video_data, dict):
3356 continue
3357 # JWPlayer backward compatibility: flattened sources
3358 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3359 if 'sources' not in video_data:
3360 video_data['sources'] = [video_data]
3361
3362 this_video_id = video_id or video_data['mediaid']
3363
3364 formats = self._parse_jwplayer_formats(
3365 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3366 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3367
3368 subtitles = {}
3369 tracks = video_data.get('tracks')
3370 if tracks and isinstance(tracks, list):
3371 for track in tracks:
3372 if not isinstance(track, dict):
3373 continue
3374 track_kind = track.get('kind')
3375 if not track_kind or not isinstance(track_kind, str):
3376 continue
3377 if track_kind.lower() not in ('captions', 'subtitles'):
3378 continue
3379 track_url = urljoin(base_url, track.get('file'))
3380 if not track_url:
3381 continue
3382 subtitles.setdefault(track.get('label') or 'en', []).append({
3383 'url': self._proto_relative_url(track_url)
3384 })
3385
3386 entry = {
3387 'id': this_video_id,
3388 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3389 'description': clean_html(video_data.get('description')),
3390 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3391 'timestamp': int_or_none(video_data.get('pubdate')),
3392 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3393 'subtitles': subtitles,
3394 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3395 'genre': clean_html(video_data.get('genre')),
3396 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3397 'season_number': int_or_none(video_data.get('season')),
3398 'episode_number': int_or_none(video_data.get('episode')),
3399 'release_year': int_or_none(video_data.get('releasedate')),
3400 'age_limit': int_or_none(video_data.get('age_restriction')),
3401 }
3402 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3403 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3404 entry.update({
3405 '_type': 'url_transparent',
3406 'url': formats[0]['url'],
3407 })
3408 else:
3409 entry['formats'] = formats
3410 entries.append(entry)
3411 if len(entries) == 1:
3412 return entries[0]
3413 else:
3414 return self.playlist_result(entries)
3415
3416 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3417 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3418 urls = set()
3419 formats = []
3420 for source in jwplayer_sources_data:
3421 if not isinstance(source, dict):
3422 continue
3423 source_url = urljoin(
3424 base_url, self._proto_relative_url(source.get('file')))
3425 if not source_url or source_url in urls:
3426 continue
3427 urls.add(source_url)
3428 source_type = source.get('type') or ''
3429 ext = mimetype2ext(source_type) or determine_ext(source_url)
3430 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3431 formats.extend(self._extract_m3u8_formats(
3432 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3433 m3u8_id=m3u8_id, fatal=False))
3434 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3435 formats.extend(self._extract_mpd_formats(
3436 source_url, video_id, mpd_id=mpd_id, fatal=False))
3437 elif ext == 'smil':
3438 formats.extend(self._extract_smil_formats(
3439 source_url, video_id, fatal=False))
3440 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3441 elif source_type.startswith('audio') or ext in (
3442 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3443 formats.append({
3444 'url': source_url,
3445 'vcodec': 'none',
3446 'ext': ext,
3447 })
3448 else:
3449 format_id = str_or_none(source.get('label'))
3450 height = int_or_none(source.get('height'))
3451 if height is None and format_id:
3452 # Often no height is provided, but there is a label in
3453 # a format like "1080p", "720p SD", or 1080.
3454 height = parse_resolution(format_id).get('height')
3455 a_format = {
3456 'url': source_url,
3457 'width': int_or_none(source.get('width')),
3458 'height': height,
3459 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3460 'filesize': int_or_none(source.get('filesize')),
3461 'ext': ext,
3462 'format_id': format_id
3463 }
3464 if source_url.startswith('rtmp'):
3465 a_format['ext'] = 'flv'
3466 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3467 # of jwplayer.flash.swf
3468 rtmp_url_parts = re.split(
3469 r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3470 if len(rtmp_url_parts) == 3:
3471 rtmp_url, prefix, play_path = rtmp_url_parts
3472 a_format.update({
3473 'url': rtmp_url,
3474 'play_path': prefix + play_path,
3475 })
3476 if rtmp_params:
3477 a_format.update(rtmp_params)
3478 formats.append(a_format)
3479 return formats
3480
3481 def _live_title(self, name):
3482 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3483 return name
3484
3485 def _int(self, v, name, fatal=False, **kwargs):
3486 res = int_or_none(v, **kwargs)
3487 if res is None:
3488 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3489 if fatal:
3490 raise ExtractorError(msg)
3491 else:
3492 self.report_warning(msg)
3493 return res
3494
3495 def _float(self, v, name, fatal=False, **kwargs):
3496 res = float_or_none(v, **kwargs)
3497 if res is None:
3498 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3499 if fatal:
3500 raise ExtractorError(msg)
3501 else:
3502 self.report_warning(msg)
3503 return res
3504
3505 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3506 path='/', secure=False, discard=False, rest={}, **kwargs):
3507 cookie = http.cookiejar.Cookie(
3508 0, name, value, port, port is not None, domain, True,
3509 domain.startswith('.'), path, True, secure, expire_time,
3510 discard, None, None, rest)
3511 self.cookiejar.set_cookie(cookie)
3512
3513 def _get_cookies(self, url):
3514 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3515 return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3516
3517 def _apply_first_set_cookie_header(self, url_handle, cookie):
3518 """
3519 Apply first Set-Cookie header instead of the last. Experimental.
3520
3521 Some sites (e.g. [1-3]) may serve two cookies under the same name
3522 in the Set-Cookie header and expect the first (old) one to be set
3523 rather than the second (new) one. However, per RFC 6265 the newer
3524 cookie should be set into the cookie store, which is what actually happens.
3525 We work around this issue by manually resetting the cookie to
3526 the first one.
3527 1. https://new.vk.com/
3528 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3529 3. https://learning.oreilly.com/
3530 """
3531 for header, cookies in url_handle.headers.items():
3532 if header.lower() != 'set-cookie':
3533 continue
3534 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3535 cookie_value = re.search(
3536 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3537 if cookie_value:
3538 value, domain = cookie_value.groups()
3539 self._set_cookie(domain, cookie, value)
3540 break
3541
3542 @classmethod
3543 def get_testcases(cls, include_onlymatching=False):
3544 # Do not look in super classes
3545 t = vars(cls).get('_TEST')
3546 if t:
3547 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3548 tests = [t]
3549 else:
3550 tests = vars(cls).get('_TESTS', [])
3551 for t in tests:
3552 if not include_onlymatching and t.get('only_matching', False):
3553 continue
3554 t['name'] = cls.ie_key()
3555 yield t
3556 if getattr(cls, '__wrapped__', None):
3557 yield from cls.__wrapped__.get_testcases(include_onlymatching)
3558
3559 @classmethod
3560 def get_webpage_testcases(cls):
3561 tests = vars(cls).get('_WEBPAGE_TESTS', [])
3562 for t in tests:
3563 t['name'] = cls.ie_key()
3564 yield t
3565 if getattr(cls, '__wrapped__', None):
3566 yield from cls.__wrapped__.get_webpage_testcases()
3567
3568 @classproperty(cache=True)
3569 def age_limit(cls):
3570 """Get age limit from the testcases"""
3571 return max(traverse_obj(
3572 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3573 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3574
3575 @classproperty(cache=True)
3576 def _RETURN_TYPE(cls):
3577 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3578 tests = tuple(cls.get_testcases(include_onlymatching=False))
3579 if not tests:
3580 return None
3581 elif not any(k.startswith('playlist') for test in tests for k in test):
3582 return 'video'
3583 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3584 return 'playlist'
3585 return 'any'
3586
3587 @classmethod
3588 def is_single_video(cls, url):
3589 """Returns whether the URL is of a single video, None if unknown"""
3590 if cls.suitable(url):
3591 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3592
3593 @classmethod
3594 def is_suitable(cls, age_limit):
3595 """Test whether the extractor is generally suitable for the given age limit"""
3596 return not age_restricted(cls.age_limit, age_limit)
3597
3598 @classmethod
3599 def description(cls, *, markdown=True, search_examples=None):
3600 """Description of the extractor"""
3601 desc = ''
3602 if cls._NETRC_MACHINE:
3603 if markdown:
3604 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3605 else:
3606 desc += f' [{cls._NETRC_MACHINE}]'
3607 if cls.IE_DESC is False:
3608 desc += ' [HIDDEN]'
3609 elif cls.IE_DESC:
3610 desc += f' {cls.IE_DESC}'
3611 if cls.SEARCH_KEY:
3612 desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3613 if search_examples:
3614 _COUNTS = ('', '5', '10', 'all')
3615 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3616 if not cls.working():
3617 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3618
3619 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3620 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3621 return f'{name}:{desc}' if desc else name
3622
3623 def extract_subtitles(self, *args, **kwargs):
3624 if (self.get_param('writesubtitles', False)
3625 or self.get_param('listsubtitles')):
3626 return self._get_subtitles(*args, **kwargs)
3627 return {}
3628
3629 def _get_subtitles(self, *args, **kwargs):
3630 raise NotImplementedError('This method must be implemented by subclasses')
3631
3632 class CommentsDisabled(Exception):
3633 """Raise in _get_comments if comments are disabled for the video"""
3634
3635 def extract_comments(self, *args, **kwargs):
3636 if not self.get_param('getcomments'):
3637 return None
3638 generator = self._get_comments(*args, **kwargs)
3639
3640 def extractor():
3641 comments = []
3642 interrupted = True
3643 try:
3644 while True:
3645 comments.append(next(generator))
3646 except StopIteration:
3647 interrupted = False
3648 except KeyboardInterrupt:
3649 self.to_screen('Interrupted by user')
3650 except self.CommentsDisabled:
3651 return {'comments': None, 'comment_count': None}
3652 except Exception as e:
3653 if self.get_param('ignoreerrors') is not True:
3654 raise
3655 self._downloader.report_error(e)
3656 comment_count = len(comments)
3657 self.to_screen(f'Extracted {comment_count} comments')
3658 return {
3659 'comments': comments,
3660 'comment_count': None if interrupted else comment_count
3661 }
3662 return extractor
3663
3664 def _get_comments(self, *args, **kwargs):
3665 raise NotImplementedError('This method must be implemented by subclasses')
3666
3667 @staticmethod
3668 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3669 """ Merge subtitle items for one language. Items with duplicated URLs/data
3670 will be dropped. """
3671 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3672 ret = list(subtitle_list1)
3673 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3674 return ret
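# Editor's illustration (hypothetical values):
#   _merge_subtitle_items([{'url': 'http://a/en.vtt'}],
#                         [{'url': 'http://a/en.vtt'}, {'url': 'http://b/en.srt'}])
# returns [{'url': 'http://a/en.vtt'}, {'url': 'http://b/en.srt'}] --
# the duplicate (same url/data pair) from the second list is dropped.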
3675
3676 @classmethod
3677 def _merge_subtitles(cls, *dicts, target=None):
3678 """ Merge subtitle dictionaries, language by language. """
3679 if target is None:
3680 target = {}
3681 for d in dicts:
3682 for lang, subs in d.items():
3683 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3684 return target
3685
3686 def extract_automatic_captions(self, *args, **kwargs):
3687 if (self.get_param('writeautomaticsub', False)
3688 or self.get_param('listsubtitles')):
3689 return self._get_automatic_captions(*args, **kwargs)
3690 return {}
3691
3692 def _get_automatic_captions(self, *args, **kwargs):
3693 raise NotImplementedError('This method must be implemented by subclasses')
3694
3695 @functools.cached_property
3696 def _cookies_passed(self):
3697 """Whether cookies have been passed to YoutubeDL"""
3698 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3699
3700 def mark_watched(self, *args, **kwargs):
3701 if not self.get_param('mark_watched', False):
3702 return
3703 if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed:
3704 self._mark_watched(*args, **kwargs)
3705
3706 def _mark_watched(self, *args, **kwargs):
3707 raise NotImplementedError('This method must be implemented by subclasses')
3708
3709 def geo_verification_headers(self):
3710 headers = {}
3711 geo_verification_proxy = self.get_param('geo_verification_proxy')
3712 if geo_verification_proxy:
3713 headers['Ytdl-request-proxy'] = geo_verification_proxy
3714 return headers
3715
3716 @staticmethod
3717 def _generic_id(url):
3718 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3719
3720 def _generic_title(self, url='', webpage='', *, default=None):
3721 return (self._og_search_title(webpage, default=None)
3722 or self._html_extract_title(webpage, default=None)
3723 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3724 or default)
3725
3726 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3727 if not duration:
3728 return
3729 chapter_list = [{
3730 'start_time': start_function(chapter),
3731 'title': title_function(chapter),
3732 } for chapter in chapter_list or []]
3733 if strict:
3734 warn = self.report_warning
3735 else:
3736 warn = self.write_debug
3737 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3738
3739 chapters = [{'start_time': 0}]
3740 for idx, chapter in enumerate(chapter_list):
3741 if chapter['start_time'] is None:
3742 warn(f'Incomplete chapter {idx}')
3743 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3744 chapters.append(chapter)
3745 elif chapter not in chapters:
3746 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3747 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3748 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3749 return chapters[1:]
3750
3751 def _extract_chapters_from_description(self, description, duration):
3752 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3753 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3754 return self._extract_chapters_helper(
3755 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3756 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3757 duration=duration, strict=False) or self._extract_chapters_helper(
3758 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3759 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3760 duration=duration, strict=False)
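# Editor's illustration (hypothetical description, duration=300):
#   '0:00 Intro\n1:30 Main part\n4:00 Outro'
# yields chapters starting at 0, 90 and 240 seconds titled 'Intro',
# 'Main part' and 'Outro'; the second pattern covers the reversed
# 'Title 0:00' layout.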
3761
3762 @staticmethod
3763 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3764 all_known = all(map(
3765 lambda x: x is not None,
3766 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3767 return (
3768 'private' if is_private
3769 else 'premium_only' if needs_premium
3770 else 'subscriber_only' if needs_subscription
3771 else 'needs_auth' if needs_auth
3772 else 'unlisted' if is_unlisted
3773 else 'public' if all_known
3774 else None)
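# Editor's illustration (hypothetical flags):
#   _availability(is_private=False, needs_premium=False,
#                 needs_subscription=False, needs_auth=False, is_unlisted=True)
# returns 'unlisted'; with every flag False it returns 'public', and if
# any flag is still None (unknown) with no positive flag set, it returns None.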
3775
3776 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3777 '''
3778 @returns A list of values for the extractor argument given by "key"
3779 or "default" if no such key is present
3780 @param default The default value to return when the key is not present (default: [])
3781 @param casesense When false, the values are converted to lower case
3782 '''
3783 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3784 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3785 if val is None:
3786 return [] if default is NO_DEFAULT else default
3787 return list(val) if casesense else [x.lower() for x in val]
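# Editor's illustration (hypothetical CLI invocation): with
#   --extractor-args "youtube:player_client=android,web"
# a YouTube extractor calling self._configuration_arg('player_client')
# gets ['android', 'web'] (lower-cased unless casesense=True).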
3788
3789 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3790 if not playlist_id or not video_id:
3791 return not video_id
3792
3793 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3794 if no_playlist is not None:
3795 return not no_playlist
3796
3797 video_id = '' if video_id is True else f' {video_id}'
3798 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3799 if self.get_param('noplaylist'):
3800 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3801 return False
3802 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3803 return True
3804
3805 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3806 RetryManager.report_retry(
3807 err, _count or int(fatal), _retries,
3808 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3809 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3810
3811 def RetryManager(self, **kwargs):
3812 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3813
3814 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3815 display_id = traverse_obj(info_dict, 'display_id', 'id')
3816 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3817 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3818 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3819
3820 @classmethod
3821 def extract_from_webpage(cls, ydl, url, webpage):
3822 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3823 else ydl.get_info_extractor(cls.ie_key()))
3824 for info in ie._extract_from_webpage(url, webpage) or []:
3825 # url = None since we do not want to set (webpage/original)_url
3826 ydl.add_default_extra_info(info, ie, None)
3827 yield info
3828
3829 @classmethod
3830 def _extract_from_webpage(cls, url, webpage):
3831 for embed_url in orderedSet(
3832 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3833 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3834
3835 @classmethod
3836 def _extract_embed_urls(cls, url, webpage):
3837 """@returns all the embed urls on the webpage"""
3838 if '_EMBED_URL_RE' not in cls.__dict__:
3839 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3840 for idx, regex in enumerate(cls._EMBED_REGEX):
3841 assert regex.count('(?P<url>') == 1, \
3842 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3843 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3844
3845 for regex in cls._EMBED_URL_RE:
3846 for mobj in regex.finditer(webpage):
3847 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3848 if cls._VALID_URL is False or cls.suitable(embed_url):
3849 yield embed_url
3850
3851 class StopExtraction(Exception):
3852 pass
3853
3854 @classmethod
3855 def _extract_url(cls, webpage): # TODO: Remove
3856 """Only for compatibility with some older extractors"""
3857 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3858
3859 @classmethod
3860 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3861 if plugin_name:
3862 mro = inspect.getmro(cls)
3863 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3864 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3865 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3866 while getattr(super_class, '__wrapped__', None):
3867 super_class = super_class.__wrapped__
3868 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3869 _PLUGIN_OVERRIDES[super_class].append(cls)
3870
3871 return super().__init_subclass__(**kwargs)
3872
3873
3874class SearchInfoExtractor(InfoExtractor):
3875 """
3876 Base class for paged search queries extractors.
3877 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3878 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3879 """
3880
3881 _MAX_RESULTS = float('inf')
3882 _RETURN_TYPE = 'playlist'
3883
3884 @classproperty
3885 def _VALID_URL(cls):
3886 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
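# Editor's illustration: with a hypothetical _SEARCH_KEY = 'ytsearch',
# 'ytsearch5:cute cats' matches with prefix='5' and query='cute cats';
# an empty prefix means a single result and 'all' requests up to
# _MAX_RESULTS (see _real_extract below).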
3887
3888 def _real_extract(self, query):
3889 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3890 if prefix == '':
3891 return self._get_n_results(query, 1)
3892 elif prefix == 'all':
3893 return self._get_n_results(query, self._MAX_RESULTS)
3894 else:
3895 n = int(prefix)
3896 if n <= 0:
3897 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3898 elif n > self._MAX_RESULTS:
3899 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3900 n = self._MAX_RESULTS
3901 return self._get_n_results(query, n)
3902
3903 def _get_n_results(self, query, n):
3904 """Get a specified number of results for a query.
3905 Either this function or _search_results must be overridden by subclasses"""
3906 return self.playlist_result(
3907 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3908 query, query)
3909
3910 def _search_results(self, query):
3911 """Returns an iterator of search results"""
3912 raise NotImplementedError('This method must be implemented by subclasses')
3913
3914 @classproperty
3915 def SEARCH_KEY(cls):
3916 return cls._SEARCH_KEY
3917
3918
3919class UnsupportedURLIE(InfoExtractor):
3920 _VALID_URL = '.*'
3921 _ENABLED = False
3922 IE_DESC = False
3923
3924 def _real_extract(self, url):
3925 raise UnsupportedError(url)
3926
3927
3928_PLUGIN_OVERRIDES = collections.defaultdict(list)