jfr.im git - yt-dlp.git/blame_incremental - yt

Commit	Line	Data
	1	import base64
	2	import collections
	3	import getpass
	4	import hashlib
	5	import http.client
	6	import http.cookiejar
	7	import http.cookies
	8	import inspect
	9	import itertools
	10	import json
	11	import math
	12	import netrc
	13	import os
	14	import random
	15	import re
	16	import subprocess
	17	import sys
	18	import time
	19	import types
	20	import urllib.parse
	21	import urllib.request
	22	import xml.etree.ElementTree
	23
	24	from ..compat import functools # isort: split
	25	from ..compat import (
	26	compat_etree_fromstring,
	27	compat_expanduser,
	28	compat_os_name,
	29	urllib_req_to_req,
	30	)
	31	from ..cookies import LenientSimpleCookie
	32	from ..downloader.f4m import get_base_url, remove_encrypted_media
	33	from ..downloader.hls import HlsFD
	34	from ..networking import HEADRequest, Request
	35	from ..networking.exceptions import (
	36	HTTPError,
	37	IncompleteRead,
	38	network_exceptions,
	39	)
	40	from ..utils import (
	41	IDENTITY,
	42	JSON_LD_RE,
	43	NO_DEFAULT,
	44	ExtractorError,
	45	FormatSorter,
	46	GeoRestrictedError,
	47	GeoUtils,
	48	LenientJSONDecoder,
	49	Popen,
	50	RegexNotFoundError,
	51	RetryManager,
	52	UnsupportedError,
	53	age_restricted,
	54	base_url,
	55	bug_reports_message,
	56	classproperty,
	57	clean_html,
	58	deprecation_warning,
	59	determine_ext,
	60	dict_get,
	61	encode_data_uri,
	62	error_to_compat_str,
	63	extract_attributes,
	64	filter_dict,
	65	fix_xml_ampersands,
	66	float_or_none,
	67	format_field,
	68	int_or_none,
	69	join_nonempty,
	70	js_to_json,
	71	mimetype2ext,
	72	netrc_from_content,
	73	orderedSet,
	74	parse_bitrate,
	75	parse_codecs,
	76	parse_duration,
	77	parse_iso8601,
	78	parse_m3u8_attributes,
	79	parse_resolution,
	80	sanitize_filename,
	81	sanitize_url,
	82	smuggle_url,
	83	str_or_none,
	84	str_to_int,
	85	strip_or_none,
	86	traverse_obj,
	87	truncate_string,
	88	try_call,
	89	try_get,
	90	unescapeHTML,
	91	unified_strdate,
	92	unified_timestamp,
	93	url_basename,
	94	url_or_none,
	95	urlhandle_detect_ext,
	96	urljoin,
	97	variadic,
	98	xpath_element,
	99	xpath_text,
	100	xpath_with_ns,
	101	)
	102
	103
	104	class InfoExtractor:
	105	"""Information Extractor class.
	106
	107	Information extractors are the classes that, given a URL, extract
	108	information about the video (or videos) the URL refers to. This
	109	information includes the real video URL, the video title, author and
	110	others. The information is stored in a dictionary which is then
	111	passed to the YoutubeDL. The YoutubeDL processes this
	112	information possibly downloading the video to the file system, among
	113	other possible outcomes.
	114
	115	The type field determines the type of the result.
	116	By far the most common value (and the default if _type is missing) is
	117	"video", which indicates a single video.
	118
	119	For a video, the dictionaries must include the following fields:
	120
	121	id: Video identifier.
	122	title: Video title, unescaped. Set to an empty string if video has
	123	no title as opposed to "None" which signifies that the
	124	extractor failed to obtain a title
	125
	126	Additionally, it must contain either a formats entry or a url one:
	127
	128	formats: A list of dictionaries for each format available, ordered
	129	from worst to best quality.
	130
	131	Potential fields:
	132	* url The mandatory URL representing the media:
	133	for plain file media - HTTP URL of this file,
	134	for RTMP - RTMP URL,
	135	for HLS - URL of the M3U8 media playlist,
	136	for HDS - URL of the F4M manifest,
	137	for DASH
	138	- HTTP URL to plain file media (in case of
	139	unfragmented media)
	140	- URL of the MPD manifest or base URL
	141	representing the media if MPD manifest
	142	is parsed from a string (in case of
	143	fragmented media)
	144	for MSS - URL of the ISM manifest.
	145	* request_data Data to send in POST request to the URL
	146	* manifest_url
	147	The URL of the manifest file in case of
	148	fragmented media:
	149	for HLS - URL of the M3U8 master playlist,
	150	for HDS - URL of the F4M manifest,
	151	for DASH - URL of the MPD manifest,
	152	for MSS - URL of the ISM manifest.
	153	* manifest_stream_number (For internal use only)
	154	The index of the stream in the manifest file
	155	* ext Will be calculated from URL if missing
	156	* format A human-readable description of the format
	157	("mp4 container with h264/opus").
	158	Calculated from the format_id, width, height,
	159	and format_note fields if missing.
	160	* format_id A short description of the format
	161	("mp4_h264_opus" or "19").
	162	Technically optional, but strongly recommended.
	163	* format_note Additional info about the format
	164	("3D" or "DASH video")
	165	* width Width of the video, if known
	166	* height Height of the video, if known
	167	* aspect_ratio Aspect ratio of the video, if known
	168	Automatically calculated from width and height
	169	* resolution Textual description of width and height
	170	Automatically calculated from width and height
	171	* dynamic_range The dynamic range of the video. One of:
	172	"SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
	173	* tbr Average bitrate of audio and video in KBit/s
	174	* abr Average audio bitrate in KBit/s
	175	* acodec Name of the audio codec in use
	176	* asr Audio sampling rate in Hertz
	177	* audio_channels Number of audio channels
	178	* vbr Average video bitrate in KBit/s
	179	* fps Frame rate
	180	* vcodec Name of the video codec in use
	181	* container Name of the container format
	182	* filesize The number of bytes, if known in advance
	183	* filesize_approx An estimate for the number of bytes
	184	* player_url SWF Player URL (used for rtmpdump).
	185	* protocol The protocol that will be used for the actual
	186	download, lower-case. One of "http", "https" or
	187	one of the protocols defined in downloader.PROTOCOL_MAP
	188	* fragment_base_url
	189	Base URL for fragments. Each fragment's path
	190	value (if present) will be relative to
	191	this URL.
	192	* fragments A list of fragments of a fragmented media.
	193	Each fragment entry must contain either an url
	194	or a path. If an url is present it should be
	195	considered by a client. Otherwise both path and
	196	fragment_base_url must be present. Here is
	197	the list of all potential fields:
	198	* "url" - fragment's URL
	199	* "path" - fragment's path relative to
	200	fragment_base_url
	201	* "duration" (optional, int or float)
	202	* "filesize" (optional, int)
	203	* is_from_start Is a live format that can be downloaded
	204	from the start. Boolean
	205	* preference Order number of this format. If this field is
	206	present and not None, the formats get sorted
	207	by this field, regardless of all other values.
	208	-1 for default (order by other properties),
	209	-2 or smaller for less than default.
	210	< -1000 to hide the format (if there is
	211	another one which is strictly better)
	212	* language Language code, e.g. "de" or "en-US".
	213	* language_preference Is this in the language mentioned in
	214	the URL?
	215	10 if it's what the URL is about,
	216	-1 for default (don't know),
	217	-10 otherwise, other values reserved for now.
	218	* quality Order number of the video quality of this
	219	format, irrespective of the file format.
	220	-1 for default (order by other properties),
	221	-2 or smaller for less than default.
	222	* source_preference Order number for this video source
	223	(quality takes higher priority)
	224	-1 for default (order by other properties),
	225	-2 or smaller for less than default.
	226	* http_headers A dictionary of additional HTTP headers
	227	to add to the request.
	228	* stretched_ratio If given and not 1, indicates that the
	229	video's pixels are not square.
	230	width : height ratio as float.
	231	* no_resume The server does not support resuming the
	232	(HTTP or RTMP) download. Boolean.
	233	* has_drm True if the format has DRM and cannot be downloaded.
	234	'maybe' if the format may have DRM and has to be tested before download.
	235	* extra_param_to_segment_url A query string to append to each
	236	fragment's URL, or to update each existing query string
	237	with. Only applied by the native HLS/DASH downloaders.
	238	* hls_aes A dictionary of HLS AES-128 decryption information
	239	used by the native HLS downloader to override the
	240	values in the media playlist when an '#EXT-X-KEY' tag
	241	is present in the playlist:
	242	* uri The URI from which the key will be downloaded
	243	* key The key (as hex) used to decrypt fragments.
	244	If `key` is given, any key URI will be ignored
	245	* iv The IV (as hex) used to decrypt fragments
	246	* downloader_options A dictionary of downloader options
	247	(For internal use only)
	248	* http_chunk_size Chunk size for HTTP downloads
	249	* ffmpeg_args Extra arguments for ffmpeg downloader
	250	RTMP formats can also have the additional fields: page_url,
	251	app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
	252	rtmp_protocol, rtmp_real_time
	253
	254	url: Final video URL.
	255	ext: Video filename extension.
	256	format: The video format, defaults to ext (used for --get-format)
	257	player_url: SWF Player URL (used for rtmpdump).
	258
	259	The following fields are optional:
	260
	261	direct: True if a direct video file was given (must only be set by GenericIE)
	262	alt_title: A secondary title of the video.
	263	display_id An alternative identifier for the video, not necessarily
	264	unique, but available before title. Typically, id is
	265	something like "4234987", title "Dancing naked mole rats",
	266	and display_id "dancing-naked-mole-rats"
	267	thumbnails: A list of dictionaries, with the following entries:
	268	* "id" (optional, string) - Thumbnail format ID
	269	* "url"
	270	* "preference" (optional, int) - quality of the image
	271	* "width" (optional, int)
	272	* "height" (optional, int)
	273	* "resolution" (optional, string "{width}x{height}",
	274	deprecated)
	275	* "filesize" (optional, int)
	276	* "http_headers" (dict) - HTTP headers for the request
	277	thumbnail: Full URL to a video thumbnail image.
	278	description: Full video description.
	279	uploader: Full name of the video uploader.
	280	license: License name the video is licensed under.
	281	creator: The creator of the video.
	282	timestamp: UNIX timestamp of the moment the video was uploaded
	283	upload_date: Video upload date in UTC (YYYYMMDD).
	284	If not explicitly set, calculated from timestamp
	285	release_timestamp: UNIX timestamp of the moment the video was released.
	286	If it is not clear whether to use timestamp or this, use the former
	287	release_date: The date (YYYYMMDD) when the video was released in UTC.
	288	If not explicitly set, calculated from release_timestamp
	289	release_year: Year (YYYY) as integer when the video or album was released.
	290	To be used if no exact release date is known.
	291	If not explicitly set, calculated from release_date.
	292	modified_timestamp: UNIX timestamp of the moment the video was last modified.
	293	modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
	294	If not explicitly set, calculated from modified_timestamp
	295	uploader_id: Nickname or id of the video uploader.
	296	uploader_url: Full URL to a personal webpage of the video uploader.
	297	channel: Full name of the channel the video is uploaded on.
	298	Note that channel fields may or may not repeat uploader
	299	fields. This depends on a particular extractor.
	300	channel_id: Id of the channel.
	301	channel_url: Full URL to a channel webpage.
	302	channel_follower_count: Number of followers of the channel.
	303	channel_is_verified: Whether the channel is verified on the platform.
	304	location: Physical location where the video was filmed.
	305	subtitles: The available subtitles as a dictionary in the format
	306	{tag: subformats}. "tag" is usually a language code, and
	307	"subformats" is a list sorted from lower to higher
	308	preference, each element is a dictionary with the "ext"
	309	entry and one of:
	310	* "data": The subtitles file contents
	311	* "url": A URL pointing to the subtitles file
	312	It can optionally also have:
	313	* "name": Name or description of the subtitles
	314	* "http_headers": A dictionary of additional HTTP headers
	315	to add to the request.
	316	"ext" will be calculated from URL if missing
	317	automatic_captions: Like 'subtitles'; contains automatically generated
	318	captions instead of normal subtitles
	319	duration: Length of the video in seconds, as an integer or float.
	320	view_count: How many users have watched the video on the platform.
	321	concurrent_view_count: How many users are currently watching the video on the platform.
	322	like_count: Number of positive ratings of the video
	323	dislike_count: Number of negative ratings of the video
	324	repost_count: Number of reposts of the video
	325	average_rating: Average rating given by users, the scale used depends on the webpage
	326	comment_count: Number of comments on the video
	327	comments: A list of comments, each with one or more of the following
	328	properties (all but one of text or html optional):
	329	* "author" - human-readable name of the comment author
	330	* "author_id" - user ID of the comment author
	331	* "author_thumbnail" - The thumbnail of the comment author
	332	* "author_url" - The url to the comment author's page
	333	* "author_is_verified" - Whether the author is verified
	334	on the platform
	335	* "author_is_uploader" - Whether the comment is made by
	336	the video uploader
	337	* "id" - Comment ID
	338	* "html" - Comment as HTML
	339	* "text" - Plain text of the comment
	340	* "timestamp" - UNIX timestamp of comment
	341	* "parent" - ID of the comment this one is replying to.
	342	Set to "root" to indicate that this is a
	343	comment to the original video.
	344	* "like_count" - Number of positive ratings of the comment
	345	* "dislike_count" - Number of negative ratings of the comment
	346	* "is_favorited" - Whether the comment is marked as
	347	favorite by the video uploader
	348	* "is_pinned" - Whether the comment is pinned to
	349	the top of the comments
	350	age_limit: Age restriction for the video, as an integer (years)
	351	webpage_url: The URL to the video webpage, if given to yt-dlp it
	352	should allow to get the same result again. (It will be set
	353	by YoutubeDL if it's missing)
	354	categories: A list of categories that the video falls in, for example
	355	["Sports", "Berlin"]
	356	tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
	357	cast: A list of the video cast
	358	is_live: True, False, or None (=unknown). Whether this video is a
	359	live stream that goes on instead of a fixed-length video.
	360	was_live: True, False, or None (=unknown). Whether this video was
	361	originally a live stream.
	362	live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
	363	or 'post_live' (was live, but VOD is not yet processed)
	364	If absent, automatically set from is_live, was_live
	365	start_time: Time in seconds where the reproduction should start, as
	366	specified in the URL.
	367	end_time: Time in seconds where the reproduction should end, as
	368	specified in the URL.
	369	chapters: A list of dictionaries, with the following entries:
	370	* "start_time" - The start time of the chapter in seconds
	371	* "end_time" - The end time of the chapter in seconds
	372	* "title" (optional, string)
	373	heatmap: A list of dictionaries, with the following entries:
	374	* "start_time" - The start time of the data point in seconds
	375	* "end_time" - The end time of the data point in seconds
	376	* "value" - The normalized value of the data point (float between 0 and 1)
	377	playable_in_embed: Whether this video is allowed to play in embedded
	378	players on other sites. Can be True (=always allowed),
	379	False (=never allowed), None (=unknown), or a string
	380	specifying the criteria for embeddability; e.g. 'whitelist'
	381	availability: Under what condition the video is available. One of
	382	'private', 'premium_only', 'subscriber_only', 'needs_auth',
	383	'unlisted' or 'public'. Use 'InfoExtractor._availability'
	384	to set it
	385	media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
	386	_old_archive_ids: A list of old archive ids needed for backward compatibility
	387	_format_sort_fields: A list of fields to use for sorting formats
	388	__post_extractor: A function to be called just before the metadata is
	389	written to either disk, logger or console. The function
	390	must return a dict which will be added to the info_dict.
	391	This is useful for additional information that is
	392	time-consuming to extract. Note that the fields thus
	393	extracted will not be available to output template and
	394	match_filter. So, only "comments" and "comment_count" are
	395	currently allowed to be extracted via this method.
	396
	397	The following fields should only be used when the video belongs to some logical
	398	chapter or section:
	399
	400	chapter: Name or title of the chapter the video belongs to.
	401	chapter_number: Number of the chapter the video belongs to, as an integer.
	402	chapter_id: Id of the chapter the video belongs to, as a unicode string.
	403
	404	The following fields should only be used when the video is an episode of some
	405	series, programme or podcast:
	406
	407	series: Title of the series or programme the video episode belongs to.
	408	series_id: Id of the series or programme the video episode belongs to, as a unicode string.
	409	season: Title of the season the video episode belongs to.
	410	season_number: Number of the season the video episode belongs to, as an integer.
	411	season_id: Id of the season the video episode belongs to, as a unicode string.
	412	episode: Title of the video episode. Unlike mandatory video title field,
	413	this field should denote the exact title of the video episode
	414	without any kind of decoration.
	415	episode_number: Number of the video episode within a season, as an integer.
	416	episode_id: Id of the video episode, as a unicode string.
	417
	418	The following fields should only be used when the media is a track or a part of
	419	a music album:
	420
	421	track: Title of the track.
	422	track_number: Number of the track within an album or a disc, as an integer.
	423	track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
	424	as a unicode string.
	425	artist: Artist(s) of the track.
	426	genre: Genre(s) of the track.
	427	album: Title of the album the track belongs to.
	428	album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
	429	album_artist: List of all artists that appeared on the album (e.g.
	430	"Ash Borer / Fell Voices" or "Various Artists", useful for splits
	431	and compilations).
	432	disc_number: Number of the disc or other physical medium the track belongs to,
	433	as an integer.
	434	composer: Composer of the piece
	435
	436	The following fields should only be set for clips that should be cut from the original video:
	437
	438	section_start: Start time of the section in seconds
	439	section_end: End time of the section in seconds
	440
	441	The following fields should only be set for storyboards:
	442	rows: Number of rows in each storyboard fragment, as an integer
	443	columns: Number of columns in each storyboard fragment, as an integer
	444
	445	Unless mentioned otherwise, the fields should be Unicode strings.
	446
	447	Unless mentioned otherwise, None is equivalent to absence of information.
	448
	449
	450	_type "playlist" indicates multiple videos.
	451	There must be a key "entries", which is a list, an iterable, or a PagedList
	452	object, each element of which is a valid dictionary by this specification.
	453
	454	Additionally, playlists can have "id", "title", and any other relevant
	455	attributes with the same semantics as videos (see above).
	456
	457	It can also have the following optional fields:
	458
	459	playlist_count: The total number of videos in a playlist. If not given,
	460	YoutubeDL tries to calculate it from "entries"
	461
	462
	463	_type "multi_video" indicates that there are multiple videos that
	464	form a single show, for example multiple acts of an opera or TV episode.
	465	It must have an entries key like a playlist and contain all the keys
	466	required for a video at the same time.
	467
	468
	469	_type "url" indicates that the video must be extracted from another
	470	location, possibly by a different extractor. Its only required key is:
	471	"url" - the next URL to extract.
	472	The key "ie_key" can be set to the class name (minus the trailing "IE",
	473	e.g. "Youtube") if the extractor class is known in advance.
	474	Additionally, the dictionary may have any properties of the resolved entity
	475	known in advance, for example "title" if the title of the referred video is
	476	known ahead of time.
	477
	478
	479	_type "url_transparent" entities have the same specification as "url", but
	480	indicate that the given additional information is more precise than the one
	481	associated with the resolved URL.
	482	This is useful when a site employs a video service that hosts the video and
	483	its technical metadata, but that video service does not embed a useful
	484	title, description etc.
	485
	486
	487	Subclasses of this should also be added to the list of extractors and
	488	should define _VALID_URL as a regexp or a Sequence of regexps, and
	489	re-define the _real_extract() and (optionally) _real_initialize() methods.
	490
	491	Subclasses may also override suitable() if necessary, but ensure the function
	492	signature is preserved and that this function imports everything it needs
	493	(except other extractors), so that lazy_extractors works correctly.
	494
	495	Subclasses can define a list of _EMBED_REGEX, which will be searched for in
	496	the HTML of Generic webpages. It may also override _extract_embed_urls
	497	or _extract_from_webpage as necessary. While these are normally classmethods,
	498	_extract_from_webpage is allowed to be an instance method.
	499
	500	_extract_from_webpage may raise self.StopExtraction() to stop further

1

import base64

import collections

import getpass

import hashlib

import http.client

import http.cookiejar

import http.cookies

import inspect

import itertools

import json

import math

import netrc

import os

import random

import re

import subprocess

import sys

import time

import types

import urllib.parse

import urllib.request

22

import xml.etree.ElementTree

23

24

from ..compat import functools # isort: split

25

from ..compat import (

26

compat_etree_fromstring,

compat_expanduser,

compat_os_name,

urllib_req_to_req,

)

from ..cookies import LenientSimpleCookie

32

from ..downloader.f4m import get_base_url, remove_encrypted_media

33

from ..downloader.hls import HlsFD

34

from ..networking import HEADRequest, Request

35

from ..networking.exceptions import (

HTTPError,

IncompleteRead,

network_exceptions,

)

from ..utils import (

IDENTITY,

JSON_LD_RE,

NO_DEFAULT,

ExtractorError,

FormatSorter,

GeoRestrictedError,

GeoUtils,

LenientJSONDecoder,

Popen,

RegexNotFoundError,

RetryManager,

UnsupportedError,

age_restricted,

base_url,

bug_reports_message,

classproperty,

clean_html,

deprecation_warning,

determine_ext,

dict_get,

encode_data_uri,

error_to_compat_str,

extract_attributes,

filter_dict,

fix_xml_ampersands,

float_or_none,

format_field,

int_or_none,

join_nonempty,

js_to_json,

mimetype2ext,

netrc_from_content,

orderedSet,

parse_bitrate,

parse_codecs,

parse_duration,

parse_iso8601,

parse_m3u8_attributes,

parse_resolution,

sanitize_filename,

sanitize_url,

smuggle_url,

str_or_none,

str_to_int,

strip_or_none,

traverse_obj,

truncate_string,

try_call,

try_get,

unescapeHTML,

unified_strdate,

unified_timestamp,

url_basename,

url_or_none,

urlhandle_detect_ext,

urljoin,

variadic,

xpath_element,

xpath_text,

xpath_with_ns,

)

class InfoExtractor:

"""Information Extractor class.

106

107

Information extractors are the classes that, given a URL, extract

108

information about the video (or videos) the URL refers to. This

109

information includes the real video URL, the video title, author and

110

others. The information is stored in a dictionary which is then

111

passed to the YoutubeDL. The YoutubeDL processes this

112

information possibly downloading the video to the file system, among

113

other possible outcomes.

114

115

The type field determines the type of the result.

116

By far the most common value (and the default if _type is missing) is

117

"video", which indicates a single video.

118

119

For a video, the dictionaries must include the following fields:

120

121

id: Video identifier.

122

title: Video title, unescaped. Set to an empty string if video has

123

no title as opposed to "None" which signifies that the

124

extractor failed to obtain a title

125

126

Additionally, it must contain either a formats entry or a url one:

127

128

formats: A list of dictionaries for each format available, ordered

129

from worst to best quality.

130

131

Potential fields:

132

* url The mandatory URL representing the media:

133

for plain file media - HTTP URL of this file,

134

for RTMP - RTMP URL,

135

for HLS - URL of the M3U8 media playlist,

136

for HDS - URL of the F4M manifest,

137

for DASH

138

- HTTP URL to plain file media (in case of

139

unfragmented media)

140

- URL of the MPD manifest or base URL

141

representing the media if MPD manifest

142

is parsed from a string (in case of

143

fragmented media)

144

for MSS - URL of the ISM manifest.

145

* request_data Data to send in POST request to the URL

146

* manifest_url

147

The URL of the manifest file in case of

148

fragmented media:

149

for HLS - URL of the M3U8 master playlist,

150

for HDS - URL of the F4M manifest,

151

for DASH - URL of the MPD manifest,

152

for MSS - URL of the ISM manifest.

153

* manifest_stream_number (For internal use only)

154

The index of the stream in the manifest file

155

* ext Will be calculated from URL if missing

156

* format A human-readable description of the format

157

("mp4 container with h264/opus").

158

Calculated from the format_id, width, height,

159

and format_note fields if missing.

160

* format_id A short description of the format

161

("mp4_h264_opus" or "19").

162

Technically optional, but strongly recommended.

163

* format_note Additional info about the format

164

("3D" or "DASH video")

165

* width Width of the video, if known

166

* height Height of the video, if known

167

* aspect_ratio Aspect ratio of the video, if known

168

Automatically calculated from width and height

169

* resolution Textual description of width and height

170

Automatically calculated from width and height

171

* dynamic_range The dynamic range of the video. One of:

172

"SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"

173

* tbr Average bitrate of audio and video in KBit/s

174

* abr Average audio bitrate in KBit/s

175

* acodec Name of the audio codec in use

176

* asr Audio sampling rate in Hertz

177

* audio_channels Number of audio channels

178

* vbr Average video bitrate in KBit/s

179

* fps Frame rate

180

* vcodec Name of the video codec in use

181

* container Name of the container format

182

* filesize The number of bytes, if known in advance

183

* filesize_approx An estimate for the number of bytes

184

* player_url SWF Player URL (used for rtmpdump).

185

* protocol The protocol that will be used for the actual

186

download, lower-case. One of "http", "https" or

187

one of the protocols defined in downloader.PROTOCOL_MAP

188

* fragment_base_url

189

Base URL for fragments. Each fragment's path

190

value (if present) will be relative to

191

this URL.

192

* fragments A list of fragments of a fragmented media.

193

Each fragment entry must contain either an url

194

or a path. If an url is present it should be

195

considered by a client. Otherwise both path and

196

fragment_base_url must be present. Here is

197

the list of all potential fields:

198

* "url" - fragment's URL

199

* "path" - fragment's path relative to

200

fragment_base_url

201

* "duration" (optional, int or float)

202

* "filesize" (optional, int)

203

* is_from_start Is a live format that can be downloaded

204

from the start. Boolean

205

* preference Order number of this format. If this field is

206

present and not None, the formats get sorted

207

by this field, regardless of all other values.

208

-1 for default (order by other properties),

209

-2 or smaller for less than default.

210

< -1000 to hide the format (if there is

211

another one which is strictly better)

212

* language Language code, e.g. "de" or "en-US".

213

* language_preference Is this in the language mentioned in

214

the URL?

215

10 if it's what the URL is about,

216

-1 for default (don't know),

217

-10 otherwise, other values reserved for now.

218

* quality Order number of the video quality of this

219

format, irrespective of the file format.

220

-1 for default (order by other properties),

221

-2 or smaller for less than default.

222

* source_preference Order number for this video source

223

(quality takes higher priority)

224

-1 for default (order by other properties),

225

-2 or smaller for less than default.

226

* http_headers A dictionary of additional HTTP headers

227

to add to the request.

228

* stretched_ratio If given and not 1, indicates that the

229

video's pixels are not square.

230

width : height ratio as float.

231

* no_resume The server does not support resuming the

232

(HTTP or RTMP) download. Boolean.

233

* has_drm True if the format has DRM and cannot be downloaded.

234

'maybe' if the format may have DRM and has to be tested before download.

235

* extra_param_to_segment_url A query string to append to each

236

fragment's URL, or to update each existing query string

237

with. Only applied by the native HLS/DASH downloaders.

238

* hls_aes A dictionary of HLS AES-128 decryption information

239

used by the native HLS downloader to override the

240

values in the media playlist when an '#EXT-X-KEY' tag

241

is present in the playlist:

242

* uri The URI from which the key will be downloaded

243

* key The key (as hex) used to decrypt fragments.

244

If `key` is given, any key URI will be ignored

245

* iv The IV (as hex) used to decrypt fragments

246

* downloader_options A dictionary of downloader options

247

(For internal use only)

248

* http_chunk_size Chunk size for HTTP downloads

249

* ffmpeg_args Extra arguments for ffmpeg downloader

250

RTMP formats can also have the additional fields: page_url,

251

app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,

252

rtmp_protocol, rtmp_real_time

253

254

url: Final video URL.

255

ext: Video filename extension.

256

format: The video format, defaults to ext (used for --get-format)

257

player_url: SWF Player URL (used for rtmpdump).

258

259

The following fields are optional:

260

261

direct: True if a direct video file was given (must only be set by GenericIE)

262

alt_title: A secondary title of the video.

263

display_id An alternative identifier for the video, not necessarily

264

unique, but available before title. Typically, id is

265

something like "4234987", title "Dancing naked mole rats",

266

and display_id "dancing-naked-mole-rats"

267

thumbnails: A list of dictionaries, with the following entries:

268

* "id" (optional, string) - Thumbnail format ID

269

* "url"

270

* "preference" (optional, int) - quality of the image

271

* "width" (optional, int)

272

* "height" (optional, int)

273

* "resolution" (optional, string "{width}x{height}",

274

deprecated)

275

* "filesize" (optional, int)

276

* "http_headers" (dict) - HTTP headers for the request

277

thumbnail: Full URL to a video thumbnail image.

278

description: Full video description.

279

uploader: Full name of the video uploader.

280

license: License name the video is licensed under.

281

creator: The creator of the video.

282

timestamp: UNIX timestamp of the moment the video was uploaded

283

upload_date: Video upload date in UTC (YYYYMMDD).

284

If not explicitly set, calculated from timestamp

285

release_timestamp: UNIX timestamp of the moment the video was released.

286

If it is not clear whether to use timestamp or this, use the former

287

release_date: The date (YYYYMMDD) when the video was released in UTC.

288

If not explicitly set, calculated from release_timestamp

289

release_year: Year (YYYY) as integer when the video or album was released.

290

To be used if no exact release date is known.

291

If not explicitly set, calculated from release_date.

292

modified_timestamp: UNIX timestamp of the moment the video was last modified.

293

modified_date: The date (YYYYMMDD) when the video was last modified in UTC.

294

If not explicitly set, calculated from modified_timestamp

295

uploader_id: Nickname or id of the video uploader.

296

uploader_url: Full URL to a personal webpage of the video uploader.

297

channel: Full name of the channel the video is uploaded on.

298

Note that channel fields may or may not repeat uploader

299

fields. This depends on a particular extractor.

300

channel_id: Id of the channel.

301

channel_url: Full URL to a channel webpage.

302

channel_follower_count: Number of followers of the channel.

303

channel_is_verified: Whether the channel is verified on the platform.

304

location: Physical location where the video was filmed.

305

subtitles: The available subtitles as a dictionary in the format

306

{tag: subformats}. "tag" is usually a language code, and

307

"subformats" is a list sorted from lower to higher

308

preference, each element is a dictionary with the "ext"

309

entry and one of:

310

* "data": The subtitles file contents

311

* "url": A URL pointing to the subtitles file

312

It can optionally also have:

313

* "name": Name or description of the subtitles

314

* "http_headers": A dictionary of additional HTTP headers

315

to add to the request.

316

"ext" will be calculated from URL if missing

317

automatic_captions: Like 'subtitles'; contains automatically generated

318

captions instead of normal subtitles

319

duration: Length of the video in seconds, as an integer or float.

320

view_count: How many users have watched the video on the platform.

321

concurrent_view_count: How many users are currently watching the video on the platform.

322

like_count: Number of positive ratings of the video

323

dislike_count: Number of negative ratings of the video

324

repost_count: Number of reposts of the video

325

average_rating: Average rating given by users, the scale used depends on the webpage

326

comment_count: Number of comments on the video

327

comments: A list of comments, each with one or more of the following

328

properties (all but one of text or html optional):

329

* "author" - human-readable name of the comment author

330

* "author_id" - user ID of the comment author

331

* "author_thumbnail" - The thumbnail of the comment author

332

* "author_url" - The url to the comment author's page

333

* "author_is_verified" - Whether the author is verified

334

on the platform

335

* "author_is_uploader" - Whether the comment is made by

336

the video uploader

337

* "id" - Comment ID

338

* "html" - Comment as HTML

339

* "text" - Plain text of the comment

340

* "timestamp" - UNIX timestamp of comment

341

* "parent" - ID of the comment this one is replying to.

342

Set to "root" to indicate that this is a

343

comment to the original video.

344

* "like_count" - Number of positive ratings of the comment

345

* "dislike_count" - Number of negative ratings of the comment

346

* "is_favorited" - Whether the comment is marked as

347

favorite by the video uploader

348

* "is_pinned" - Whether the comment is pinned to

349

the top of the comments

350

age_limit: Age restriction for the video, as an integer (years)

351

webpage_url: The URL to the video webpage, if given to yt-dlp it

352

should allow getting the same result again. (It will be set

353

by YoutubeDL if it's missing)

354

categories: A list of categories that the video falls in, for example

355

["Sports", "Berlin"]

356

tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]

357

cast: A list of the video cast

358

is_live: True, False, or None (=unknown). Whether this video is a

359

live stream that goes on instead of a fixed-length video.

360

was_live: True, False, or None (=unknown). Whether this video was

361

originally a live stream.

362

live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',

363

or 'post_live' (was live, but VOD is not yet processed)

364

If absent, automatically set from is_live, was_live

365

start_time: Time in seconds where the reproduction should start, as

366

specified in the URL.

367

end_time: Time in seconds where the reproduction should end, as

368

specified in the URL.

369

chapters: A list of dictionaries, with the following entries:

370

* "start_time" - The start time of the chapter in seconds

371

* "end_time" - The end time of the chapter in seconds

372

* "title" (optional, string)

373

heatmap: A list of dictionaries, with the following entries:

374

* "start_time" - The start time of the data point in seconds

375

* "end_time" - The end time of the data point in seconds

376

* "value" - The normalized value of the data point (float between 0 and 1)

377

playable_in_embed: Whether this video is allowed to play in embedded

378

players on other sites. Can be True (=always allowed),

379

False (=never allowed), None (=unknown), or a string

380

specifying the criteria for embedability; e.g. 'whitelist'

381

availability: Under what condition the video is available. One of

382

'private', 'premium_only', 'subscriber_only', 'needs_auth',

383

'unlisted' or 'public'. Use 'InfoExtractor._availability'

384

to set it

385

media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"

386

_old_archive_ids: A list of old archive ids needed for backward compatibility

387

_format_sort_fields: A list of fields to use for sorting formats

388

__post_extractor: A function to be called just before the metadata is

389

written to either disk, logger or console. The function

390

must return a dict which will be added to the info_dict.

391

This is useful for additional information that is

392

time-consuming to extract. Note that the fields thus

393

extracted will not be available to output template and

394

match_filter. So, only "comments" and "comment_count" are

395

currently allowed to be extracted via this method.

396

397

The following fields should only be used when the video belongs to some logical

398

chapter or section:

399

400

chapter: Name or title of the chapter the video belongs to.

401

chapter_number: Number of the chapter the video belongs to, as an integer.

402

chapter_id: Id of the chapter the video belongs to, as a unicode string.

403

404

The following fields should only be used when the video is an episode of some

405

series, programme or podcast:

406

407

series: Title of the series or programme the video episode belongs to.

408

series_id: Id of the series or programme the video episode belongs to, as a unicode string.

409

season: Title of the season the video episode belongs to.

410

season_number: Number of the season the video episode belongs to, as an integer.

411

season_id: Id of the season the video episode belongs to, as a unicode string.

412

episode: Title of the video episode. Unlike mandatory video title field,

413

this field should denote the exact title of the video episode

414

without any kind of decoration.

415

episode_number: Number of the video episode within a season, as an integer.

416

episode_id: Id of the video episode, as a unicode string.

417

418

The following fields should only be used when the media is a track or a part of

419

a music album:

420

421

track: Title of the track.

422

track_number: Number of the track within an album or a disc, as an integer.

423

track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),

424

as a unicode string.

425

artist: Artist(s) of the track.

426

genre: Genre(s) of the track.

427

album: Title of the album the track belongs to.

428

album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).

429

album_artist: List of all artists that appeared on the album (e.g.

430

"Ash Borer / Fell Voices" or "Various Artists", useful for splits

431

and compilations).

432

disc_number: Number of the disc or other physical medium the track belongs to,

433

as an integer.

434

composer: Composer of the piece

435

436

The following fields should only be set for clips that should be cut from the original video:

437

438

section_start: Start time of the section in seconds

439

section_end: End time of the section in seconds

440

441

The following fields should only be set for storyboards:

442

rows: Number of rows in each storyboard fragment, as an integer

443

columns: Number of columns in each storyboard fragment, as an integer

444

445

Unless mentioned otherwise, the fields should be Unicode strings.

446

447

Unless mentioned otherwise, None is equivalent to absence of information.

448

449

450

_type "playlist" indicates multiple videos.

451

There must be a key "entries", which is a list, an iterable, or a PagedList

452

object, each element of which is a valid dictionary by this specification.

453

454

Additionally, playlists can have "id", "title", and any other relevant

455

attributes with the same semantics as videos (see above).

456

457

It can also have the following optional fields:

458

459

playlist_count: The total number of videos in a playlist. If not given,

460

YoutubeDL tries to calculate it from "entries"

461

462

463

_type "multi_video" indicates that there are multiple videos that

464

form a single show, for example multiple acts of an opera or TV episode.

465

It must have an entries key like a playlist and contain all the keys

466

required for a video at the same time.

467

468

469

_type "url" indicates that the video must be extracted from another

470

location, possibly by a different extractor. Its only required key is:

471

"url" - the next URL to extract.

472

The key "ie_key" can be set to the class name (minus the trailing "IE",

473

e.g. "Youtube") if the extractor class is known in advance.

474

Additionally, the dictionary may have any properties of the resolved entity

475

known in advance, for example "title" if the title of the referred video is

known ahead of time.

_type "url_transparent" entities have the same specification as "url", but

480

indicate that the given additional information is more precise than the one

481

associated with the resolved URL.

482

This is useful when a site employs a video service that hosts the video and

483

its technical metadata, but that video service does not embed a useful

484

title, description etc.

485

486

487

Subclasses of this should also be added to the list of extractors and

488

should define _VALID_URL as a regexp or a Sequence of regexps, and

489

re-define the _real_extract() and (optionally) _real_initialize() methods.

490

491

Subclasses may also override suitable() if necessary, but ensure the function

492

signature is preserved and that this function imports everything it needs

493

(except other extractors), so that lazy_extractors works correctly.

494

495

Subclasses can define a list of _EMBED_REGEX, which will be searched for in

496

the HTML of Generic webpages. It may also override _extract_embed_urls

497

or _extract_from_webpage as necessary. While these are normally classmethods,

498

_extract_from_webpage is allowed to be an instance method.

499

500

_extract_from_webpage may raise self.StopExtraction() to stop further

501

processing of the webpage and obtain exclusive rights to it. This is useful

502

when the extractor cannot reliably be matched using just the URL,

503

e.g. invidious/peertube instances

504

505

Embed-only extractors can be defined by setting _VALID_URL = False.

506

507

To support username + password (or netrc) login, the extractor must define a

508

_NETRC_MACHINE and re-define _perform_login(username, password) and

509

(optionally) _initialize_pre_login() methods. The _perform_login method will

510

be called between _initialize_pre_login and _real_initialize if credentials

511

are passed by the user. In cases where it is necessary to have the login

512

process as part of the extraction rather than initialization, _perform_login

513

can be left undefined.

514

515

_GEO_BYPASS attribute may be set to False in order to disable

516

geo restriction bypass mechanisms for a particular extractor.

517

Though it won't disable explicit geo restriction bypass based on

518

country code provided with geo_bypass_country.

519

520

_GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted

521

countries for this extractor. One of these countries will be used by

522

geo restriction bypass mechanism right away in order to bypass

523

geo restriction, of course, if the mechanism is not disabled.

524

525

_GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted

526

IP blocks in CIDR notation for this extractor. One of these IP blocks

527

will be used by geo restriction bypass mechanism similarly

528

to _GEO_COUNTRIES.

529

530

The _ENABLED attribute should be set to False for IEs that

531

are disabled by default and must be explicitly enabled.

532

533

The _WORKING attribute should be set to False for broken IEs

534

in order to warn the users and skip the tests.

"""

# Instance state defaults; __init__ assigns these again on every instance
_downloader = None
_ready = False
_x_forwarded_for_ip = None

# Geo restriction bypass configuration
_GEO_BYPASS = True
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None

# Extractor status flags
_ENABLED = True
_WORKING = True

# Login, description and URL matching configuration
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None
_VALID_URL = None
_EMBED_REGEX = []

def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a hint telling the user how to provide credentials.

    method -- None (empty hint), 'any', 'password' or 'cookies'. When left
              at NO_DEFAULT, 'any' is used if supports_login() is true
              and 'cookies' otherwise.
    netrc  -- netrc machine name shown in the hint; falls back to
              _NETRC_MACHINE when not given.

    Any other value of method raises KeyError.
    """
    password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]

561

562

def __init__(self, downloader=None):

563

"""Constructor. Receives an optional downloader (a YoutubeDL instance).

564

If a downloader is not passed during initialization,

565

it must be set using "set_downloader()" before "extract()" is called"""

566

self._ready = False

567

self._x_forwarded_for_ip = None

568

self._printed_messages = set()

569

self.set_downloader(downloader)

570

571

@classmethod

572

def _match_valid_url(cls, url):

573

if cls._VALID_URL is False:

574

return None

575

# This does not use has/getattr intentionally - we want to know whether

576

# we have cached the regexp for *this* class, whereas getattr would also

577

# match the superclass

578

if '_VALID_URL_RE' not in cls.__dict__:

579

cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))

580

return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)

581

582

@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle url (True) or not (False)."""
    # Everything used here (other extractors excepted) has to be imported
    # by this function itself, otherwise lazy_extractors stops working
    matched = cls._match_valid_url(url)
    return matched is not None

588

589

@classmethod

590

def _match_id(cls, url):

591

return cls._match_valid_url(url).group('id')

592

593

@classmethod
def get_temp_id(cls, url):
    """Return _match_id(url), or None if that raises IndexError/AttributeError."""
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        temp_id = None
    return temp_id

@classmethod
def working(cls):
    """Return the value of the class's _WORKING attribute."""
    is_working = cls._WORKING
    return is_working

@classmethod
def supports_login(cls):
    """Return True when the class defines a truthy _NETRC_MACHINE."""
    return True if cls._NETRC_MACHINE else False

608

609

def initialize(self):
    """Prepare the instance for extraction (geo bypass, login, etc).

    The printed-message cache is reset and geo bypass is initialized on
    every call. The pre-login hook, the login and _real_initialize() only
    run until the instance has been marked ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)
    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # A username was given, but this extractor has no password login
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True

626

627

def _initialize_geo_bypass(self, geo_bypass_context):

628

"""

629

Initialize geo restriction bypass mechanism.

630

631

This method is used to initialize geo bypass mechanism based on faking

632

X-Forwarded-For HTTP header. A random country from provided country list

633

is selected and a random IP belonging to this country is generated. This

634

IP will be passed as X-Forwarded-For HTTP header in all subsequent

635

HTTP requests.

636

637

This method will be used for initial geo bypass mechanism initialization

638

during the instance initialization with _GEO_COUNTRIES and

639

_GEO_IP_BLOCKS.

640

641

You may also manually call it from extractor's code if geo bypass

642

information is not available beforehand (e.g. obtained during

643

extraction) or due to some other reason. In this case you should pass

644

this information in geo bypass context passed as first argument. It may

645

contain following fields:

646

647

countries: List of geo unrestricted countries (similar

648

to _GEO_COUNTRIES)

649

ip_blocks: List of geo unrestricted IP blocks in CIDR notation

650

(similar to _GEO_IP_BLOCKS)

651

652

"""

653

if not self._x_forwarded_for_ip:

654

655

# Geo bypass mechanism is explicitly disabled by user

656

if not self.get_param('geo_bypass', True):

657

return

658

659

if not geo_bypass_context:

660

geo_bypass_context = {}

661

662

# Backward compatibility: previously _initialize_geo_bypass

663

# expected a list of countries, some 3rd party code may still use

664

# it this way

665

if isinstance(geo_bypass_context, (list, tuple)):

666

geo_bypass_context = {

667

'countries': geo_bypass_context,

668

}

669

670

# The whole point of geo bypass mechanism is to fake IP

671

# as X-Forwarded-For HTTP header based on some IP block or

672

# country code.

673

674

# Path 1: bypassing based on IP block in CIDR notation

675

676

# Explicit IP block specified by user, use it right away

677

# regardless of whether extractor is geo bypassable or not

678

ip_block = self.get_param('geo_bypass_ip_block', None)

679

680

# Otherwise use random IP block from geo bypass context but only

681

# if extractor is known as geo bypassable

682

if not ip_block:

683

ip_blocks = geo_bypass_context.get('ip_blocks')

684

if self._GEO_BYPASS and ip_blocks:

685

ip_block = random.choice(ip_blocks)

686

687

if ip_block:

688

self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)

689

self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')

690

return

691

692

# Path 2: bypassing based on country code

693

694

# Explicit country code specified by user, use it right away

695

# regardless of whether extractor is geo bypassable or not

696

country = self.get_param('geo_bypass_country', None)

697

698

# Otherwise use random country code from geo bypass context but

699

# only if extractor is known as geo bypassable

700

if not country:

701

countries = geo_bypass_context.get('countries')

702

if self._GEO_BYPASS and countries:

703

country = random.choice(countries)

704

705

if country:

706

self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)

707

self._downloader.write_debug(

708

f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

709

710

def extract(self, url):
    """Extract the information for url and return the resulting info dict.

    Calls initialize() and then _real_extract(url). If extraction fails
    with a GeoRestrictedError, it is retried once with a fake
    X-Forwarded-For IP, provided __maybe_fake_ip_and_retry() agrees.

    The return value is whatever _real_extract() returned (None is passed
    through unchanged). When a fake IP is in use it is stored in the
    result under '__x_forwarded_for_ip'. With the 'no-live-chat' compat
    option, the 'live_chat', 'comments' and 'danmaku' entries are removed
    from the result's subtitles.

    An ExtractorError raised during extraction is re-raised after its
    missing video_id, ie and traceback have been filled in.
    UnsupportedError is re-raised untouched. IncompleteRead, KeyError and
    StopIteration are wrapped in a new ExtractorError.
    """
    try:
        # At most two attempts: the regular one and one geo bypass retry
        for _ in range(2):
            try:
                self.initialize()
                self.to_screen('Extracting URL: %s' % (
                    url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for lang in ('live_chat', 'comments', 'danmaku'):
                        subtitles.pop(lang, None)
                return ie_result
            except GeoRestrictedError as e:
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        e.video_id = e.video_id or self.get_temp_id(url)
        # No trailing comma here: e.ie must stay a string, not a 1-tuple
        e.ie = e.ie or self.IE_NAME
        e.traceback = e.traceback or sys.exc_info()[2]
        raise
    except IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

743

744

def __maybe_fake_ip_and_retry(self, countries):

745

if (not self.get_param('geo_bypass_country', None)

746

and self._GEO_BYPASS

747

and self.get_param('geo_bypass', True)

748

and not self._x_forwarded_for_ip

749

and countries):

750

country_code = random.choice(countries)

751

self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)

752

if self._x_forwarded_for_ip:

753

self.report_warning(

754

'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'

755

% (self._x_forwarded_for_ip, country_code.upper()))

return True

return False

def set_downloader(self, downloader):
    """Store downloader (a YoutubeDL instance) as this IE's downloader."""
    self._downloader = downloader

@property
def cache(self):
    """The cache object of the downloader."""
    downloader = self._downloader
    return downloader.cache

@property
def cookiejar(self):
    """The cookiejar object of the downloader."""
    downloader = self._downloader
    return downloader.cookiejar

770

771

def _initialize_pre_login(self):

772

""" Initialization before login. Redefine in subclasses."""

773

pass

774

775

def _perform_login(self, username, password):

776

""" Login with username and password. Redefine in subclasses."""

777

pass

778

779

def _real_initialize(self):

780

"""Real initialization process. Redefine in subclasses."""

781

pass

782

783

def _real_extract(self, url):

784

"""Real extraction process. Redefine in subclasses."""

785

raise NotImplementedError('This method must be implemented by subclasses')

@classmethod
def ie_key(cls):
    """Return the key used to look this InfoExtractor up with
    get_info_extractor: the class name without its last two characters."""
    class_name = cls.__name__
    return class_name[:-2]

@classproperty
def IE_NAME(cls):
    """Name of the extractor: the class name without its last two characters."""
    class_name = cls.__name__
    return class_name[:-2]

795

796

@staticmethod
def __can_accept_status_code(err, expected_status):
    """Tell whether the HTTPError err is accepted by expected_status.

    expected_status may be None (nothing is accepted), a callable that is
    given err.status and must return exactly True to accept, or any value
    that variadic() turns into a collection of accepted status codes.
    """
    assert isinstance(err, HTTPError)
    if expected_status is None:
        return False
    if callable(expected_status):
        return expected_status(err.status) is True
    return err.status in variadic(expected_status)

805

806

def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Return a Request for url_or_request, updated with data, headers and query.

    A Request instance is updated and returned as-is. A
    urllib.request.Request is converted with urllib_req_to_req() after a
    deprecation warning. Anything else is passed to the Request constructor.
    """
    request = url_or_request
    if isinstance(request, urllib.request.Request):
        self._downloader.deprecation_warning(
            'Passing a urllib.request.Request to _create_request() is deprecated. '
            'Use yt_dlp.networking.common.Request instead.')
        request = urllib_req_to_req(request)
    elif not isinstance(request, Request):
        request = Request(request)

    request.update(data=data, headers=headers, query=query)
    return request

817

818

def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):

819

"""

820

Return the response handle.

821

822

See _download_webpage docstring for arguments specification.

823

"""

824

if not self._downloader._first_webpage_request:

825

sleep_interval = self.get_param('sleep_interval_requests') or 0

826

if sleep_interval > 0:

827

self.to_screen('Sleeping %s seconds ...' % sleep_interval)

828

time.sleep(sleep_interval)

829

else:

830

self._downloader._first_webpage_request = False

831

832

if note is None:

833

self.report_download_webpage(video_id)

834

elif note is not False:

835

if video_id is None:

836

self.to_screen(str(note))

837

else:

838

self.to_screen(f'{video_id}: {note}')

839

840

# Some sites check X-Forwarded-For HTTP header in order to figure out

841

# the origin of the client behind proxy. This allows bypassing geo

842

# restriction by faking this header's value to IP that belongs to some

843

# geo unrestricted country. We will do so once we encounter any

844

# geo restriction error.

845

if self._x_forwarded_for_ip:

846

headers = (headers or {}).copy()

847

headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

848

849

try:

850

return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))

851

except network_exceptions as err:

852

if isinstance(err, HTTPError):

853

if self.__can_accept_status_code(err, expected_status):

return err.response

if errnote is False:

return False

if errnote is None:

errnote = 'Unable to download webpage'

860

861

errmsg = f'{errnote}: {error_to_compat_str(err)}'

862

if fatal:

863

raise ExtractorError(errmsg, cause=err)

864

else:

865

self.report_warning(errmsg)

866

return False

867

868

def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,

869

encoding=None, data=None, headers={}, query={}, expected_status=None):

870

"""

871

Return a tuple (page content as string, URL handle).

872

873

Arguments:

874

url_or_request -- plain text URL as a string or

875

a urllib.request.Request object

876

video_id -- Video/playlist/item identifier (string)

877

878

Keyword arguments:

879

note -- note printed before downloading (string)

880

errnote -- note printed in case of an error (string)

881

fatal -- flag denoting whether error should be considered fatal,

882

i.e. whether it should cause ExtractionError to be raised,

883

otherwise a warning will be reported and extraction continued

884

encoding -- encoding for a page content decoding, guessed automatically

885

when not explicitly specified

886

data -- POST data (bytes)

887

headers -- HTTP headers (dict)

888

query -- URL query (dict)

889

expected_status -- allows to accept failed HTTP requests (non 2xx

890

status code) by explicitly specifying a set of accepted status

891

codes. Can be any of the following entities:

892

- an integer type specifying an exact failed status code to

893

accept

894

- a list or a tuple of integer types specifying a list of

895

failed status codes to accept

896

- a callable accepting an actual failed status code and

897

returning True if it should be accepted

898

Note that this argument does not affect success status codes (2xx)

899

which are always accepted.

900

"""

901

902

# Strip hashes from the URL (#1038)

903

if isinstance(url_or_request, str):

904

url_or_request = url_or_request.partition('#')[0]

905

906

urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)

if urlh is False:

assert not fatal

return False

content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)

911

return (content, urlh)

912

913

@staticmethod

914

def _guess_encoding_from_content(content_type, webpage_bytes):

915

m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)

916

if m:

917

encoding = m.group(1)

918

else:

919

m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',

920

webpage_bytes[:1024])

921

if m:

922

encoding = m.group(1).decode('ascii')

923

elif webpage_bytes.startswith(b'\xff\xfe'):

encoding = 'utf-16'

else:

encoding = 'utf-8'

return encoding

def __check_blocked(self, content):

931

first_block = content[:512]

932

if ('<title>Access to this site is blocked</title>' in content

933

and 'Websense' in first_block):

934

msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'

935

blocked_iframe = self._html_search_regex(

936

r'<iframe src="([^"]+)"', content,

937

'Websense information URL', default=None)

938

if blocked_iframe:

939

msg += ' Visit %s for more details' % blocked_iframe

940

raise ExtractorError(msg, expected=True)

941

if '<title>The URL you requested has been blocked</title>' in first_block:

942

msg = (

943

'Access to this webpage has been blocked by Indian censorship. '

944

'Use a VPN or proxy server (with --proxy) to route around it.')

945

block_msg = self._html_search_regex(

946

r'</h1><p>(.*?)</p>',

947

content, 'block message', default=None)

948

if block_msg:

949

msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')

950

raise ExtractorError(msg, expected=True)

951

if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content

952

and 'blocklist.rkn.gov.ru' in content):

953

raise ExtractorError(

954

'Access to this webpage has been blocked by decision of the Russian government. '

955

'Visit http://blocklist.rkn.gov.ru/ for a block reason.',

956

expected=True)

957

958

def _request_dump_filename(self, url, video_id):
    """
    Build the file name used to dump (or reload) the response for a URL.

    The name is '<video_id>_<url>.dump' passed through sanitize_filename.
    A stem longer than the 'trim_file_name' param (240 when unset) is cut
    and suffixed with '___' plus the MD5 hex digest of the full stem.
    """
    stem = f'{video_id}_{url}'
    limit = self.get_param('trim_file_name') or 240
    if len(stem) > limit:
        digest_suffix = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:limit - len(digest_suffix)] + digest_suffix
    filename = sanitize_filename(f'{stem}.dump', restricted=True)

    if compat_os_name != 'nt':
        return filename

    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absolute = os.path.abspath(filename)
    if len(absolute) > 259:
        return fR'\\?\{absolute}'
    return filename

972

973

def __decode_webpage(self, webpage_bytes, encoding, headers):

974

if not encoding:

975

encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)

976

try:

977

return webpage_bytes.decode(encoding, 'replace')

978

except LookupError:

979

return webpage_bytes.decode('utf-8', 'replace')

980

981

def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the body from a URL handle and return it as a decoded string.

    prefix, when given, is prepended to the bytes read. Depending on the
    'dump_intermediate_pages' and 'write_pages' params the raw bytes are
    also printed base64-encoded and/or saved to a dump file. The decoded
    page is checked for known block notices before being returned.
    """
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.url)
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))

    if self.get_param('write_pages'):
        dump_path = self._request_dump_filename(urlh.url, video_id)
        self.to_screen(f'Saving request to {dump_path}')
        with open(dump_path, 'wb') as dump_file:
            dump_file.write(raw)

    page = self.__decode_webpage(raw, encoding, urlh.headers)
    self.__check_blocked(page)
    return page

def __print_error(self, errnote, fatal, video_id, err):

1001

if fatal:

1002

raise ExtractorError(f'{video_id}: {errnote}', cause=err)

1003

elif errnote:

1004

self.report_warning(f'{video_id}: {errnote}: {err}')

1005

1006

def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """
    Parse an XML string into an element tree.

    transform_source, when given, is applied to the string first. A parse
    error is reported using errnote (default 'Failed to parse XML'): raised
    when fatal, otherwise warned about and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as parse_err:
        message = errnote if errnote is not None else 'Failed to parse XML'
        self.__print_error(message, fatal, video_id, parse_err)

1013

1014

def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """
    Parse a JSON string using LenientJSONDecoder (non-strict mode).

    transform_source and any extra keyword arguments are forwarded to the
    decoder. A ValueError is reported using errnote (default 'Failed to
    parse JSON'): raised when fatal, otherwise warned about and None is
    returned.
    """
    decoder_options = {
        'cls': LenientJSONDecoder,
        'strict': False,
        'transform_source': transform_source,
    }
    try:
        return json.loads(json_string, **decoder_options, **parser_kwargs)
    except ValueError as parse_err:
        message = errnote if errnote is not None else 'Failed to parse JSON'
        self.__print_error(message, fatal, video_id, parse_err)

1020

1021

def _parse_socket_response_as_json(self, data, *args, **kwargs):

1022

return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

1023

1024

def __create_download_methods(name, parser, note, errnote, return_value):
    # Factory used in the class body to build a pair of download methods:
    # `_download_<name>_handle` (returns (parsed content, URL handle)) and
    # `_download_<name>` (returns only the parsed content).
    #   name         -- suffix used in the generated method names
    #   parser       -- name of the parsing method to look up on the
    #                   extractor, or None to return the raw page content
    #   note/errnote -- default progress and error notes for the methods
    #   return_value -- description interpolated into the generated docstrings

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        # Without a parser the downloaded content is returned untouched
        if parser is None:
            return content
        # errnote is only forwarded to the parser when it is exactly False
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        # False signals a failed, non-fatal download; pass it through as is
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With the 'load_pages' param set, first try to read a previously
        # dumped response from disk instead of hitting the network
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                # Reading the dump failed: warn and fall through to a real download
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        # Dropped when there is no parser -- presumably because the plain
        # handle method being called does not accept it; verify against callee
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    def impersonate(func, name, return_value):
        # Give the generated function its public name and a docstring
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
            @param transform_source Apply this transformation before parsing
            @returns {return_value}

            See _download_webpage_handle docstring for other arguments specification
        '''

    # Renaming must happen before the methods are used, since
    # download_content looks the handle method up via download_handle.__name__
    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content

1087

1088

# Generated download helpers: each call yields a (handle method, content method)
# pair whose parser is looked up by name on the extractor at call time.
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# No parser here: only the content method ([1]) is kept, as a private helper
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

1095

1096

def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    The download is retried when it fails with IncompleteRead, sleeping
    between attempts; the error is re-raised after the last attempt.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not given)

    See _download_webpage_handle docstring for other arguments specification.
    """
    # NB: tries and timeout are unused; should they be deprecated?
    if timeout is NO_DEFAULT:
        # Without this the sentinel itself would be passed to _sleep on a retry
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)

1127

1128

def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """
    Emit a warning prefixed with '[ie_name]' and, via format_field, the video id.

    With only_once set, a message already recorded in _printed_messages
    is suppressed; otherwise it is recorded before being reported.
    """
    id_prefix = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {id_prefix}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)

1136

1137

def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    downloader = self._downloader
    downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

1140

1141

def write_debug(self, msg, *args, **kwargs):
    """Write a debug message, prefixing it with '[ie_name]'"""
    downloader = self._downloader
    downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

1143

1144

def get_param(self, name, default=None, *args, **kwargs):
    """Look up a downloader param; return default if there is no downloader"""
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)

1148

1149

def report_drm(self, video_id, partial=NO_DEFAULT):
    """
    Report that the video is DRM protected via raise_no_formats.

    Passing partial triggers a deprecation warning; its value is not used.
    """
    partial_was_passed = partial is not NO_DEFAULT
    if partial_was_passed:
        self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

1153

1154

def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)

1157

1158

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)

1161

1162

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)

1165

1166

def report_login(self):
    """Report attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)

1169

1170

def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """
    Signal that the content needs a logged-in user.

    Only warns when metadata_available is set and either the
    'ignore_no_formats_error' or the 'wait_for_video' param is enabled.
    Otherwise raises an expected ExtractorError with the login hint for
    method appended to msg.
    """
    can_continue = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if can_continue:
        self.report_warning(msg)
        return
    hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + hint, expected=True)

1179

1180

def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """
    Signal that the content is geo restricted.

    Only warns when metadata_available is set and either the
    'ignore_no_formats_error' or the 'wait_for_video' param is enabled;
    otherwise raises GeoRestrictedError carrying countries.
    """
    can_continue = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if not can_continue:
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)

1188

1189

def raise_no_formats(self, msg, expected=False, video_id=None):
    """
    Signal that no formats could be extracted.

    Only warns when expected is set and either the
    'ignore_no_formats_error' or the 'wait_for_video' param is enabled.
    Otherwise msg is raised: as is when it already is an ExtractorError,
    else wrapped in a new ExtractorError.
    """
    can_continue = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if can_continue:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)

1197

1198

# Methods for following #608

1199

@staticmethod

1200

def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):

1201

"""Returns a URL that points to a page that should be processed"""

1202

if ie is not None:

1203

kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()

1204

if video_id is not None:

1205

kwargs['id'] = video_id

1206

if video_title is not None:

1207

kwargs['title'] = video_title

1208

return {

1209

**kwargs,

1210

'_type': 'url_transparent' if url_transparent else 'url',

'url': url,

}

@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """
    Build a playlist result from an iterable of matches.

    getter is mapped over matches, the results are passed through a lazy
    orderedSet, and each one becomes a url_result entry (with ie and
    video_kwargs). Remaining keyword arguments go to playlist_result.
    """
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(m, ie, **(video_kwargs or {})) for m in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)

1220

1221

@staticmethod

1222

def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):

1223

"""Returns a playlist"""

1224

if playlist_id:

1225

kwargs['id'] = playlist_id

1226

if playlist_title:

1227

kwargs['title'] = playlist_title

1228

if playlist_description is not None:

1229

kwargs['description'] = playlist_description

1230

return {

1231

**kwargs,

1232

'_type': 'multi_video' if multi_video else 'playlist',

'entries': entries,

}

def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    group selects what is returned on a match: None for the first group
    that is not None, a list/tuple for a tuple of those groups, anything
    else for that single group. A None string or an empty collection of
    patterns counts as "no match".
    """
    # Start from "no match" so an empty pattern collection cannot leave
    # the match object unbound
    mobj = None
    if string is not None:
        patterns = (pattern,) if isinstance(pattern, (str, re.Pattern)) else pattern
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)

    if default is not NO_DEFAULT:
        return default

    # The styled field name is only needed for the failure messages
    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None

1270

1271

def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """
    Searches string for the JSON object specified by start_pattern

    Giving a default makes the search non-fatal and silent; without one the
    fallback return value is an empty dict. Extra keyword arguments are
    forwarded to _parse_json.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    full_pattern = rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})'
    json_string = self._search_regex(
        full_pattern, string, name, group='json', fatal=fatal,
        default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default

1297

1298

def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.

    A tuple result (several groups requested) is cleaned element-wise.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not isinstance(found, tuple):
        return clean_html(found)
    return tuple(map(clean_html, found))

1306

1307

def _get_netrc_login_info(self, netrc_machine=None):

1308

netrc_machine = netrc_machine or self._NETRC_MACHINE

1309

1310

cmd = self.get_param('netrc_cmd')

1311

if cmd:

1312

cmd = cmd.replace('{}', netrc_machine)

1313

self.to_screen(f'Executing command: {cmd}')

1314

stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)

1315

if ret != 0:

1316

raise OSError(f'Command returned error code {ret}')

1317

info = netrc_from_content(stdout).authenticators(netrc_machine)

1318

1319

elif self.get_param('usenetrc', False):

1320

netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')

1321

if os.path.isdir(netrc_file):

1322

netrc_file = os.path.join(netrc_file, '.netrc')

1323

info = netrc.netrc(netrc_file).authenticators(netrc_machine)

else:

return None, None

if not info:

raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')

1329

return info[0], info[2]

1330

1331

def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):

1332

"""

1333

Get the login info as (username, password)

1334

First look for the manually specified credentials using username_option

1335

and password_option as keys in params dictionary. If no such credentials

1336

are available try the netrc_cmd if it is defined or look in the

1337

netrc file using the netrc_machine or _NETRC_MACHINE value.

1338

If there's no info available, return (None, None)

1339

"""

1340

1341

username = self.get_param(username_option)

1342

if username is not None:

1343

password = self.get_param(password_option)

1344

else:

1345

try:

1346

username, password = self._get_netrc_login_info(netrc_machine)

1347

except (OSError, netrc.NetrcParseError) as err:

1348

self.report_warning(f'Failed to parse .netrc: {err}')

1349

return None, None

1350

return username, password

1351

1352

def _get_tfa_info(self, note='two-factor verification code'):

1353

"""

1354

Get the two-factor authentication info

1355

TODO - asking the user will be required for sms/phone verify

1356

currently just uses the command line option

1357

If there's no info available, return None

1358

"""

1359

1360

tfa = self.get_param('twofactor')

if tfa is not None:

return tfa

return getpass.getpass('Type %s and press [Return]: ' % note)

1365

1366

# Helper functions for extracting OpenGraph info

1367

@staticmethod

1368

def _og_regexes(prop):

1369

content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'

1370

property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'

1371

% {'prop': re.escape(prop), 'sep': '(?::|[:-])'})

1372

template = r'<meta[^>]+?%s[^>]+?%s'

1373

return [

1374

template % (property_re, content_re),

1375

template % (content_re, property_re),

]

@staticmethod

def _meta_regex(prop):

1380

return r'''(?isx)<meta

1381

(?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)

1382

[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

1383

1384

def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for an OpenGraph property and return its unescaped value.

    prop may be a single property name or several; name defaults to
    'OpenGraph <first property>'. Extra keyword arguments are forwarded
    to _search_regex. Returns None when the search yields None.
    """
    props = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % props[0]
    patterns = list(itertools.chain.from_iterable(self._og_regexes(p) for p in props))
    escaped = self._search_regex(patterns, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)

1395

1396

def _og_search_thumbnail(self, html, **kargs):

1397

return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

1398

1399

def _og_search_description(self, html, **kargs):

1400

return self._og_search_property('description', html, fatal=False, **kargs)

1401

1402

def _og_search_title(self, html, *, fatal=False, **kargs):

1403

return self._og_search_property('title', html, fatal=fatal, **kargs)

1404

1405

def _og_search_video_url(self, html, name='video url', secure=True, **kargs):

1406

regexes = self._og_regexes('video') + self._og_regexes('video:url')

1407

if secure:

1408

regexes = self._og_regexes('video:secure_url') + regexes

1409

return self._html_search_regex(regexes, html, name, **kargs)

1410

1411

def _og_search_url(self, html, **kargs):

1412

return self._og_search_property('url', html, **kargs)

1413

1414

def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):

1415

return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

1416

1417

def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Search html for a <meta> tag and return its cleaned content value.

    name may be a single meta name or several; display_name defaults to
    the first of them. Extra keyword arguments are forwarded to
    _html_search_regex.
    """
    names = variadic(name)
    label = names[0] if display_name is None else display_name
    patterns = [self._meta_regex(n) for n in names]
    return self._html_search_regex(patterns, html, label, fatal=fatal, group='content', **kwargs)

1424

1425

def _dc_search_uploader(self, html):

1426

return self._html_search_meta('dc.creator', html, 'uploader')

1427

1428

@staticmethod

1429

def _rta_search(html):

1430

# See http://www.rtalabel.org/index.php?content=howtofaq#single

1431

if re.search(r'(?ix)<meta\s+name="rating"\s+'

1432

r' content="RTA-5042-1996-1400-1577-RTA"',

html):

return 18

# And then there are the jokers who advertise that they use RTA, but actually don't.

1437

AGE_LIMIT_MARKERS = [

1438

r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',

1439

r'>[^<]*you acknowledge you are at least (\d+) years old',

1440

r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',

]

age_limit = 0

for marker in AGE_LIMIT_MARKERS:

1445

mobj = re.search(marker, html)

1446

if mobj:

1447

age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))

1448

return age_limit

1449

1450

def _media_rating_search(self, html):

1451

# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/

1452

rating = self._html_search_meta('rating', html)

if not rating:

return None

RATING_TABLE = {

'safe for kids': 0,

'general': 8,

'14 years': 14,

'mature': 17,

'restricted': 19,

}

return RATING_TABLE.get(rating.lower())

1465

1466

def _family_friendly_search(self, html):

1467

# See http://schema.org/VideoObject

1468

family_friendly = self._html_search_meta(

1469

'isFamilyFriendly', html, default=None)

1470

1471

if not family_friendly:

return None

RATING_TABLE = {

'1': 0,

'true': 0,

'0': 18,

'false': 18,

}

return RATING_TABLE.get(family_friendly.lower())

1481

1482

def _twitter_search_player(self, html):

1483

return self._html_search_meta('twitter:player', html,

1484

'twitter card player')

1485

1486

def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """
    Yield all json ld objects in the html

    Every JSON_LD_RE match is parsed; only dict items are yielded.
    Giving a default makes parsing non-fatal.
    """
    if default is not NO_DEFAULT:
        fatal = False
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=fatal)
        yield from (item for item in variadic(parsed) if isinstance(item, dict))

1495

1496

def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """
    Search for a video in any json ld in the html

    Returns the extracted info dict when it is non-empty. Otherwise
    returns default if one was given, raises RegexNotFoundError if fatal,
    or warns and returns an empty dict.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False

    json_ld_objects = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(json_ld_objects, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}

1512

1513

def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Extract an info dict from JSON-LD data.

    json_ld may be a JSON string (parsed first) or already parsed data
    (a dict or a collection of dicts). When expected_type is given, only
    items of that '@type' are considered and traversal stops at the first
    one yielding a video/audio object. Returns the collected fields with
    empty values filtered out, or {} when there is no data.
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    # Shared result dict, filled in by the nested helpers below
    info = {}

    # schema.org action type name -> prefix of the '<kind>_count' info key
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # True if any of the expected types appears in the item's '@type'
        type = variadic(traverse_obj(e, '@type'))
        return any(x in type for x in expected_types)

    def extract_interaction_type(e):
        # 'interactionType' may be a plain value or a dict carrying '@type'
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Fill the '<kind>_count' fields from 'interactionStatistic' entries
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            # Only the last path segment of the type is looked up in the map
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # The first value found for a given counter wins
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # Build chapters from 'hasPart' entries whose '@type' is 'Clip'
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
        # Fill missing times from the neighbouring chapters; zip() stops
        # before the last chapter, which is handled separately below
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # The last chapter falls back to the already extracted duration
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        # Copy the fields of a VideoObject/AudioObject item into info
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            # Audio-only item: mark it as having no video codec
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        extract_interaction_statistic(e)
        # Must run after info['duration'] has been set above
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            # Top-level items are only considered if they carry '@context'
            if at_top_level and '@context' not in e:
                continue
            # A pure {'@context', '@graph'} wrapper: descend into the graph
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                # An article may embed its video under 'video' or 'subjectOf'
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                # Without an expected type keep scanning the remaining items
                if expected_type is None:
                    continue
                else:
                    break
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)

1682

1683

def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):

1684

return self._parse_json(

1685

self._search_regex(

1686

r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',

1687

webpage, 'next.js data', fatal=fatal, **kw),

1688

video_id, transform_source=transform_source, fatal=fatal)

1689

1690

def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    The page is searched for ``<context_name>=(function(<keys>){... return {...}}(<values>))``.
    The call arguments are parsed as a JSON list, substituted for the
    parameter names inside the returned object literal, and the result is
    narrowed with *traverse*.

    Returns an empty dict when nothing is found (non-fatal mode) or when the
    traversal yields a falsy value.
    """
    rectx = re.escape(context_name)
    # Matches `(function(<arg_keys>){ ... return <js>; }(<arg_vals>)`.
    # The literal parentheses must be escaped; an unescaped `$` anchor in
    # their place would make the pattern unmatchable.
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\)\{.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    js, arg_keys, arg_vals = self._search_regex(
        (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
        default=NO_DEFAULT if fatal else (None, None, None))
    if js is None:
        return {}

    # Map every parameter name to the JSON text of the value it was called with
    args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
        f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
    return traverse_obj(ret, traverse) or {}

1706

1707

@staticmethod
def _hidden_inputs(html):
    """Collect ``name -> value`` pairs of the hidden and submit ``<input>`` tags in *html*.

    An input is keyed by its ``name`` attribute, falling back to ``id``.
    Inputs without a key or without a ``value`` attribute are ignored.
    """
    # Drop HTML comments first so commented-out inputs are not picked up
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs

1722

1723

def _form_hidden_inputs(self, form_id, html):

1724

form = self._search_regex(

1725

r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,

1726

html, '%s form' % form_id, group='form')

1727

return self._hidden_inputs(form)

1728

1729

@classproperty(cache=True)
def FormatSort(cls):
    """Deprecated accessor returning a ``FormatSorter`` subclass; emits a deprecation warning."""
    deprecation_warning(
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')

    class FormatSort(FormatSorter):
        # The sorter is initialised with the `_downloader` attribute of its own instance
        def __init__(ie, *args, **kwargs):
            super().__init__(ie._downloader, *args, **kwargs)

    return FormatSort

1739

1740

def _sort_formats(self, formats, field_preference=[]):

1741

if not field_preference:

1742

self._downloader.deprecation_warning(

1743

'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')

1744

return

1745

self._downloader.deprecation_warning(

1746

'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '

1747

'Return _format_sort_fields in the info_dict instead')

1748

if formats:

1749

formats[0]['__sort_fields'] = field_preference

1750

1751

def _check_formats(self, formats, video_id):

1752

if formats:

1753

formats[:] = filter(

1754

lambda f: self._is_valid_url(

1755

f['url'], video_id,

1756

item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),

formats)

@staticmethod

def _remove_duplicate_formats(formats):

format_urls = set()

unique_formats = []

for f in formats:

if f['url'] not in format_urls:

1765

format_urls.add(f['url'])

1766

unique_formats.append(f)

1767

formats[:] = unique_formats

1768

1769

def _is_valid_url(self, url, video_id, item='video', headers={}):

1770

url = self._proto_relative_url(url, scheme='http:')

1771

# For now assume non HTTP(S) URLs always valid

1772

if not (url.startswith('http://') or url.startswith('https://')):

1773

return True

1774

try:

1775

self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)

1776

return True

1777

except ExtractorError as e:

1778

self.to_screen(

1779

'%s: %s URL is invalid, skipping: %s'

1780

% (video_id, item, error_to_compat_str(e.cause)))

1781

return False

1782

1783

def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'

1789

1790

def _proto_relative_url(self, url, scheme=None):
    """Sanitize *url* using *scheme* (e.g. ``'https:'``); the scheme defaults to ``http_scheme()``."""
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    # sanitize_url receives the scheme name without the trailing colon
    scheme_name = scheme[:-1]
    return sanitize_url(url, scheme=scheme_name)

1794

1795

def _sleep(self, timeout, video_id, msg_template=None):

1796

if msg_template is None:

1797

msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'

1798

msg = msg_template % {'video_id': video_id, 'timeout': timeout}

self.to_screen(msg)

time.sleep(timeout)

def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,

1803

transform_source=lambda s: fix_xml_ampersands(s).strip(),

1804

fatal=True, m3u8_id=None, data=None, headers={}, query={}):

1805

if self.get_param('ignore_no_formats_error'):

1806

fatal = False

1807

1808

res = self._download_xml_handle(

1809

manifest_url, video_id, 'Downloading f4m manifest',

1810

'Unable to download f4m manifest',

1811

# Some manifests may be malformed, e.g. prosiebensat1 generated manifests

1812

# (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)

1813

transform_source=transform_source,

1814

fatal=fatal, data=data, headers=headers, query=query)

if res is False:

return []

manifest, urlh = res

manifest_url = urlh.url

1820

1821

return self._parse_f4m_formats(

1822

manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,

1823

transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

1824

1825

def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Build a list of format dicts from a parsed f4m (HDS) manifest element.

    Media entries that point at further f4m or m3u8 manifests are extracted
    recursively; every other entry becomes one format with protocol 'f4m'.
    Returns an empty list when *manifest* is not an XML element (non-fatal
    mode only), when a non-empty player verification challenge is present,
    or when no usable media node remains.
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An empty <pv-2.0/> element has no text (None); treat that as "no challenge"
    pv_text = akamai_pv.text if akamai_pv is not None else None
    if pv_text and ';' in pv_text:
        playerVerificationChallenge = pv_text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL used for this entry. It is tracked separately from `manifest_url`
        # so that one entry's resolved URL never becomes the base against which
        # the following relative entries are resolved.
        format_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith(('http://', 'https://')):
                format_url = media_url
            else:
                format_url = (
                    (manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(format_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    format_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': format_url,
            'manifest_url': format_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats

def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """Return a format dict for the m3u8 URL itself, marked as a 'Quality selection URL'.

    Its preference is 100 below the given *preference* (or -100 when none
    is given).
    """
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100
    meta_format = {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    return meta_format

1938

1939

def _report_ignoring_subs(self, name):
    """Emit a once-only warning that subtitle tracks of the *name* manifest are being ignored."""
    message = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(message, only_once=True)

1944

1945

def _extract_m3u8_formats(self, *args, **kwargs):

1946

fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)

1947

if subs:

1948

self._report_ignoring_subs('HLS')

1949

return fmts

1950

1951

def _extract_m3u8_formats_and_subtitles(

1952

self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',

1953

preference=None, quality=None, m3u8_id=None, note=None,

1954

errnote=None, fatal=True, live=False, data=None, headers={},

1955

query={}):

1956

1957

if self.get_param('ignore_no_formats_error'):

fatal = False

if not m3u8_url:

if errnote is not False:

1962

errnote = errnote or 'Failed to obtain m3u8 URL'

1963

if fatal:

1964

raise ExtractorError(errnote, video_id=video_id)

1965

self.report_warning(f'{errnote}{bug_reports_message()}')

1966

return [], {}

1967

1968

res = self._download_webpage_handle(

1969

m3u8_url, video_id,

1970

note='Downloading m3u8 information' if note is None else note,

1971

errnote='Failed to download m3u8 information' if errnote is None else errnote,

1972

fatal=fatal, data=data, headers=headers, query=query)

if res is False:

return [], {}

m3u8_doc, urlh = res

m3u8_url = urlh.url

return self._parse_m3u8_formats_and_subtitles(

1981

m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,

1982

preference=preference, quality=quality, m3u8_id=m3u8_id,

1983

note=note, errnote=errnote, fatal=fatal, live=live, data=data,

1984

headers=headers, query=query, video_id=video_id)

1985

1986

def _parse_m3u8_formats_and_subtitles(
        self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, live=False, note=None,
        errnote=None, fatal=True, data=None, headers={}, query={},
        video_id=None):
    """Parse the text of an HLS playlist into ``(formats, subtitles)``.

    A media playlist (one containing #EXT-X-TARGETDURATION) is returned as
    a format pointing at the playlist itself. For a master playlist, one
    format is produced per VIDEO/AUDIO EXT-X-MEDIA rendition that has a URI
    and per variant stream URI; SUBTITLES renditions with a URI are collected
    into *subtitles*, keyed by language ('und' when unspecified).

    When the 'hls_split_discontinuity' param is set, every playlist yields
    one format per discontinuity-separated section, which requires
    downloading the referenced playlists.
    """
    formats, subtitles = [], {}
    has_drm = HlsFD._has_drm(m3u8_doc)

    def format_url(url):
        # Absolute http(s) URLs are kept; anything else is resolved against the playlist URL
        if re.match(r'^https?://', url):
            return url
        return urllib.parse.urljoin(m3u8_url, url)

    if self.get_param('hls_split_discontinuity', False):
        def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
            if not m3u8_doc:
                if not manifest_url:
                    return []
                m3u8_doc = self._download_webpage(
                    manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                    note=False, errnote='Failed to download m3u8 playlist information')
                if m3u8_doc is False:
                    return []
            discontinuities = sum(
                1 for line in m3u8_doc.splitlines() if line.startswith('#EXT-X-DISCONTINUITY'))
            return range(1 + discontinuities)

    else:
        def _extract_m3u8_playlist_indices(*args, **kwargs):
            return [None]

    # References:
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
    # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
    # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

    # We should try extracting formats only from master playlists [1, 4.3.4],
    # i.e. playlists that describe available qualities. On the other hand
    # media playlists [1, 4.3.3] should be returned as is since they contain
    # just the media without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
    # master playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
    # media playlist and MUST NOT appear in master playlist thus we can
    # clearly detect media playlist with this criterion.

    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        media_playlist_formats = [{
            'format_id': join_nonempty(m3u8_id, idx),
            'format_index': idx,
            'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
            'quality': quality,
            'has_drm': has_drm,
        } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

        return media_playlist_formats, subtitles

    groups = {}
    last_stream_inf = {}

    def extract_media(x_media_line):
        rendition = parse_m3u8_attributes(x_media_line)
        # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
        media_type = rendition.get('TYPE')
        group_id = rendition.get('GROUP-ID')
        name = rendition.get('NAME')
        if not (media_type and group_id and name):
            return
        groups.setdefault(group_id, []).append(rendition)
        # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
        if media_type == 'SUBTITLES':
            # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
            # EXT-X-MEDIA tag if the media type is SUBTITLES.
            # However, lack of URI has been spotted in the wild.
            # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
            if not rendition.get('URI'):
                return
            sub_url = format_url(rendition['URI'])
            sub_info = {
                'url': sub_url,
                'ext': determine_ext(sub_url),
            }
            if sub_info['ext'] == 'm3u8':
                # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                # files may contain is WebVTT:
                # <https://tools.ietf.org/html/rfc8216#section-3.1>
                sub_info['ext'] = 'vtt'
                sub_info['protocol'] = 'm3u8_native'
            lang = rendition.get('LANGUAGE') or 'und'
            subtitles.setdefault(lang, []).append(sub_info)
        if media_type not in ('VIDEO', 'AUDIO'):
            return
        media_url = rendition.get('URI')
        if not media_url:
            return
        manifest_url = format_url(media_url)
        formats.extend({
            'format_id': join_nonempty(m3u8_id, group_id, name, idx),
            'format_note': name,
            'format_index': idx,
            'url': manifest_url,
            'manifest_url': m3u8_url,
            'language': rendition.get('LANGUAGE'),
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
            'quality': quality,
            'has_drm': has_drm,
            'vcodec': 'none' if media_type == 'AUDIO' else None,
        } for idx in _extract_m3u8_playlist_indices(manifest_url))

    def build_stream_name():
        # Despite specification does not mention NAME attribute for
        # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
        # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
        # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
        stream_name = last_stream_inf.get('NAME')
        if stream_name:
            return stream_name
        # If there is no NAME in EXT-X-STREAM-INF it will be obtained
        # from corresponding rendition group
        stream_group_id = last_stream_inf.get('VIDEO')
        if not stream_group_id:
            return
        stream_group = groups.get(stream_group_id)
        if not stream_group:
            return stream_group_id
        return stream_group[0].get('NAME') or stream_group_id

    playlist_lines = m3u8_doc.splitlines()

    # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
    # chance to detect video only formats when EXT-X-STREAM-INF tags
    # precede EXT-X-MEDIA tags in HLS manifest such as [3].
    for line in playlist_lines:
        if line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)

    for line in playlist_lines:
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_stream_inf = parse_m3u8_attributes(line)
            continue
        if line.startswith('#') or not line.strip():
            continue

        # Any other non-empty line is the URI of the variant stream described
        # by the preceding EXT-X-STREAM-INF tag
        tbr = float_or_none(
            last_stream_inf.get('AVERAGE-BANDWIDTH')
            or last_stream_inf.get('BANDWIDTH'), scale=1000)
        manifest_url = format_url(line.strip())

        for idx in _extract_m3u8_playlist_indices(manifest_url):
            format_id = [m3u8_id, None, idx]
            # Bandwidth of live streams may differ over time thus making
            # format_id unpredictable. So it's better to keep provided
            # format_id intact.
            if not live:
                stream_name = build_stream_name()
                format_id[1] = stream_name or '%d' % (tbr or len(formats))
            f = {
                'format_id': join_nonempty(*format_id),
                'format_index': idx,
                'url': manifest_url,
                'manifest_url': m3u8_url,
                'tbr': tbr,
                'ext': ext,
                'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            }
            resolution = last_stream_inf.get('RESOLUTION')
            if resolution:
                mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                if mobj:
                    f['width'] = int(mobj.group('width'))
                    f['height'] = int(mobj.group('height'))
            # Unified Streaming Platform
            mobj = re.search(
                r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
            if mobj:
                abr, vbr = mobj.groups()
                f.update({
                    'vbr': float_or_none(vbr, 1000),
                    'abr': float_or_none(abr, 1000),
                })
            codecs = parse_codecs(last_stream_inf.get('CODECS'))
            f.update(codecs)
            audio_group_id = last_stream_inf.get('AUDIO')
            # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
            # references a rendition group MUST have a CODECS attribute.
            # However, this is not always respected. E.g. [2]
            # contains EXT-X-STREAM-INF tag which references AUDIO
            # rendition group but does not have CODECS and despite
            # referencing an audio group it represents a complete
            # (with audio and video) format. So, for such cases we will
            # ignore references to rendition groups and treat them
            # as complete formats.
            if audio_group_id and codecs and f.get('vcodec') != 'none':
                audio_group = groups.get(audio_group_id)
                if audio_group and audio_group[0].get('URI'):
                    # TODO: update acodec for audio only formats with
                    # the same GROUP-ID
                    f['acodec'] = 'none'
            if not f.get('ext'):
                f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
            formats.append(f)

            # for DailyMotion
            progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
            if progressive_uri:
                http_f = f.copy()
                del http_f['manifest_url']
                http_f.update({
                    'format_id': f['format_id'].replace('hls-', 'http-'),
                    'protocol': 'http',
                    'url': progressive_uri,
                })
                formats.append(http_f)

        # The stream info only applies to the URI line that directly follows it
        last_stream_inf = {}
    return formats, subtitles

2204

2205

def _extract_m3u8_vod_duration(

2206

self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):

2207

2208

m3u8_vod = self._download_webpage(

2209

m3u8_vod_url, video_id,

2210

note='Downloading m3u8 VOD manifest' if note is None else note,

2211

errnote='Failed to download VOD manifest' if errnote is None else errnote,

2212

fatal=False, data=data, headers=headers, query=query)

2213

2214

return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)

2215

2216

def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):

2217

if '#EXT-X-ENDLIST' not in m3u8_vod:

return None

return int(sum(

float(line[len('#EXTINF:'):].split(',')[0])

2222

for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None

2223

2224

def _extract_mpd_vod_duration(

2225

self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):

2226

2227

mpd_doc = self._download_xml(

2228

mpd_url, video_id,

2229

note='Downloading MPD VOD manifest' if note is None else note,

2230

errnote='Failed to download VOD manifest' if errnote is None else errnote,

2231

fatal=False, data=data, headers=headers, query=query)

2232

if not isinstance(mpd_doc, xml.etree.ElementTree.Element):

2233

return None

2234

return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))

2235

2236

@staticmethod

2237

def _xpath_ns(path, namespace=None):

if not namespace:

return path

out = []

for c in path.split('/'):

2242

if not c or c == '.':

2243

out.append(c)

2244

else:

2245

out.append('{%s}%s' % (namespace, c))

2246

return '/'.join(out)

2247

2248

def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):

2249

if self.get_param('ignore_no_formats_error'):

2250

fatal = False

2251

2252

res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

if res is False:

assert not fatal

return [], {}

smil, urlh = res

return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,

2259

namespace=self._parse_smil_namespace(smil))

2260

2261

def _extract_smil_formats(self, *args, **kwargs):

2262

fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)

2263

if subs:

2264

self._report_ignoring_subs('SMIL')

2265

return fmts

2266

2267

def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):

2268

res = self._download_smil(smil_url, video_id, fatal=fatal)

if res is False:

return {}

smil, urlh = res

smil_url = urlh.url

return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

2276

2277

def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):

2278

return self._download_xml_handle(

2279

smil_url, video_id, 'Downloading SMIL file',

2280

'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

2281

2282

def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict (id, title, description, upload date, thumbnails, formats, subtitles) from a SMIL element.

    The returned id is the basename of *smil_url* without its extension;
    the *video_id* argument is only used while extracting formats.
    """
    namespace = self._parse_smil_namespace(smil)

    formats, subtitles = self._parse_smil_formats_and_subtitles(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    video_id = os.path.splitext(url_basename(smil_url))[0]
    title = description = upload_date = None
    # The first non-empty value seen for each field wins
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        if not title and name == 'title':
            title = content
        elif not description and name in ('description', 'abstract'):
            description = content
        elif not upload_date and name == 'date':
            upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }

2320

2321

def _parse_smil_namespace(self, smil):

2322

return self._search_regex(

2323

r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

2324

2325

def _parse_smil_formats(self, *args, **kwargs):

2326

fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)

2327

if subs:

2328

self._report_ignoring_subs('SMIL')

2329

return fmts

2330

2331

def _parse_smil_formats_and_subtitles(
        self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Collect ``(formats, subtitles)`` from the media elements of a SMIL document.

    <video>, <audio> and <media> elements are turned into RTMP, HLS, HDS,
    DASH, MSS or plain HTTP formats depending on their protocol and
    extension; <imagestream> elements become storyboard formats. Each
    distinct ``src`` is handled once. Relative sources are resolved against
    the ``base``/``httpBase`` meta value when present, else against *smil_url*.

    *transform_rtmp_url*, when given, is called with ``(streamer, src)`` and
    must return the replacement pair for an RTMP format.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats, subtitles = [], {}
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    srcs = set()
    media = itertools.chain.from_iterable(
        smil.findall(self._xpath_ns(arg, namespace))
        for arg in ['.//video', './/audio', './/media'])
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        # The HEAD request is only issued when neither the URL nor the
        # element's 'ext' attribute provides an extension
        src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
            self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            self._merge_subtitles(m3u8_subs, target=subtitles)
            # A single resulting format inherits the metadata of the SMIL element
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                src_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(mpd_formats)
            self._merge_subtitles(mpd_subs, target=subtitles)
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
                src_url, video_id, ism_id='mss', fatal=False)
            formats.extend(ism_formats)
            self._merge_subtitles(ism_subs, target=subtitles)
        # Validate the resolved URL that is actually emitted below; checking the
        # raw `src` would let every relative source through unverified
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
    self._merge_subtitles(smil_subs, target=subtitles)

    return formats, subtitles

2456

2457

def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):

2458

urls = []

2459

subtitles = {}

2460

for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):

2461

src = textstream.get('src')

2462

if not src or src in urls:

2463

continue

2464

urls.append(src)

2465

ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)

2466

lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang

2467

subtitles.setdefault(lang, []).append({

'url': src,

'ext': ext,

})

return subtitles

def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):

2474

res = self._download_xml_handle(

2475

xspf_url, playlist_id, 'Downloading xpsf playlist',

2476

'Unable to download xspf manifest', fatal=fatal)

if res is False:

return []

xspf, urlh = res

xspf_url = urlh.url

return self._parse_xspf(

2484

xspf, playlist_id, xspf_url=xspf_url,

2485

xspf_base_url=base_url(xspf_url))

2486

2487

def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn the tracks of an XSPF document into a list of entry dicts.

    Every entry uses *playlist_id* as its id. Each <location> of a track
    becomes one format, with its URL joined onto *xspf_base_url*.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration = float_or_none(xpath_text(track, ns('./xspf:duration'), 'duration'), 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if not format_url:
                continue
            # Label and dimensions are read from namespaced attributes of <location>
            formats.append({
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            })

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries

def _extract_mpd_formats(self, *args, **kwargs):
    """Formats-only wrapper around _extract_mpd_formats_and_subtitles.

    All arguments are forwarded unchanged.  If the manifest also produced
    subtitles, they are dropped after calling _report_ignoring_subs('DASH').
    """
    formats, subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats

2532

2533

def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """Download an MPD manifest and parse it into (formats, subtitles).

    Returns ([], {}) when the download helper returns False or yields no
    document.  The 'ignore_no_formats_error' param forces the download to
    be non-fatal.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    response = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if response is False:
        return [], {}
    manifest, handle = response
    if manifest is None:
        return [], {}

    # We could have been redirected to a new url when we retrieved our mpd file.
    final_url = handle.url
    return self._parse_mpd_formats_and_subtitles(
        manifest, mpd_id, base_url(final_url), final_url)

2557

2558

def _parse_mpd_formats(self, *args, **kwargs):
    """Formats-only wrapper around _parse_mpd_formats_and_subtitles.

    All arguments are forwarded unchanged.  If parsing also produced
    subtitles, they are dropped after calling _report_ignoring_subs('DASH').
    """
    formats, subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats

2563

2564

def _parse_mpd_formats_and_subtitles(
        self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.

    Returns a (formats, subtitles) tuple.  formats is a list of format
    dicts for the video, audio and image/jpeg representations; subtitles
    maps a language ('und' when the representation has no @lang) to a list
    of dicts for the text representations.  Both are empty when the
    'dynamic_mpd' param is disabled and the manifest type is "dynamic".

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            return [], {}

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent's info and overlay what this element declares
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats, subtitles = [], {}
    # Counts how many formats share the same URL, to number streams within a manifest
    stream_numbers = collections.defaultdict(int)
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Concatenate BaseURL elements from the innermost level outwards,
                # stopping as soon as the result is an absolute http(s) URL
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First off, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    escaped = []
                    in_template = False
                    for c in tmpl:
                        escaped.append(c)
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            escaped.append(c)
                    t = ''.join(escaped)
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally unescape $$ to a literal $.  str.replace returns
                    # a new string, so the result has to be assigned
                    t = t.replace('$$', '$')
                    return t

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            # Reads the current segment_time/segment_d/segment_number
                            # of the enclosing scope at call time
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for s in representation_ms_info['s']:
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for _ in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for _ in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    formats.append(f)
                elif content_type == 'text':
                    subtitles.setdefault(lang or 'und', []).append(f)

    return formats, subtitles

2901

2902

def _extract_ism_formats(self, *args, **kwargs):
    """Formats-only wrapper around _extract_ism_formats_and_subtitles.

    All arguments are forwarded unchanged.  If the manifest also produced
    subtitles, they are dropped after calling _report_ignoring_subs('ISM').
    """
    formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('ISM')
    return formats

2907

2908

def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and parse it into (formats, subtitles).

    Returns ([], {}) when the download helper returns False or yields no
    document.  The 'ignore_no_formats_error' param forces the download to
    be non-fatal.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    response = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if response is False:
        return [], {}
    manifest, handle = response
    if manifest is None:
        return [], {}

    # The manifest URL handed to the parser is the one reported by the response handle
    return self._parse_ism_formats_and_subtitles(manifest, handle.url, ism_id)

2924

2925

def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.

    Returns a (formats, subtitles) tuple.  formats holds one dict per
    supported QualityLevel of every video/audio StreamIndex; subtitles maps
    the stream language ('und' by default) to dicts for text streams.
    Both are empty for manifests whose IsLive attribute is 'TRUE'.

    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        for track in stream.findall('QualityLevel'):
            KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # Without an explicit duration, derive it from the start
                    # time of the next fragment in the list.  Indexing the
                    # current element instead would address its children.
                    # Fall back to the total duration for the last fragment
                    # or when the next one carries no usable start time
                    next_fragment_time = None
                    if stream_fragment_index + 1 < len(stream_fragments):
                        next_fragment_time = int_or_none(
                            stream_fragments[stream_fragment_index + 1].get('t'))
                    if next_fragment_time is None:
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    'language': stream_language,
                    'audio_channels': int_or_none(track.get('Channels')),
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles

3037

3038

def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from the <video>/<audio> style tags of a webpage.

    Each matched tag yields a dict with 'formats', 'subtitles' and
    'thumbnail'; tags that produce neither formats nor subtitles are left
    out.  Every format gets base_url as its Referer header.
    """
    SRC_ATTRIBUTES = ('src', 'data-video-src', 'data-src', 'data-source')

    def resolve(item_url):
        return urljoin(base_url, item_url)

    def format_from_type(content_type):
        # Derive codec fields and extension from a MIME "type" attribute
        if not content_type:
            return {}
        match = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not match:
            return {}
        mimetype, codecs = match.groups()
        fmt = parse_codecs(codecs)
        fmt['ext'] = mimetype2ext(mimetype)
        return fmt

    def formats_for(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats): manifests are expanded, anything
        # else is treated as a direct media URL
        type_info = type_info or {}
        full_url = resolve(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            return False, self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        if ext == 'mpd':
            return False, self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        return True, [{
            'url': full_url,
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            'ext': ext,
        }]

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags have no inner content, hence the empty fourth field
    media_tags = [
        (*self_closing, '')
        for self_closing in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags += re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)

    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        tag_attrs = extract_attributes(media_tag)
        tag_src = strip_or_none(dict_get(tag_attrs, SRC_ATTRIBUTES))
        if tag_src:
            _, tag_formats = formats_for(
                tag_src, media_type, format_from_type(tag_attrs.get('type')))
            media_info['formats'].extend(tag_formats)
        media_info['thumbnail'] = resolve(tag_attrs.get('poster'))

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                source_src = strip_or_none(dict_get(s_attr, SRC_ATTRIBUTES))
                if not source_src:
                    continue
                fmt = format_from_type(s_attr.get('type'))
                is_plain_url, source_formats = formats_for(source_src, media_type, fmt)
                if not is_plain_url:
                    media_info['formats'].extend(source_formats)
                    continue

                # width, height, res, label and title attributes are
                # all not standard but seen several times in the wild
                labels = []
                for attr_name in ('label', 'title'):
                    attr_value = s_attr.get(attr_name)
                    if str_or_none(attr_value):
                        labels.append(attr_value)
                width = int_or_none(s_attr.get('width'))
                height = (int_or_none(s_attr.get('height'))
                          or int_or_none(s_attr.get('res')))
                if not (width and height):
                    for label in labels:
                        resolution = parse_resolution(label)
                        if resolution:
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                # First label that parses to a bitrate wins; None if none does
                tbr = next(
                    (bitrate for bitrate in map(parse_bitrate, labels) if bitrate),
                    None)
                fmt.update({
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'format_id': s_attr.get('label') or s_attr.get('title'),
                })
                fmt.update(source_formats[0])
                media_info['formats'].append(fmt)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are accepted along with subtitles/captions
                if kind and kind not in ('subtitles', 'captions'):
                    continue
                track_src = strip_or_none(track_attributes.get('src'))
                if not track_src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                media_info['subtitles'].setdefault(lang, []).append({
                    'url': resolve(track_src),
                })

        for media_format in media_info['formats']:
            media_format.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries

3163

3164

def _extract_akamai_formats(self, *args, **kwargs):
    """Formats-only wrapper around _extract_akamai_formats_and_subtitles.

    All arguments are forwarded unchanged.  If the extraction also produced
    subtitles, they are dropped after calling
    _report_ignoring_subs('akamai').
    """
    formats, subtitles = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('akamai')
    return formats

3169

3170

def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
    """Extract HDS, HLS and derived HTTP formats from an Akamai manifest URL.

    The f4m and m3u8 variants of manifest_url are derived by swapping the
    /i/ and /z/ path prefixes and the manifest file name.  hosts may map
    'hds', 'hls' and 'http' to replacement host names.  Direct HTTP(S)
    formats are only derived when an 'http' host is given, HLS formats
    were found, the URL is not signed (no 'hdnea=') and the HLS URL has
    the expected .csmil form.

    Returns a (formats, subtitles) tuple.
    """
    signed = 'hdnea=' in manifest_url
    if not signed:
        # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
        manifest_url = re.sub(
            r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
            '', manifest_url).strip('?')

    formats = []
    subtitles = {}

    hdcore_sign = 'hdcore=3.7.0'
    f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
    hds_host = hosts.get('hds')
    if hds_host:
        f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
    if 'hdcore=' not in f4m_url:
        f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
    f4m_formats = self._extract_f4m_formats(
        f4m_url, video_id, f4m_id='hds', fatal=False)
    for entry in f4m_formats:
        entry.update({'extra_param_to_segment_url': hdcore_sign})
    formats.extend(f4m_formats)

    m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
    hls_host = hosts.get('hls')
    if hls_host:
        m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
    m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
        m3u8_url, video_id, 'mp4', 'm3u8_native',
        m3u8_id='hls', fatal=False)
    formats.extend(m3u8_formats)
    subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

    http_host = hosts.get('http')
    if http_host and m3u8_formats and not signed:
        REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
        # re.match returns None for URLs that are not of the .csmil form;
        # in that case no HTTP formats can be derived, so skip this step
        # instead of failing on the missing match
        qualities_mobj = re.match(REPL_REGEX, m3u8_url)
        qualities = qualities_mobj.group(2).split(',') if qualities_mobj else None
        if qualities and len(m3u8_formats) in (len(qualities), len(qualities) + 1):
            # i indexes the quality matching the current non-audio-only format
            i = 0
            for f in m3u8_formats:
                if f['vcodec'] != 'none':
                    for protocol in ('http', 'https'):
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_url = re.sub(
                            REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
                        http_f.update({
                            'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                            'url': http_url,
                            'protocol': protocol,
                        })
                        formats.append(http_f)
                    i += 1

    return formats, subtitles

3227

3228

def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
    """Build formats for the manifests and protocols derived from a Wowza URL.

    The HLS, HDS and DASH manifests are requested non-fatally under the
    stream's HTTP(S) base URL, carrying over the query string of url.  For
    SMIL-style URLs the RTMP formats come from the SMIL manifest and an
    RTSP twin is added for each; otherwise plain rtmp/rtsp URLs are used.
    Names listed in skip_protocols ('m3u8', 'f4m', 'dash', 'smil', 'rtmp',
    'rtsp') are left out.
    """
    query = urllib.parse.urlparse(url).query

    mobj = re.search(
        r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
    url_base = mobj.group('url')
    # Keep the "s" of a secure scheme so that the manifests are requested over https
    http_base_url = 'http' + (mobj.group('s') or '') + ':' + url_base

    def wanted(protocol):
        return protocol not in skip_protocols

    def manifest_url(manifest):
        query_suffix = '?%s' % query if query else ''
        return f'{http_base_url}/{manifest}{query_suffix}'

    formats = []
    if wanted('m3u8'):
        formats.extend(self._extract_m3u8_formats(
            manifest_url('playlist.m3u8'), video_id, 'mp4',
            m3u8_entry_protocol, m3u8_id='hls', fatal=False))
    if wanted('f4m'):
        formats.extend(self._extract_f4m_formats(
            manifest_url('manifest.f4m'),
            video_id, f4m_id='hds', fatal=False))
    if wanted('dash'):
        formats.extend(self._extract_mpd_formats(
            manifest_url('manifest.mpd'),
            video_id, mpd_id='dash', fatal=False))

    if not re.search(r'(?:/smil:|\.smil)', url_base):
        for protocol in ('rtmp', 'rtsp'):
            if wanted(protocol):
                formats.append({
                    'url': f'{protocol}:{url_base}',
                    'format_id': protocol,
                    'protocol': protocol,
                })
    elif wanted('smil'):
        rtmp_formats = self._extract_smil_formats(
            manifest_url('jwplayer.smil'),
            video_id, fatal=False)
        for rtmp_format in rtmp_formats:
            # The RTSP twin folds the play path into the URL and has no 'ext'
            rtsp_format = rtmp_format.copy()
            rtsp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
            rtsp_format.pop('play_path')
            rtsp_format.pop('ext')
            rtsp_format.update({
                'url': rtsp_url.replace('rtmp://', 'rtsp://'),
                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                'protocol': 'rtsp',
            })
            formats += [rtmp_format, rtsp_format]
    return formats

def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Find and parse the options passed to a jwplayer("...").setup(...) call.

    Returns the parsed options when they decode to a dict.  Returns None
    when no setup call is found in webpage, when parsing the options raises
    ExtractorError (deliberately suppressed), or when the parsed value is
    not a dict.
    """
    # The parentheses of the jwplayer(...) and setup(...) calls are literal
    # characters and therefore have to be escaped in the pattern
    mobj = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if not mobj:
        return None
    try:
        jwplayer_data = self._parse_json(
            mobj.group('options'), video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        # Best effort: unparsable options are treated as "not found"
        return None
    return jwplayer_data if isinstance(jwplayer_data, dict) else None

3295

3296

def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JW Player setup data in webpage and parse it into entries.

    The data is located with _find_jwplayer_data (using js_to_json as the
    source transform) and handed, together with the remaining arguments,
    to _parse_jwplayer_data, whose result is returned.
    """
    found_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(found_data, video_id, *args, **kwargs)

3301

3302

def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn decoded JWPlayer setup data into info dict(s).

    @returns  An empty list if jwplayer_data is not a dict; the single
              entry if exactly one was built; otherwise a playlist result
    """
    results = []
    if not isinstance(jwplayer_data, dict):
        return results

    items = jwplayer_data.get('playlist')
    # JWPlayer backward compatibility: single playlist item/flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if not isinstance(items, list):
        items = (items or jwplayer_data, )

    for item in items:
        if not isinstance(item, dict):
            continue
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in item:
            item['sources'] = [item]

        item_id = video_id or item['mediaid']

        formats = self._parse_jwplayer_formats(
            item['sources'], video_id=item_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = item.get('tracks')
        for track in (tracks if isinstance(tracks, list) else ()):
            if not isinstance(track, dict):
                continue
            kind = track.get('kind')
            # Only caption/subtitle tracks are of interest
            if not isinstance(kind, str) or kind.lower() not in ('captions', 'subtitles'):
                continue
            track_url = urljoin(base_url, track.get('file'))
            if not track_url:
                continue
            lang = track.get('label') or 'en'
            subtitles.setdefault(lang, []).append({
                'url': self._proto_relative_url(track_url)
            })

        if require_title:
            raw_title = item['title']
        else:
            raw_title = item.get('title')

        entry = {
            'id': item_id,
            'title': unescapeHTML(raw_title),
            'description': clean_html(item.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(item.get('image'))),
            'timestamp': int_or_none(item.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or item.get('duration')),
            'subtitles': subtitles,
            'alt_title': clean_html(item.get('subtitle')),  # attributes used e.g. by Tele5 ...
            'genre': clean_html(item.get('genre')),
            'channel': clean_html(dict_get(item, ('category', 'channel'))),
            'season_number': int_or_none(item.get('season')),
            'episode_number': int_or_none(item.get('episode')),
            'release_year': int_or_none(item.get('releasedate')),
            'age_limit': int_or_none(item.get('age_restriction')),
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube:
            # Delegate to whichever extractor handles the YouTube URL
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            entry['formats'] = formats
        results.append(entry)

    if len(results) == 1:
        return results[0]
    return self.playlist_result(results)

3376

3377

def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Build a list of format dicts from a JWPlayer "sources" list.

    @param jwplayer_sources_data  Iterable of source entries; non-dict
                                  entries and repeated URLs are skipped
    @param rtmp_params            Optional dict merged into every RTMP format
    @param base_url               Base against which source URLs are resolved
    @returns  List of format dicts
    """
    urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in urls:
            continue
        urls.add(source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
        elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        elif source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
        else:
            format_id = str_or_none(source.get('label'))
            height = int_or_none(source.get('height'))
            if height is None and format_id:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = parse_resolution(format_id).get('height')
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate'), scale=1000),
                'filesize': int_or_none(source.get('filesize')),
                'ext': ext,
                'format_id': format_id
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                # maxsplit is passed by keyword: the positional form is
                # deprecated since Python 3.13
                rtmp_url_parts = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                if len(rtmp_url_parts) == 3:
                    rtmp_url, prefix, play_path = rtmp_url_parts
                    a_format.update({
                        'url': rtmp_url,
                        'play_path': prefix + play_path,
                    })
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
    return formats

3441

3442

def _live_title(self, name):
    """Deprecated: emit a deprecation warning and return name unchanged."""
    warn = self._downloader.deprecation_warning
    warn('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
    return name

3445

3446

def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none, reporting a failure under the label name.

    A failed conversion raises ExtractorError when fatal is true; otherwise
    a warning is reported and None is returned.
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return parsed

3455

3456

def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none, reporting a failure under the label name.

    A failed conversion raises ExtractorError when fatal is true; otherwise
    a warning is reported and None is returned.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return parsed

3465

3466

def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                path='/', secure=False, discard=False, rest=None, **kwargs):
    """Create a cookie and store it in self.cookiejar.

    @param domain       Cookie domain; a leading "." sets domain_initial_dot
    @param expire_time  Passed as the cookie's "expires" value
    @param port         Cookie port; port_specified is true iff this is not None
    @param rest         Optional dict of non-standard cookie attributes
    @param kwargs       Accepted but not used here
    """
    if rest is None:
        # A None sentinel instead of a shared mutable {} default
        rest = {}
    cookie = http.cookiejar.Cookie(
        version=0, name=name, value=value,
        port=port, port_specified=port is not None,
        domain=domain, domain_specified=True,
        domain_initial_dot=domain.startswith('.'),
        path=path, path_specified=True,
        secure=secure, expires=expire_time, discard=discard,
        comment=None, comment_url=None, rest=rest)
    self.cookiejar.set_cookie(cookie)

3473

3474

def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    cookie_header = self._downloader.cookiejar.get_cookie_header(url)
    return LenientSimpleCookie(cookie_header)

3477

3478

def _apply_first_set_cookie_header(self, url_handle, cookie):
    """
    Apply first Set-Cookie header instead of the last. Experimental.

    Some sites (e.g. [1-3]) may serve two cookies under the same name
    in Set-Cookie header and expect the first (old) one to be set rather
    than second (new). However, as of RFC6265 the newer one cookie
    should be set into cookie store what actually happens.
    We will workaround this issue by resetting the cookie to
    the first one manually.
    1. https://new.vk.com/
    2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
    3. https://learning.oreilly.com/

    @param url_handle  Response object exposing a "headers" mapping
    @param cookie      Name of the cookie to reset (matched literally)
    """
    # The name is escaped so that regex metacharacters in it (e.g. ".")
    # cannot match an unrelated cookie
    cookie_re = r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % re.escape(cookie)
    for header, cookies in url_handle.headers.items():
        if header.lower() != 'set-cookie':
            continue
        cookies = cookies.encode('iso-8859-1').decode('utf-8')
        cookie_value = re.search(cookie_re, cookies)
        if cookie_value:
            value, domain = cookie_value.groups()
            self._set_cookie(domain, cookie, value)
            # Only the first matching header is applied
            break

@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield this class's own test cases, then those of any wrapped class.

    Each yielded dict gets its 'name' key set to cls.ie_key(). Cases marked
    'only_matching' are skipped unless include_onlymatching is true.
    """
    # Do not look in super classes
    own_attrs = vars(cls)
    single = own_attrs.get('_TEST')
    if single:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        cases = [single]
    else:
        cases = own_attrs.get('_TESTS', [])
    for case in cases:
        if include_onlymatching or not case.get('only_matching', False):
            case['name'] = cls.ie_key()
            yield case
    if getattr(cls, '__wrapped__', None):
        yield from cls.__wrapped__.get_testcases(include_onlymatching)

3519

3520

@classmethod
def get_webpage_testcases(cls):
    """Yield this class's own _WEBPAGE_TESTS, then those of any wrapped class.

    Each yielded dict gets its 'name' key set to cls.ie_key().
    """
    for case in vars(cls).get('_WEBPAGE_TESTS', []):
        case['name'] = cls.ie_key()
        yield case
    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from cls.__wrapped__.get_webpage_testcases()

3528

3529

@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases"""
    all_tests = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # Look at each test's own info_dict and at that of its first playlist entry
    limits = traverse_obj(
        all_tests,
        (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
    return max(limits or [0])

3535

3536

@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None
    # One flag per test: does it have any key starting with "playlist"?
    has_playlist_key = [
        any(key.startswith('playlist') for key in test) for test in tests]
    if not any(has_playlist_key):
        return 'video'
    if all(has_playlist_key):
        return 'playlist'
    return 'any'

@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    if not cls.suitable(url):
        return None
    by_return_type = {'video': True, 'playlist': False}
    return by_return_type.get(cls._RETURN_TYPE)

3553

3554

@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted

3558

3559

@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    parts = []
    if cls._NETRC_MACHINE:
        parts.append(
            f' [*{cls._NETRC_MACHINE}*](## "netrc machine")' if markdown
            else f' [{cls._NETRC_MACHINE}]')
    if cls.IE_DESC is False:
        parts.append(' [HIDDEN]')
    elif cls.IE_DESC:
        parts.append(f' {cls.IE_DESC}')
    if cls.SEARCH_KEY:
        parts.append(f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            parts.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
    if not cls.working():
        parts.append(' (**Currently broken**)' if markdown else ' (Currently broken)')
    desc = ''.join(parts)

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME
    if not desc:
        return name
    return f'{name}:{desc}'

3583

3584

def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(...) if subtitles were requested, else {}.

    Subtitles count as requested when either the 'writesubtitles' or the
    'listsubtitles' param is truthy.
    """
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)

3589

3590

def _get_subtitles(self, *args, **kwargs):
    """Subclass hook called by extract_subtitles; not implemented here."""
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)

3592

3593

class CommentsDisabled(Exception):
    """Raised from _get_comments to signal that the video has comments disabled"""

3595

3596

def extract_comments(self, *args, **kwargs):
    """Return a callable that collects comments, or None if not requested.

    When the 'getcomments' param is falsy, None is returned. Otherwise
    self._get_comments(...) is called immediately and the returned callable
    drains it, producing a dict with 'comments' and 'comment_count'.
    """
    if not self.get_param('getcomments'):
        return None
    comment_iter = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        # Stays True unless the iterator is exhausted normally
        interrupted = True
        try:
            while True:
                collected.append(next(comment_iter))
        except StopIteration:
            interrupted = False
        except KeyboardInterrupt:
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        total = len(collected)
        self.to_screen(f'Extracted {total} comments')
        # The count is unknown if collection stopped early
        return {
            'comments': collected,
            'comment_count': None if interrupted else total
        }
    return extractor

def _get_comments(self, *args, **kwargs):
    """Subclass hook called by extract_comments; not implemented here."""
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)

3627

3628

@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
    """ Merge subtitle items for one language. Items of the second list whose
    (url, data) pair already occurs in the first list are dropped. """
    def identity(item):
        return item.get('url'), item.get('data')

    seen = set(map(identity, subtitle_list1))
    merged = list(subtitle_list1)
    for item in subtitle_list2:
        if identity(item) not in seen:
            merged.append(item)
    return merged

@classmethod
def _merge_subtitles(cls, *dicts, target=None):
    """ Merge subtitle dictionaries, language by language. """
    merged = {} if target is None else target
    for subtitle_dict in dicts:
        for lang, items in subtitle_dict.items():
            existing = merged.get(lang, [])
            merged[lang] = cls._merge_subtitle_items(existing, items)
    return merged

3646

3647

def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(...) if requested, else {}.

    Captions count as requested when either the 'writeautomaticsub' or the
    'listsubtitles' param is truthy.
    """
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)

3652

3653

def _get_automatic_captions(self, *args, **kwargs):
    """Subclass hook called by extract_automatic_captions; not implemented here."""
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)

3655

3656

@functools.cached_property
def _cookies_passed(self):
    """Whether cookies have been passed to YoutubeDL"""
    if self.get_param('cookiefile') is not None:
        return True
    return self.get_param('cookiesfrombrowser') is not None

3660

3661

def mark_watched(self, *args, **kwargs):
    """Call self._mark_watched(...) when enabled and credentials are available.

    Nothing happens unless the 'mark_watched' param is truthy and either
    login info is available or cookies were passed.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)

3666

3667

def _mark_watched(self, *args, **kwargs):
    """Subclass hook called by mark_watched; not implemented here."""
    message = 'This method must be implemented by subclasses'
    raise NotImplementedError(message)

3669

3670

def geo_verification_headers(self):
    """Return request headers carrying the 'geo_verification_proxy' param.

    The result is {'Ytdl-request-proxy': <proxy>} when the param is truthy,
    otherwise an empty dict.
    """
    proxy = self.get_param('geo_verification_proxy')
    if not proxy:
        return {}
    return {'Ytdl-request-proxy': proxy}

@staticmethod
def _generic_id(url):
    """Derive an ID from url: its last path segment, without extension, unquoted."""
    last_segment = url.rstrip('/').split('/')[-1]
    stem, _ext = os.path.splitext(last_segment)
    return urllib.parse.unquote(stem)

3680

3681

def _generic_title(self, url='', webpage='', *, default=None):
    """Return the first non-empty title candidate, or default.

    Candidates, in order: the og title, the HTML <title>, and the unquoted
    basename of url without its extension.
    """
    title = self._og_search_title(webpage, default=None)
    if title:
        return title
    title = self._html_extract_title(webpage, default=None)
    if title:
        return title
    title = urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
    if title:
        return title
    return default

3686

3687

def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
    """Build a chapters list from raw chapter items.

    @param start_function  Callable mapping an item to its start time
    @param title_function  Callable mapping an item to its title
    @param duration        Total duration; None is returned if it is falsy
    @param strict          If true, problems are reported as warnings.
                           Otherwise they go to the debug log and the items
                           are first sorted by start time
    @returns  List of {'start_time', 'title'} dicts in accepted order
    """
    if not duration:
        return
    candidates = [
        {'start_time': start_function(item), 'title': title_function(item)}
        for item in chapter_list or []]
    if strict:
        warn = self.report_warning
    else:
        warn = self.write_debug
        candidates.sort(key=lambda c: c['start_time'] or 0)

    # Sentinel at 0 so that the first real chapter has a predecessor
    chapters = [{'start_time': 0}]
    for idx, chapter in enumerate(candidates):
        start = chapter['start_time']
        if start is None:
            warn(f'Incomplete chapter {idx}')
            continue
        previous_start = chapters[-1]['start_time']
        if previous_start <= start <= duration:
            chapters.append(chapter)
        elif chapter not in chapters:
            if start > duration:
                issue = f'{chapter["start_time"]} > {duration}'
            else:
                issue = f'{chapter["start_time"]} < {chapters[-1]["start_time"]}'
            warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
    return chapters[1:]

3711

3712

def _extract_chapters_from_description(self, description, duration):
    """Extract chapters from "timestamp title" or "title timestamp" lines.

    Lines with the timestamp first are tried first; lines with the title
    first are only tried if that produced nothing.
    """
    text = description or ''
    duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
    sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'

    chapters = self._extract_chapters_helper(
        re.findall(sep_re % (duration_re, r'.+?'), text),
        start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
        duration=duration, strict=False)
    if chapters:
        return chapters
    return self._extract_chapters_helper(
        re.findall(sep_re % (r'.+?', duration_re), text),
        start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
        duration=duration, strict=False)

3722

3723

@staticmethod
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
    """Map availability flags to an availability string.

    The first truthy flag, in parameter order, decides the result. 'public'
    is returned only if every flag is known (not None) and none is truthy;
    otherwise None is returned.
    """
    if is_private:
        return 'private'
    if needs_premium:
        return 'premium_only'
    if needs_subscription:
        return 'subscriber_only'
    if needs_auth:
        return 'needs_auth'
    if is_unlisted:
        return 'unlisted'
    flags = (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)
    if all(flag is not None for flag in flags):
        return 'public'
    return None

3736

3737

def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       An extractor key string, or an object providing ie_key();
                        falls back to this extractor when not given
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        if default is NO_DEFAULT:
            return []
        return default
    if casesense:
        return list(values)
    return [value.lower() for value in values]

3749

3750

def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
    """Decide whether to download the playlist rather than the single video.

    If either ID is missing, the answer is "not video_id". Otherwise a
    'force_noplaylist' value in smuggled_data takes precedence, then the
    'noplaylist' param; the decision is announced with to_screen.
    """
    if not (playlist_id and video_id):
        return not video_id

    forced = (smuggled_data or {}).get('force_noplaylist')
    if forced is not None:
        return not forced

    # An ID of True means "present but unnamed": omit it from the message
    video_id = '' if video_id is True else f' {video_id}'
    playlist_id = '' if playlist_id is True else f' {playlist_id}'
    want_playlist = not self.get_param('noplaylist')
    if want_playlist:
        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
    else:
        self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
    return want_playlist

3765

3766

def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Report err through RetryManager.report_retry.

    No error callback is passed when fatal is true; otherwise
    self.report_warning is used as the error callback.
    """
    on_error = None if fatal else self.report_warning
    sleep_func = self.get_param('retry_sleep_functions', {}).get('extractor')
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen, warn=self.report_warning, error=on_error,
        sleep_func=sleep_func)

3771

3772

def RetryManager(self, **kwargs):
    """Create a RetryManager using the 'extractor_retries' param (default 3)
    and self._error_or_warning as the error callback."""
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)

3774

3775

def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Run the Generic extractor's embed extraction for url.

    The URL is smuggled with a 'block_ies' entry naming this extractor's
    ie_key. info_dict is only read, to prefix the progress note with its
    'display_id' (or 'id').
    """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    smuggled = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return generic_ie._extract_embeds(smuggled, *args, **kwargs)

3780

3781

@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of _extract_from_webpage after adding default extra info.

    The class itself is used when _extract_from_webpage is a bound method
    on it; otherwise an instance is obtained from ydl.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info

@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for each distinct embed URL found in webpage.

    The result is bound to this extractor unless _VALID_URL is False.
    """
    embed_urls = orderedSet(
        cls._extract_embed_urls(url, webpage) or [], lazy=True)
    for embed_url in embed_urls:
        yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)

3795

3796

@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile the patterns once per class; cls.__dict__ is checked so that
    # an inherited _EMBED_URL_RE is not reused
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))

    for pattern in cls._EMBED_URL_RE:
        for match in pattern.finditer(webpage):
            candidate = urllib.parse.urljoin(url, unescapeHTML(match.group('url')))
            if cls._VALID_URL is False or cls.suitable(candidate):
                yield candidate

3811

3812

class StopExtraction(Exception):
    """Exception type declared on the extractor class; it carries no extra state"""

@classmethod
def _extract_url(cls, webpage):  # TODO: Remove
    """Only for compatibility with some older extractors"""
    embed_urls = cls._extract_embed_urls(None, webpage) or []
    # First embed URL, or None if there is none
    return next(iter(embed_urls), None)

3819

3820

@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Register cls as a plugin override when plugin_name is given.

    @param plugin_name  If truthy, cls is treated as a plugin overriding the
                        next class in its MRO
    @param kwargs       Forwarded to super().__init_subclass__
    """
    if plugin_name:
        mro = inspect.getmro(cls)
        # The class directly after cls in the MRO is the one being overridden
        super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
        cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
        cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
        # Follow the __wrapped__ chain down to the innermost wrapped class
        while getattr(super_class, '__wrapped__', None):
            super_class = super_class.__wrapped__
        # Rebind that class's name in its defining module to cls
        setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
        _PLUGIN_OVERRIDES[super_class].append(cls)

    return super().__init_subclass__(**kwargs)

3833

3834

3835

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Dispatch on the prefix: empty -> 1 result, "all" -> _MAX_RESULTS,
        a number -> that many results (capped at _MAX_RESULTS)."""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY

3878

3879

3880

class UnsupportedURLIE(InfoExtractor):
    """Extractor that matches any URL and always raises UnsupportedError"""

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        raise UnsupportedError(url)

3887

3888

3889

# Maps an overridden extractor class to the list of plugin classes that
# override it; InfoExtractor.__init_subclass__ appends to it
_PLUGIN_OVERRIDES = collections.defaultdict(list)