[yt-dlp.git] / yt_dlp / extractor / common.py (blame at commit "Add new field `aspect_ratio`")
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
2314b4d8 8import inspect
cc16383f 9import itertools
3d3538e4 10import json
f8271158 11import math
4094b6e3 12import netrc
d6983cb4 13import os
773f291d 14import random
6929b41a 15import re
d6983cb4 16import sys
4094b6e3 17import time
8f97a15d 18import types
14f25df2 19import urllib.parse
ac668111 20import urllib.request
f8271158 21import xml.etree.ElementTree
d6983cb4 22
6929b41a 23from ..compat import functools # isort: split
14f25df2 24from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
8817a80d 25from ..cookies import LenientSimpleCookie
eb8a4433 26from ..downloader import FileDownloader
f8271158 27from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 28from ..utils import (
8f97a15d 29 IDENTITY,
f8271158 30 JSON_LD_RE,
31 NO_DEFAULT,
32 ExtractorError,
33 GeoRestrictedError,
34 GeoUtils,
b7c47b74 35 LenientJSONDecoder,
f8271158 36 RegexNotFoundError,
be5c1ae8 37 RetryManager,
f8271158 38 UnsupportedError,
05900629 39 age_restricted,
02dc0a36 40 base_url,
08f2a92c 41 bug_reports_message,
82d02080 42 classproperty,
d6983cb4 43 clean_html,
70f0f5a8 44 determine_ext,
46b18f23 45 determine_protocol,
d493f15c 46 dict_get,
42676437 47 encode_data_uri,
9b9c5355 48 error_to_compat_str,
46b18f23 49 extract_attributes,
90137ca4 50 filter_dict,
97f4aecf 51 fix_xml_ampersands,
b14f3a4c 52 float_or_none,
b868936c 53 format_field,
31bb8d3f 54 int_or_none,
34921b43 55 join_nonempty,
a4a554a7 56 js_to_json,
46b18f23 57 mimetype2ext,
3158150c 58 network_exceptions,
46b18f23 59 orderedSet,
d493f15c 60 parse_bitrate,
46b18f23 61 parse_codecs,
62 parse_duration,
4ca2a3cf 63 parse_iso8601,
46b18f23 64 parse_m3u8_attributes,
d493f15c 65 parse_resolution,
46b18f23 66 sanitize_filename,
8f97a15d 67 sanitize_url,
b868936c 68 sanitized_Request,
ade1fa70 69 smuggle_url,
d493f15c 70 str_or_none,
ce5b9040 71 str_to_int,
f856816b 72 strip_or_none,
5d3a0e79 73 traverse_obj,
47046464 74 try_call,
ffa89477 75 try_get,
f38de77f 76 unescapeHTML,
647eab45 77 unified_strdate,
6b3a3098 78 unified_timestamp,
46b18f23 79 update_Request,
09d02ea4 80 update_url_query,
a107193e 81 url_basename,
bebef109 82 url_or_none,
b868936c 83 urljoin,
6606817a 84 variadic,
a6571f10 85 xpath_element,
8d6765cf 86 xpath_text,
87 xpath_with_ns,
d6983cb4 88)
c342041f 89
d6983cb4 90
86e5f3ed 91class InfoExtractor:
d6983cb4 92 """Information Extractor class.
93
94 Information extractors are the classes that, given a URL, extract
95 information about the video (or videos) the URL refers to. This
96 information includes the real video URL, the video title, author and
97 others. The information is stored in a dictionary which is then
5d380852 98 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4 99 information possibly downloading the video to the file system, among
100 other possible outcomes.
101
cf0649f8 102 The type field determines the type of the result.
fed5d032 103 By far the most common value (and the default if _type is missing) is
104 "video", which indicates a single video.
105
106 For a video, the dictionaries must include the following fields:
d6983cb4 107
108 id: Video identifier.
d4736fdb 109 title: Video title, unescaped. Set to an empty string if video has
110 no title as opposed to "None" which signifies that the
111 extractor failed to obtain a title
d67b0b15 112
f49d89ee 113 Additionally, it must contain either a formats entry or a url one:
d67b0b15 114
f49d89ee 115 formats: A list of dictionaries for each format available, ordered
116 from worst to best quality.
117
118 Potential fields:
c790e93a 119 * url The mandatory URL representing the media:
120 for plain file media - HTTP URL of this file,
121 for RTMP - RTMP URL,
122 for HLS - URL of the M3U8 media playlist,
123 for HDS - URL of the F4M manifest,
79d2077e 124 for DASH
125 - HTTP URL to plain file media (in case of
126 unfragmented media)
127 - URL of the MPD manifest or base URL
128 representing the media if MPD manifest
8ed7a233 129 is parsed from a string (in case of
79d2077e 130 fragmented media)
c790e93a 131 for MSS - URL of the ISM manifest.
86f4d14f 132 * manifest_url
133 The URL of the manifest file in case of
c790e93a 134 fragmented media:
135 for HLS - URL of the M3U8 master playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH - URL of the MPD manifest,
138 for MSS - URL of the ISM manifest.
a44ca5a4 139 * manifest_stream_number (For internal use only)
140 The index of the stream in the manifest file
10952eb2 141 * ext Will be calculated from URL if missing
d67b0b15 142 * format A human-readable description of the format
143 ("mp4 container with h264/opus").
 144 Calculated from the format_id, width, height,
145 and format_note fields if missing.
146 * format_id A short description of the format
5d4f3985 147 ("mp4_h264_opus" or "19").
148 Technically optional, but strongly recommended.
d67b0b15 149 * format_note Additional info about the format
150 ("3D" or "DASH video")
151 * width Width of the video, if known
152 * height Height of the video, if known
105bfd90 153 * aspect_ratio Aspect ratio of the video, if known
154 Automatically calculated from width and height
f49d89ee 155 * resolution Textual description of width and height
105bfd90 156 Automatically calculated from width and height
176f1866 157 * dynamic_range The dynamic range of the video. One of:
158 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
7217e148 159 * tbr Average bitrate of audio and video in KBit/s
d67b0b15 160 * abr Average audio bitrate in KBit/s
161 * acodec Name of the audio codec in use
dd27fd17 162 * asr Audio sampling rate in Hertz
b8ed0f15 163 * audio_channels Number of audio channels
d67b0b15 164 * vbr Average video bitrate in KBit/s
fbb21cf5 165 * fps Frame rate
d67b0b15 166 * vcodec Name of the video codec in use
1394ce65 167 * container Name of the container format
d67b0b15 168 * filesize The number of bytes, if known in advance
9732d77e 169 * filesize_approx An estimate for the number of bytes
d67b0b15 170 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 171 * protocol The protocol that will be used for the actual
adbc4ec4 172 download, lower-case. One of "http", "https" or
173 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63 174 * fragment_base_url
175 Base URL for fragments. Each fragment's path
176 value (if present) will be relative to
177 this URL.
178 * fragments A list of fragments of a fragmented media.
 179 Each fragment entry must contain either a URL
 180 or a path. If a URL is present it should be
181 considered by a client. Otherwise both path and
182 fragment_base_url must be present. Here is
183 the list of all potential fields:
184 * "url" - fragment's URL
185 * "path" - fragment's path relative to
186 fragment_base_url
a0d5077c 187 * "duration" (optional, int or float)
188 * "filesize" (optional, int)
adbc4ec4 189 * is_from_start Is a live format that can be downloaded
190 from the start. Boolean
f49d89ee 191 * preference Order number of this format. If this field is
08d13955 192 present and not None, the formats get sorted
38d63d84 193 by this field, regardless of all other values.
f49d89ee 194 -1 for default (order by other properties),
195 -2 or smaller for less than default.
e65566a9 196 < -1000 to hide the format (if there is
197 another one which is strictly better)
32f90364 198 * language Language code, e.g. "de" or "en-US".
199 * language_preference Is this in the language mentioned in
200 the URL?
aff2f4f4 201 10 if it's what the URL is about,
202 -1 for default (don't know),
203 -10 otherwise, other values reserved for now.
5d73273f 204 * quality Order number of the video quality of this
205 format, irrespective of the file format.
206 -1 for default (order by other properties),
207 -2 or smaller for less than default.
c64ed2a3 208 * source_preference Order number for this video source
209 (quality takes higher priority)
210 -1 for default (order by other properties),
211 -2 or smaller for less than default.
d769be6c 212 * http_headers A dictionary of additional HTTP headers
213 to add to the request.
6271f1ca 214 * stretched_ratio If given and not 1, indicates that the
3dee7826 215 video's pixels are not square.
216 width : height ratio as float.
217 * no_resume The server does not support resuming the
218 (HTTP or RTMP) download. Boolean.
88acdbc2 219 * has_drm The format has DRM and cannot be downloaded. Boolean
0a5a191a 220 * downloader_options A dictionary of downloader options
221 (For internal use only)
222 * http_chunk_size Chunk size for HTTP downloads
223 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 224 RTMP formats can also have the additional fields: page_url,
225 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
226 rtmp_protocol, rtmp_real_time
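
 As a purely illustrative sketch (all values invented), a single entry of the
 formats list described above might look like:

     {
         'format_id': 'hls-1080p',
         'url': 'https://example.com/media/1080p/index.m3u8',
         'manifest_url': 'https://example.com/media/master.m3u8',
         'ext': 'mp4',
         'protocol': 'm3u8_native',
         'width': 1920,
         'height': 1080,
         'vcodec': 'avc1.640028',
         'acodec': 'mp4a.40.2',
         'tbr': 4500,
         'http_headers': {'Referer': 'https://example.com/'},
     }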
3dee7826 227
c0ba0f48 228 url: Final video URL.
d6983cb4 229 ext: Video filename extension.
d67b0b15 230 format: The video format, defaults to ext (used for --get-format)
231 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 232
d6983cb4 233 The following fields are optional:
234
08d30158 235 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 236 alt_title: A secondary title of the video.
0afef30b 237 display_id An alternative identifier for the video, not necessarily
238 unique, but available before title. Typically, id is
239 something like "4234987", title "Dancing naked mole rats",
240 and display_id "dancing-naked-mole-rats"
d5519808 241 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 242 * "id" (optional, string) - Thumbnail format ID
d5519808 243 * "url"
cfb56d1a 244 * "preference" (optional, int) - quality of the image
d5519808 245 * "width" (optional, int)
246 * "height" (optional, int)
5e1c39ac 247 * "resolution" (optional, string "{width}x{height}",
d5519808 248 deprecated)
2de624fd 249 * "filesize" (optional, int)
297e9952 250 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 251 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 252 description: Full video description.
d6983cb4 253 uploader: Full name of the video uploader.
2bc0c46f 254 license: License name the video is licensed under.
8a92e51c 255 creator: The creator of the video.
10db0d2f 256 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 257 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 258 If not explicitly set, calculated from timestamp
259 release_timestamp: UNIX timestamp of the moment the video was released.
260 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 261 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 262 If not explicitly set, calculated from release_timestamp
263 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 264 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 265 If not explicitly set, calculated from modified_timestamp
d6983cb4 266 uploader_id: Nickname or id of the video uploader.
7bcd2830 267 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 268 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 269 Note that channel fields may or may not repeat uploader
6f1f59f3 270 fields. This depends on a particular extractor.
271 channel_id: Id of the channel.
272 channel_url: Full URL to a channel webpage.
6c73052c 273 channel_follower_count: Number of followers of the channel.
da9ec3b9 274 location: Physical location where the video was filmed.
a504ced0 275 subtitles: The available subtitles as a dictionary in the format
4606c34e 276 {tag: subformats}. "tag" is usually a language code, and
277 "subformats" is a list sorted from lower to higher
278 preference, each element is a dictionary with the "ext"
279 entry and one of:
a504ced0 280 * "data": The subtitles file contents
10952eb2 281 * "url": A URL pointing to the subtitles file
2412044c 282 It can optionally also have:
283 * "name": Name or description of the subtitles
08d30158 284 * "http_headers": A dictionary of additional HTTP headers
297e9952 285 to add to the request.
4bba3716 286 "ext" will be calculated from URL if missing
e167860c 287 automatic_captions: Like 'subtitles'; contains automatically generated
288 captions instead of normal subtitles
62d231c0 289 duration: Length of the video in seconds, as an integer or float.
f3d29461 290 view_count: How many users have watched the video on the platform.
867c66ff 291 concurrent_view_count: How many users are currently watching the video on the platform.
19e3dfc9 292 like_count: Number of positive ratings of the video
293 dislike_count: Number of negative ratings of the video
02835c6b 294 repost_count: Number of reposts of the video
2d30521a 295 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 296 comment_count: Number of comments on the video
dd622d7c 297 comments: A list of comments, each with one or more of the following
298 properties (all but one of text or html optional):
299 * "author" - human-readable name of the comment author
300 * "author_id" - user ID of the comment author
a1c5d2ca 301 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c 302 * "id" - Comment ID
303 * "html" - Comment as HTML
304 * "text" - Plain text of the comment
305 * "timestamp" - UNIX timestamp of comment
306 * "parent" - ID of the comment this one is replying to.
307 Set to "root" to indicate that this is a
308 comment to the original video.
a1c5d2ca 309 * "like_count" - Number of positive ratings of the comment
310 * "dislike_count" - Number of negative ratings of the comment
311 * "is_favorited" - Whether the comment is marked as
312 favorite by the video uploader
313 * "author_is_uploader" - Whether the comment is made by
314 the video uploader
8dbe9899 315 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 316 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5 317 should allow getting the same result again. (It will be set
318 by YoutubeDL if it's missing)
ad3bc6ac 319 categories: A list of categories that the video falls in, for example
320 ["Sports", "Berlin"]
864f24bd 321 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 322 cast: A list of the video cast
7267bd53 323 is_live: True, False, or None (=unknown). Whether this video is a
324 live stream that goes on instead of a fixed-length video.
f76ede8e 325 was_live: True, False, or None (=unknown). Whether this video was
326 originally a live stream.
0647d925 327 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
e325a21a 328 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 329 If absent, automatically set from is_live, was_live
7c80519c 330 start_time: Time in seconds where the reproduction should start, as
10952eb2 331 specified in the URL.
297a564b 332 end_time: Time in seconds where the reproduction should end, as
10952eb2 333 specified in the URL.
55949fed 334 chapters: A list of dictionaries, with the following entries:
335 * "start_time" - The start time of the chapter in seconds
336 * "end_time" - The end time of the chapter in seconds
337 * "title" (optional, string)
6cfda058 338 playable_in_embed: Whether this video is allowed to play in embedded
339 players on other sites. Can be True (=always allowed),
340 False (=never allowed), None (=unknown), or a string
62b58c09 341 specifying the criteria for embedability; e.g. 'whitelist'
c224251a 342 availability: Under what condition the video is available. One of
343 'private', 'premium_only', 'subscriber_only', 'needs_auth',
344 'unlisted' or 'public'. Use 'InfoExtractor._availability'
345 to set it
1e8fe57e 346 _old_archive_ids: A list of old archive ids needed for backward compatibility
277d6ff5 347 __post_extractor: A function to be called just before the metadata is
348 written to either disk, logger or console. The function
349 must return a dict which will be added to the info_dict.
 350 This is useful for additional information that is
351 time-consuming to extract. Note that the fields thus
352 extracted will not be available to output template and
353 match_filter. So, only "comments" and "comment_count" are
354 currently allowed to be extracted via this method.
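
 As an illustration only (URLs and values invented), the thumbnails and
 subtitles fields described above could be filled in like this:

     'thumbnails': [
         {'id': 'small', 'url': 'https://example.com/thumb_120.jpg', 'width': 120, 'height': 90, 'preference': -1},
         {'id': 'large', 'url': 'https://example.com/thumb_1280.jpg', 'width': 1280, 'height': 720},
     ],
     'subtitles': {
         'en': [
             {'url': 'https://example.com/subs/en.vtt', 'ext': 'vtt', 'name': 'English'},
         ],
         'de': [
             {'data': 'WEBVTT\n\n...', 'ext': 'vtt'},
         ],
     },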
d6983cb4 355
7109903e 356 The following fields should only be used when the video belongs to some logical
357 chapter or section:
358
359 chapter: Name or title of the chapter the video belongs to.
27bfd4e5 360 chapter_number: Number of the chapter the video belongs to, as an integer.
361 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e 362
363 The following fields should only be used when the video is an episode of some
8d76bdf1 364 series, programme or podcast:
7109903e 365
366 series: Title of the series or programme the video episode belongs to.
9ac24e23 367 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 368 season: Title of the season the video episode belongs to.
27bfd4e5 369 season_number: Number of the season the video episode belongs to, as an integer.
370 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e 371 episode: Title of the video episode. Unlike mandatory video title field,
372 this field should denote the exact title of the video episode
373 without any kind of decoration.
27bfd4e5 374 episode_number: Number of the video episode within a season, as an integer.
375 episode_id: Id of the video episode, as a unicode string.
7109903e 376
7a93ab5f 377 The following fields should only be used when the media is a track or a part of
378 a music album:
379
380 track: Title of the track.
381 track_number: Number of the track within an album or a disc, as an integer.
382 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
383 as a unicode string.
384 artist: Artist(s) of the track.
385 genre: Genre(s) of the track.
386 album: Title of the album the track belongs to.
387 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 388 album_artist: List of all artists who appeared on the album (e.g.
389 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
390 and compilations).
391 disc_number: Number of the disc or other physical medium the track belongs to,
392 as an integer.
393 release_year: Year (YYYY) when the album was released.
8bcd4048 394 composer: Composer of the piece
7a93ab5f 395
3975b4d2 396 The following fields should only be set for clips that should be cut from the original video:
397
398 section_start: Start time of the section in seconds
399 section_end: End time of the section in seconds
400
45e8a04e 401 The following fields should only be set for storyboards:
402 rows: Number of rows in each storyboard fragment, as an integer
403 columns: Number of columns in each storyboard fragment, as an integer
404
deefc05b 405 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 406
d838b1bd 407 Unless mentioned otherwise, None is equivalent to absence of information.
408
fed5d032 409
410 _type "playlist" indicates multiple videos.
b82f815f 411 There must be a key "entries", which is a list, an iterable, or a PagedList
412 object, each element of which is a valid dictionary by this specification.
fed5d032 413
962ffcf8 414 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 415 attributes with the same semantics as videos (see above).
fed5d032 416
f0d785d3 417 It can also have the following optional fields:
418
419 playlist_count: The total number of videos in a playlist. If not given,
420 YoutubeDL tries to calculate it from "entries"
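
 For example (illustrative values, "Example" is a hypothetical extractor name),
 a playlist result could look like:

     {
         '_type': 'playlist',
         'id': 'PL0123456789',
         'title': 'Example playlist',
         'entries': [
             {'_type': 'url', 'url': 'https://example.com/watch?v=1', 'ie_key': 'Example'},
             {'_type': 'url', 'url': 'https://example.com/watch?v=2', 'ie_key': 'Example'},
         ],
     }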
421
fed5d032 422
423 _type "multi_video" indicates that there are multiple videos that
 424 form a single show, for example multiple acts of an opera or TV episode.
425 It must have an entries key like a playlist and contain all the keys
426 required for a video at the same time.
427
428
429 _type "url" indicates that the video must be extracted from another
430 location, possibly by a different extractor. Its only required key is:
431 "url" - the next URL to extract.
f58766ce 432 The key "ie_key" can be set to the class name (minus the trailing "IE",
433 e.g. "Youtube") if the extractor class is known in advance.
434 Additionally, the dictionary may have any properties of the resolved entity
435 known in advance, for example "title" if the title of the referred video is
fed5d032 436 known ahead of time.
437
438
439 _type "url_transparent" entities have the same specification as "url", but
440 indicate that the given additional information is more precise than the one
441 associated with the resolved URL.
442 This is useful when a site employs a video service that hosts the video and
443 its technical metadata, but that video service does not embed a useful
444 title, description etc.
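
 For instance (illustrative only), an extractor that already knows the title
 from the embedding page but delegates the media extraction to another
 extractor could return:

     {
         '_type': 'url_transparent',
         'url': 'https://videohost.example.com/embed/abc123',
         'ie_key': 'VideoHostExample',
         'title': 'Title taken from the embedding page',
     }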
445
446
8f97a15d 447 Subclasses of this should also be added to the list of extractors and
 448 should define a _VALID_URL regexp and re-define the _real_extract() and
449 (optionally) _real_initialize() methods.
d6983cb4 450
e6f21b3d 451 Subclasses may also override suitable() if necessary, but ensure the function
452 signature is preserved and that this function imports everything it needs
52efa4b3 453 (except other extractors), so that lazy_extractors works correctly.
454
8f97a15d 455 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
456 the HTML of Generic webpages. It may also override _extract_embed_urls
457 or _extract_from_webpage as necessary. While these are normally classmethods,
458 _extract_from_webpage is allowed to be an instance method.
459
460 _extract_from_webpage may raise self.StopExtraction() to stop further
461 processing of the webpage and obtain exclusive rights to it. This is useful
62b58c09 462 when the extractor cannot reliably be matched using just the URL,
463 e.g. invidious/peertube instances
8f97a15d 464
465 Embed-only extractors can be defined by setting _VALID_URL = False.
466
52efa4b3 467 To support username + password (or netrc) login, the extractor must define a
468 _NETRC_MACHINE and re-define _perform_login(username, password) and
469 (optionally) _initialize_pre_login() methods. The _perform_login method will
470 be called between _initialize_pre_login and _real_initialize if credentials
471 are passed by the user. In cases where it is necessary to have the login
472 process as part of the extraction rather than initialization, _perform_login
473 can be left undefined.
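
 To tie the above together, a minimal, schematic subclass might look like this
 (the site, URL pattern and regexes are invented and only for illustration):

     class ExampleIE(InfoExtractor):
         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'
         _NETRC_MACHINE = 'example'

         def _perform_login(self, username, password):
             # e.g. POST the credentials; the session cookie is kept in self.cookiejar
             pass

         def _real_extract(self, url):
             video_id = self._match_id(url)
             webpage = self._download_webpage(url, video_id)
             return {
                 'id': video_id,
                 'title': self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title'),
                 'url': self._search_regex(r'data-video-src="([^"]+)"', webpage, 'video URL'),
             }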
e6f21b3d 474
4248dad9 475 _GEO_BYPASS attribute may be set to False in order to disable
773f291d 476 geo restriction bypass mechanisms for a particular extractor.
477 Though it won't disable explicit geo restriction bypass based on
504f20dd 478 country code provided with geo_bypass_country.
4248dad9 479
480 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
481 countries for this extractor. One of these countries will be used by
482 geo restriction bypass mechanism right away in order to bypass
504f20dd 483 geo restriction, of course, if the mechanism is not disabled.
773f291d 484
5f95927a 485 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
486 IP blocks in CIDR notation for this extractor. One of these IP blocks
487 will be used by geo restriction bypass mechanism similarly
504f20dd 488 to _GEO_COUNTRIES.
3ccdde8c 489
fe7866d0 490 The _ENABLED attribute should be set to False for IEs that
491 are disabled by default and must be explicitly enabled.
492
e6f21b3d 493 The _WORKING attribute should be set to False for broken IEs
d6983cb4 494 in order to warn the users and skip the tests.
495 """
496
497 _ready = False
498 _downloader = None
773f291d 499 _x_forwarded_for_ip = None
4248dad9 500 _GEO_BYPASS = True
501 _GEO_COUNTRIES = None
5f95927a 502 _GEO_IP_BLOCKS = None
d6983cb4 503 _WORKING = True
fe7866d0 504 _ENABLED = True
52efa4b3 505 _NETRC_MACHINE = None
231025c4 506 IE_DESC = None
8dcce6a8 507 SEARCH_KEY = None
8f97a15d 508 _VALID_URL = None
509 _EMBED_REGEX = []
d6983cb4 510
8dcce6a8 511 def _login_hint(self, method=NO_DEFAULT, netrc=None):
512 password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
513 return {
514 None: '',
515 'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
516 'password': f'Use {password_hint}',
517 'cookies': (
518 'Use --cookies-from-browser or --cookies for the authentication. '
17ffed18 519 'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
8dcce6a8 520 }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
9d5d4d64 521
d6983cb4 522 def __init__(self, downloader=None):
49a57e70 523 """Constructor. Receives an optional downloader (a YoutubeDL instance).
524 If a downloader is not passed during initialization,
525 it must be set using "set_downloader()" before "extract()" is called"""
d6983cb4 526 self._ready = False
773f291d 527 self._x_forwarded_for_ip = None
28f436ba 528 self._printed_messages = set()
d6983cb4 529 self.set_downloader(downloader)
530
531 @classmethod
5ad28e7f 532 def _match_valid_url(cls, url):
8f97a15d 533 if cls._VALID_URL is False:
534 return None
79cb2577 535 # This does not use has/getattr intentionally - we want to know whether
536 # we have cached the regexp for *this* class, whereas getattr would also
537 # match the superclass
538 if '_VALID_URL_RE' not in cls.__dict__:
539 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 540 return cls._VALID_URL_RE.match(url)
541
542 @classmethod
543 def suitable(cls, url):
544 """Receives a URL and returns True if suitable for this IE."""
3fb4e21b 545 # This function must import everything it needs (except other extractors),
546 # so that lazy_extractors works correctly
5ad28e7f 547 return cls._match_valid_url(url) is not None
d6983cb4 548
ed9266db 549 @classmethod
550 def _match_id(cls, url):
5ad28e7f 551 return cls._match_valid_url(url).group('id')
ed9266db 552
1151c407 553 @classmethod
554 def get_temp_id(cls, url):
555 try:
556 return cls._match_id(url)
557 except (IndexError, AttributeError):
558 return None
559
d6983cb4 560 @classmethod
561 def working(cls):
562 """Getter method for _WORKING."""
563 return cls._WORKING
564
52efa4b3 565 @classmethod
566 def supports_login(cls):
567 return bool(cls._NETRC_MACHINE)
568
d6983cb4 569 def initialize(self):
570 """Initializes an instance (authentication, etc)."""
28f436ba 571 self._printed_messages = set()
5f95927a 572 self._initialize_geo_bypass({
573 'countries': self._GEO_COUNTRIES,
574 'ip_blocks': self._GEO_IP_BLOCKS,
575 })
4248dad9 576 if not self._ready:
52efa4b3 577 self._initialize_pre_login()
578 if self.supports_login():
579 username, password = self._get_login_info()
580 if username:
581 self._perform_login(username, password)
582 elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
8dcce6a8 583 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
4248dad9 584 self._real_initialize()
585 self._ready = True
586
5f95927a 587 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a 588 """
589 Initialize geo restriction bypass mechanism.
590
591 This method is used to initialize geo bypass mechanism based on faking
592 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 593 is selected and a random IP belonging to this country is generated. This
e39b5d4a 594 IP will be passed as X-Forwarded-For HTTP header in all subsequent
595 HTTP requests.
e39b5d4a 596
597 This method will be used for initial geo bypass mechanism initialization
5f95927a 598 during the instance initialization with _GEO_COUNTRIES and
599 _GEO_IP_BLOCKS.
e39b5d4a 600
5f95927a 601 You may also manually call it from extractor's code if geo bypass
e39b5d4a 602 information is not available beforehand (e.g. obtained during
5f95927a 603 extraction) or due to some other reason. In this case you should pass
604 this information in geo bypass context passed as first argument. It may
605 contain following fields:
606
607 countries: List of geo unrestricted countries (similar
608 to _GEO_COUNTRIES)
609 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
610 (similar to _GEO_IP_BLOCKS)
611
e39b5d4a 612 """
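        # Illustrative manual call from an extractor, as described above, when the
        # unrestricted countries only become known during extraction (hypothetical):
        #     self._initialize_geo_bypass({'countries': ['US', 'GB']})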
773f291d 613 if not self._x_forwarded_for_ip:
5f95927a 614
615 # Geo bypass mechanism is explicitly disabled by user
a06916d9 616 if not self.get_param('geo_bypass', True):
5f95927a 617 return
618
619 if not geo_bypass_context:
620 geo_bypass_context = {}
621
622 # Backward compatibility: previously _initialize_geo_bypass
623 # expected a list of countries, some 3rd party code may still use
624 # it this way
625 if isinstance(geo_bypass_context, (list, tuple)):
626 geo_bypass_context = {
627 'countries': geo_bypass_context,
628 }
629
630 # The whole point of geo bypass mechanism is to fake IP
631 # as X-Forwarded-For HTTP header based on some IP block or
632 # country code.
633
634 # Path 1: bypassing based on IP block in CIDR notation
635
636 # Explicit IP block specified by user, use it right away
637 # regardless of whether extractor is geo bypassable or not
a06916d9 638 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a 639
640 # Otherwise use random IP block from geo bypass context but only
641 # if extractor is known as geo bypassable
642 if not ip_block:
643 ip_blocks = geo_bypass_context.get('ip_blocks')
644 if self._GEO_BYPASS and ip_blocks:
645 ip_block = random.choice(ip_blocks)
646
647 if ip_block:
648 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 649 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
5f95927a 650 return
651
652 # Path 2: bypassing based on country code
653
654 # Explicit country code specified by user, use it right away
655 # regardless of whether extractor is geo bypassable or not
a06916d9 656 country = self.get_param('geo_bypass_country', None)
5f95927a 657
658 # Otherwise use random country code from geo bypass context but
659 # only if extractor is known as geo bypassable
660 if not country:
661 countries = geo_bypass_context.get('countries')
662 if self._GEO_BYPASS and countries:
663 country = random.choice(countries)
664
665 if country:
666 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 667 self._downloader.write_debug(
86e5f3ed 668 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4 669
670 def extract(self, url):
671 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 672 try:
773f291d 673 for _ in range(2):
674 try:
675 self.initialize()
a06916d9 676 self.write_debug('Extracting URL: %s' % url)
0016b84e 677 ie_result = self._real_extract(url)
07cce701 678 if ie_result is None:
679 return None
0016b84e 680 if self._x_forwarded_for_ip:
681 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
b79f9e30 682 subtitles = ie_result.get('subtitles') or {}
683 if 'no-live-chat' in self.get_param('compat_opts'):
684 for lang in ('live_chat', 'comments', 'danmaku'):
685 subtitles.pop(lang, None)
0016b84e 686 return ie_result
773f291d 687 except GeoRestrictedError as e:
4248dad9 688 if self.__maybe_fake_ip_and_retry(e.countries):
689 continue
773f291d 690 raise
0db3bae8 691 except UnsupportedError:
692 raise
1151c407 693 except ExtractorError as e:
0db3bae8 694 kwargs = {
695 'video_id': e.video_id or self.get_temp_id(url),
696 'ie': self.IE_NAME,
b69fd25c 697 'tb': e.traceback or sys.exc_info()[2],
0db3bae8 698 'expected': e.expected,
699 'cause': e.cause
700 }
701 if hasattr(e, 'countries'):
702 kwargs['countries'] = e.countries
7265a219 703 raise type(e)(e.orig_msg, **kwargs)
ac668111 704 except http.client.IncompleteRead as e:
1151c407 705 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
9650885b 706 except (KeyError, StopIteration) as e:
1151c407 707 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 708
4248dad9 709 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 710 if (not self.get_param('geo_bypass_country', None)
3089bc74 711 and self._GEO_BYPASS
a06916d9 712 and self.get_param('geo_bypass', True)
3089bc74 713 and not self._x_forwarded_for_ip
714 and countries):
eea0716c 715 country_code = random.choice(countries)
716 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9 717 if self._x_forwarded_for_ip:
718 self.report_warning(
eea0716c 719 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
720 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9 721 return True
722 return False
723
d6983cb4 724 def set_downloader(self, downloader):
08d30158 725 """Sets a YoutubeDL instance as the downloader for this IE."""
d6983cb4 726 self._downloader = downloader
727
9809740b 728 @property
729 def cache(self):
730 return self._downloader.cache
731
732 @property
733 def cookiejar(self):
734 return self._downloader.cookiejar
735
52efa4b3 736 def _initialize_pre_login(self):
962ffcf8 737 """ Initialization before login. Redefine in subclasses."""
52efa4b3 738 pass
739
740 def _perform_login(self, username, password):
741 """ Login with username and password. Redefine in subclasses."""
742 pass
743
d6983cb4 744 def _real_initialize(self):
745 """Real initialization process. Redefine in subclasses."""
746 pass
747
748 def _real_extract(self, url):
749 """Real extraction process. Redefine in subclasses."""
08d30158 750 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 751
56c73665 752 @classmethod
753 def ie_key(cls):
754 """A string for getting the InfoExtractor with get_info_extractor"""
3fb4e21b 755 return cls.__name__[:-2]
56c73665 756
82d02080 757 @classproperty
758 def IE_NAME(cls):
759 return cls.__name__[:-2]
d6983cb4 760
d391b7e2 761 @staticmethod
762 def __can_accept_status_code(err, expected_status):
ac668111 763 assert isinstance(err, urllib.error.HTTPError)
d391b7e2 764 if expected_status is None:
765 return False
d391b7e2 766 elif callable(expected_status):
767 return expected_status(err.code) is True
768 else:
6606817a 769 return err.code in variadic(expected_status)
d391b7e2 770
c043c246 771 def _create_request(self, url_or_request, data=None, headers=None, query=None):
ac668111 772 if isinstance(url_or_request, urllib.request.Request):
09d02ea4 773 return update_Request(url_or_request, data=data, headers=headers, query=query)
774 if query:
775 url_or_request = update_url_query(url_or_request, query)
c043c246 776 return sanitized_Request(url_or_request, data, headers or {})
f95b9dee 777
c043c246 778 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
d391b7e2 779 """
780 Return the response handle.
781
782 See _download_webpage docstring for arguments specification.
783 """
1cf376f5 784 if not self._downloader._first_webpage_request:
49a57e70 785 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 786 if sleep_interval > 0:
5ef7d9bd 787 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 788 time.sleep(sleep_interval)
789 else:
790 self._downloader._first_webpage_request = False
791
d6983cb4 792 if note is None:
793 self.report_download_webpage(video_id)
794 elif note is not False:
7cc3570e 795 if video_id is None:
86e5f3ed 796 self.to_screen(str(note))
7cc3570e 797 else:
86e5f3ed 798 self.to_screen(f'{video_id}: {note}')
2132edaa 799
800 # Some sites check X-Forwarded-For HTTP header in order to figure out
801 # the origin of the client behind proxy. This allows bypassing geo
802 # restriction by faking this header's value to IP that belongs to some
803 # geo unrestricted country. We will do so once we encounter any
804 # geo restriction error.
805 if self._x_forwarded_for_ip:
c043c246 806 headers = (headers or {}).copy()
807 headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
2132edaa 808
d6983cb4 809 try:
f95b9dee 810 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 811 except network_exceptions as err:
ac668111 812 if isinstance(err, urllib.error.HTTPError):
d391b7e2 813 if self.__can_accept_status_code(err, expected_status):
95e42d73 814 # Retain reference to error to prevent file object from
815 # being closed before it can be read. Works around the
816 # effects of <https://bugs.python.org/issue15002>
817 # introduced in Python 3.4.1.
818 err.fp._error = err
d391b7e2 819 return err.fp
820
aa94a6d3 821 if errnote is False:
822 return False
d6983cb4 823 if errnote is None:
f1a9d64e 824 errnote = 'Unable to download webpage'
7f8b2714 825
86e5f3ed 826 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 827 if fatal:
497d2fab 828 raise ExtractorError(errmsg, cause=err)
7cc3570e 829 else:
6a39ee13 830 self.report_warning(errmsg)
7cc3570e 831 return False
d6983cb4 832
1890fc63 833 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
834 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2 835 """
836 Return a tuple (page content as string, URL handle).
837
617f658b 838 Arguments:
839 url_or_request -- plain text URL as a string or
ac668111 840 a urllib.request.Request object
617f658b 841 video_id -- Video/playlist/item identifier (string)
842
843 Keyword arguments:
844 note -- note printed before downloading (string)
845 errnote -- note printed in case of an error (string)
846 fatal -- flag denoting whether error should be considered fatal,
 847 i.e. whether it should cause ExtractorError to be raised,
848 otherwise a warning will be reported and extraction continued
849 encoding -- encoding for a page content decoding, guessed automatically
850 when not explicitly specified
851 data -- POST data (bytes)
852 headers -- HTTP headers (dict)
853 query -- URL query (dict)
 854 expected_status -- allows accepting failed HTTP requests (non-2xx
855 status code) by explicitly specifying a set of accepted status
856 codes. Can be any of the following entities:
857 - an integer type specifying an exact failed status code to
858 accept
859 - a list or a tuple of integer types specifying a list of
860 failed status codes to accept
861 - a callable accepting an actual failed status code and
862 returning True if it should be accepted
863 Note that this argument does not affect success status codes (2xx)
864 which are always accepted.
d391b7e2 865 """
617f658b 866
b9d3e163 867 # Strip hashes from the URL (#1038)
14f25df2 868 if isinstance(url_or_request, str):
b9d3e163 869 url_or_request = url_or_request.partition('#')[0]
870
d391b7e2 871 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e 872 if urlh is False:
873 assert not fatal
874 return False
c9a77969 875 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8 876 return (content, urlh)
877
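    # Illustrative use of the expected_status argument documented above (URL and
    # surrounding extractor code are hypothetical): accept an HTTP 404 response
    # and still read its body for further inspection:
    #     webpage, urlh = self._download_webpage_handle(
    #         'https://example.com/video/123', video_id, expected_status=404)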
c9a77969 878 @staticmethod
879 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4 880 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
881 if m:
882 encoding = m.group(1)
883 else:
0d75ae2c 884 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a 885 webpage_bytes[:1024])
886 if m:
887 encoding = m.group(1).decode('ascii')
b60016e8 888 elif webpage_bytes.startswith(b'\xff\xfe'):
889 encoding = 'utf-16'
f143d86a 890 else:
891 encoding = 'utf-8'
c9a77969 892
893 return encoding
894
4457823d 895 def __check_blocked(self, content):
896 first_block = content[:512]
3089bc74 897 if ('<title>Access to this site is blocked</title>' in content
898 and 'Websense' in first_block):
4457823d 899 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
900 blocked_iframe = self._html_search_regex(
901 r'<iframe src="([^"]+)"', content,
902 'Websense information URL', default=None)
903 if blocked_iframe:
904 msg += ' Visit %s for more details' % blocked_iframe
905 raise ExtractorError(msg, expected=True)
906 if '<title>The URL you requested has been blocked</title>' in first_block:
907 msg = (
908 'Access to this webpage has been blocked by Indian censorship. '
909 'Use a VPN or proxy server (with --proxy) to route around it.')
910 block_msg = self._html_search_regex(
911 r'</h1><p>(.*?)</p>',
912 content, 'block message', default=None)
913 if block_msg:
914 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
915 raise ExtractorError(msg, expected=True)
3089bc74 916 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
917 and 'blocklist.rkn.gov.ru' in content):
4457823d 918 raise ExtractorError(
919 'Access to this webpage has been blocked by decision of the Russian government. '
920 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
921 expected=True)
922
f95b9dee 923 def _request_dump_filename(self, url, video_id):
924 basen = f'{video_id}_{url}'
925 trim_length = self.get_param('trim_file_name') or 240
926 if len(basen) > trim_length:
927 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
928 basen = basen[:trim_length - len(h)] + h
929 filename = sanitize_filename(f'{basen}.dump', restricted=True)
930 # Working around MAX_PATH limitation on Windows (see
931 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
932 if compat_os_name == 'nt':
933 absfilepath = os.path.abspath(filename)
934 if len(absfilepath) > 259:
935 filename = fR'\\?\{absfilepath}'
936 return filename
937
938 def __decode_webpage(self, webpage_bytes, encoding, headers):
939 if not encoding:
940 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
941 try:
942 return webpage_bytes.decode(encoding, 'replace')
943 except LookupError:
944 return webpage_bytes.decode('utf-8', 'replace')
945
c9a77969 946 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
c9a77969 947 webpage_bytes = urlh.read()
948 if prefix is not None:
949 webpage_bytes = prefix + webpage_bytes
a06916d9 950 if self.get_param('dump_intermediate_pages', False):
f610dbb0 951 self.to_screen('Dumping request to ' + urlh.geturl())
d6983cb4 952 dump = base64.b64encode(webpage_bytes).decode('ascii')
953 self._downloader.to_screen(dump)
f95b9dee 954 if self.get_param('write_pages'):
e121e3ce 955 filename = self._request_dump_filename(urlh.geturl(), video_id)
f95b9dee 956 self.to_screen(f'Saving request to {filename}')
d41e6efc 957 with open(filename, 'wb') as outf:
958 outf.write(webpage_bytes)
959
f95b9dee 960 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
4457823d 961 self.__check_blocked(content)
2410c43d 962
23be51d8 963 return content
d6983cb4 964
6edf2808 965 def __print_error(self, errnote, fatal, video_id, err):
966 if fatal:
c6e07cf1 967 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 968 elif errnote:
c6e07cf1 969 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 970
971 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
e2b38da9 972 if transform_source:
973 xml_string = transform_source(xml_string)
e01c3d2e 974 try:
975 return compat_etree_fromstring(xml_string.encode('utf-8'))
f9934b96 976 except xml.etree.ElementTree.ParseError as ve:
6edf2808 977 self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
267ed0c5 978
6edf2808 979 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
3d3538e4 980 try:
b7c47b74 981 return json.loads(
982 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
3d3538e4 983 except ValueError as ve:
6edf2808 984 self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
3d3538e4 985
6edf2808 986 def _parse_socket_response_as_json(self, data, *args, **kwargs):
987 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 988
617f658b 989 def __create_download_methods(name, parser, note, errnote, return_value):
990
6edf2808 991 def parse(ie, content, *args, errnote=errnote, **kwargs):
617f658b 992 if parser is None:
993 return content
6edf2808 994 if errnote is False:
995 kwargs['errnote'] = errnote
617f658b 996 # parser is fetched by name so subclasses can override it
997 return getattr(ie, parser)(content, *args, **kwargs)
998
c4910024 999 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1000 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1001 res = self._download_webpage_handle(
1002 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1003 data=data, headers=headers, query=query, expected_status=expected_status)
617f658b 1004 if res is False:
1005 return res
1006 content, urlh = res
6edf2808 1007 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
617f658b 1008
f95b9dee 1009 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
c4910024 1010 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
f95b9dee 1011 if self.get_param('load_pages'):
1012 url_or_request = self._create_request(url_or_request, data, headers, query)
1013 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1014 self.to_screen(f'Loading request from {filename}')
1015 try:
1016 with open(filename, 'rb') as dumpf:
1017 webpage_bytes = dumpf.read()
1018 except OSError as e:
1019 self.report_warning(f'Unable to load request from disk: {e}')
1020 else:
1021 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
6edf2808 1022 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
c4910024 1023 kwargs = {
1024 'note': note,
1025 'errnote': errnote,
1026 'transform_source': transform_source,
1027 'fatal': fatal,
1028 'encoding': encoding,
1029 'data': data,
1030 'headers': headers,
1031 'query': query,
1032 'expected_status': expected_status,
1033 }
617f658b 1034 if parser is None:
c4910024 1035 kwargs.pop('transform_source')
617f658b 1036 # The method is fetched by name so subclasses can override _download_..._handle
c4910024 1037 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
617f658b 1038 return res if res is False else res[0]
1039
1040 def impersonate(func, name, return_value):
1041 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1042 func.__doc__ = f'''
1043 @param transform_source Apply this transformation before parsing
1044 @returns {return_value}
1045
1046 See _download_webpage_handle docstring for other arguments specification
1047 '''
1048
1049 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1050 impersonate(download_content, f'_download_{name}', f'{return_value}')
1051 return download_handle, download_content
1052
1053 _download_xml_handle, _download_xml = __create_download_methods(
1054 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1055 _download_json_handle, _download_json = __create_download_methods(
1056 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1057 _download_socket_json_handle, _download_socket_json = __create_download_methods(
1058 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1059 __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1060
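    # The helpers generated above are used like _download_webpage; e.g. fetching
    # JSON metadata with a query string and an extra header (hypothetical URL):
    #     data = self._download_json(
    #         'https://example.com/api/video', video_id, query={'id': video_id},
    #         headers={'X-Requested-With': 'XMLHttpRequest'},
    #         note='Downloading video metadata JSON')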
617f658b 1061 def _download_webpage(
1062 self, url_or_request, video_id, note=None, errnote=None,
1063 fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
adddc50c 1064 """
617f658b 1065 Return the data of the page as a string.
adddc50c 1066
617f658b 1067 Keyword arguments:
1068 tries -- number of tries
1069 timeout -- sleep interval between tries
1070
1071 See _download_webpage_handle docstring for other arguments specification.
adddc50c 1072 """
617f658b 1073
1074 R''' # NB: These are unused; should they be deprecated?
1075 if tries != 1:
1076 self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1077 if timeout is NO_DEFAULT:
1078 timeout = 5
1079 else:
1080 self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1081 '''
1082
1083 try_count = 0
1084 while True:
1085 try:
1086 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
ac668111 1087 except http.client.IncompleteRead as e:
617f658b 1088 try_count += 1
1089 if try_count >= tries:
1090 raise e
1091 self._sleep(timeout, video_id)
adddc50c 1092
28f436ba 1093 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
a70635b8 1094 idstr = format_field(video_id, None, '%s: ')
28f436ba 1095 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1096 if only_once:
1097 if f'WARNING: {msg}' in self._printed_messages:
1098 return
1099 self._printed_messages.add(f'WARNING: {msg}')
1100 self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1101
a06916d9 1102 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1103 """Print msg to screen, prefixing it with '[ie_name]'"""
86e5f3ed 1104 self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1105
1106 def write_debug(self, msg, *args, **kwargs):
86e5f3ed 1107 self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1108
1109 def get_param(self, name, default=None, *args, **kwargs):
1110 if self._downloader:
1111 return self._downloader.params.get(name, default, *args, **kwargs)
1112 return default
d6983cb4 1113
d5d1df8a 1114 def report_drm(self, video_id, partial=NO_DEFAULT):
1115 if partial is not NO_DEFAULT:
1116 self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
88acdbc2 1117 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1118
d6983cb4 1119 def report_extraction(self, id_or_name):
1120 """Report information extraction."""
f1a9d64e 1121 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4 1122
1123 def report_download_webpage(self, video_id):
1124 """Report webpage download."""
f1a9d64e 1125 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4 1126
1127 def report_age_confirmation(self):
1128 """Report attempt to confirm age."""
f1a9d64e 1129 self.to_screen('Confirming age')
d6983cb4 1130
fc79158d 1131 def report_login(self):
1132 """Report attempt to log in."""
f1a9d64e 1133 self.to_screen('Logging in')
fc79158d 1134
b7da73eb 1135 def raise_login_required(
9d5d4d64 1136 self, msg='This video is only available for registered users',
52efa4b3 1137 metadata_available=False, method=NO_DEFAULT):
f2ebc5c7 1138 if metadata_available and (
1139 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1140 self.report_warning(msg)
7265a219 1141 return
a70635b8 1142 msg += format_field(self._login_hint(method), None, '. %s')
46890374 1143 raise ExtractorError(msg, expected=True)
43e7d3c9 1144
b7da73eb 1145 def raise_geo_restricted(
1146 self, msg='This video is not available from your location due to geo restriction',
1147 countries=None, metadata_available=False):
f2ebc5c7 1148 if metadata_available and (
1149 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1150 self.report_warning(msg)
1151 else:
1152 raise GeoRestrictedError(msg, countries=countries)
1153
1154 def raise_no_formats(self, msg, expected=False, video_id=None):
f2ebc5c7 1155 if expected and (
1156 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1157 self.report_warning(msg, video_id)
68f5867c 1158 elif isinstance(msg, ExtractorError):
1159 raise msg
b7da73eb 1160 else:
1161 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1162
5f6a1245 1163 # Methods for following #608
c0d0b01f 1164 @staticmethod
311b6615 1165 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1166 """Returns a URL that points to a page that should be processed"""
311b6615 1167 if ie is not None:
1168 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
7012b23c 1169 if video_id is not None:
311b6615 1170 kwargs['id'] = video_id
830d53bf 1171 if video_title is not None:
311b6615 1172 kwargs['title'] = video_title
1173 return {
1174 **kwargs,
1175 '_type': 'url_transparent' if url_transparent else 'url',
1176 'url': url,
1177 }
1178
8f97a15d 1179 @classmethod
1180 def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1181 getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1182 return cls.playlist_result(
1183 (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1184 playlist_id, playlist_title, **kwargs)
46b18f23 1185
c0d0b01f 1186 @staticmethod
311b6615 1187 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
d6983cb4 1188 """Returns a playlist"""
d6983cb4 1189 if playlist_id:
311b6615 1190 kwargs['id'] = playlist_id
d6983cb4 1191 if playlist_title:
311b6615 1192 kwargs['title'] = playlist_title
ecc97af3 1193 if playlist_description is not None:
311b6615 1194 kwargs['description'] = playlist_description
1195 return {
1196 **kwargs,
1197 '_type': 'multi_video' if multi_video else 'playlist',
1198 'entries': entries,
1199 }
d6983cb4 1200
c342041f 1201 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1202 """
1203 Perform a regex search on the given string, using a single or a list of
1204 patterns returning the first matching group.
1205 In case of failure return a default value or raise a WARNING or a
55b3e45b 1206 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4 1207 """
61d3665d 1208 if string is None:
1209 mobj = None
77f90330 1210 elif isinstance(pattern, (str, re.Pattern)):
d6983cb4 1211 mobj = re.search(pattern, string, flags)
1212 else:
1213 for p in pattern:
1214 mobj = re.search(p, string, flags)
c3415d1b 1215 if mobj:
1216 break
d6983cb4 1217
ec11a9f4 1218 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
d6983cb4 1219
1220 if mobj:
711ede6e 1221 if group is None:
1222 # return the first matching group
1223 return next(g for g in mobj.groups() if g is not None)
198f7ea8 1224 elif isinstance(group, (list, tuple)):
1225 return tuple(mobj.group(g) for g in group)
711ede6e 1226 else:
1227 return mobj.group(group)
c342041f 1228 elif default is not NO_DEFAULT:
d6983cb4 1229 return default
1230 elif fatal:
f1a9d64e 1231 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 1232 else:
6a39ee13 1233 self.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4 1234 return None
1235
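    # Illustrative _search_regex call (pattern and page contents hypothetical):
    #     upload_date = self._search_regex(
    #         r'"uploadDate"\s*:\s*"(\d{8})"', webpage, 'upload date', default=None)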
f0bc6e20 1236 def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
8b7fb8b6 1237 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
b7c47b74 1238 """Searches string for the JSON object specified by start_pattern"""
1239 # NB: end_pattern is only used to reduce the size of the initial match
f0bc6e20 1240 if default is NO_DEFAULT:
1241 default, has_default = {}, False
1242 else:
1243 fatal, has_default = False, True
1244
1245 json_string = self._search_regex(
8b7fb8b6 1246 rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
f0bc6e20 1247 string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1248 if not json_string:
1249 return default
1250
1251 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1252 try:
1253 return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1254 except ExtractorError as e:
1255 if fatal:
1256 raise ExtractorError(
1257 f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1258 elif not has_default:
1259 self.report_warning(
1260 f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1261 return default
b7c47b74 1262
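# Illustrative sketch (the assignment pattern and webpage variable are
# hypothetical): extract an inline JSON configuration object that follows a
# known assignment, relying on the default contains_pattern for the `{...}`
# blob and using end_pattern only to shrink the initial match:
#
#     player_config = self._search_json(
#         r'window\.playerConfig\s*=', webpage, 'player config', video_id,
#         end_pattern=';', default={})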
c342041f 1263 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1264 """
1265 Like _search_regex, but strips HTML tags and unescapes entities.
1266 """
711ede6e 1267 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1268 if res:
1269 return clean_html(res).strip()
1270 else:
1271 return res
1272
2118fdd1
RA
1273 def _get_netrc_login_info(self, netrc_machine=None):
1274 username = None
1275 password = None
1276 netrc_machine = netrc_machine or self._NETRC_MACHINE
1277
a06916d9 1278 if self.get_param('usenetrc', False):
2118fdd1 1279 try:
0001fcb5 1280 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1281 if os.path.isdir(netrc_file):
1282 netrc_file = os.path.join(netrc_file, '.netrc')
1283 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1284 if info is not None:
1285 username = info[0]
1286 password = info[2]
1287 else:
dcce092e
S
1288 raise netrc.NetrcParseError(
1289 'No authenticators for %s' % netrc_machine)
86e5f3ed 1290 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1291 self.report_warning(
dcce092e 1292 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1293
dcce092e 1294 return username, password
2118fdd1 1295
1b6712ab 1296 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1297 """
cf0649f8 1298 Get the login info as (username, password)
32443dd3
S
1299 First look for the manually specified credentials using username_option
1300 and password_option as keys in the params dictionary. If no such credentials are
1301 available, look in the netrc file using the netrc_machine or _NETRC_MACHINE
1302 value.
fc79158d
JMF
1303 If there's no info available, return (None, None)
1304 """
fc79158d
JMF
1305
1306 # Attempt to use provided username and password or .netrc data
a06916d9 1307 username = self.get_param(username_option)
1308 if username is not None:
1309 password = self.get_param(password_option)
2118fdd1 1310 else:
1b6712ab 1311 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1312
2133565c 1313 return username, password
fc79158d 1314
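# Illustrative sketch: extractors that need authentication usually call this
# from their login routine (e.g. _perform_login) and bail out when nothing is
# configured; raise_login_required is the usual way to do so:
#
#     username, password = self._get_login_info()
#     if not username:
#         self.raise_login_required()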
e64b7569 1315 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1316 """
1317 Get the two-factor authentication info
1318 TODO - asking the user will be required for SMS/phone verification;
1319 currently this just uses the command-line option.
1320 If there's no info available, return None
1321 """
83317f69 1322
a06916d9 1323 tfa = self.get_param('twofactor')
1324 if tfa is not None:
1325 return tfa
83317f69 1326
ac668111 1327 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1328
46720279
JMF
1329 # Helper functions for extracting OpenGraph info
1330 @staticmethod
ab2d5247 1331 def _og_regexes(prop):
448ef1f3 1332 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
fbfde1c3
F
1333 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1334 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1335 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1336 return [
78fb87b2
JMF
1337 template % (property_re, content_re),
1338 template % (content_re, property_re),
ab2d5247 1339 ]
46720279 1340
864f24bd
S
1341 @staticmethod
1342 def _meta_regex(prop):
1343 return r'''(?isx)<meta
8b9848ac 1344 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1345 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1346
3c4e6d83 1347 def _og_search_property(self, prop, html, name=None, **kargs):
6606817a 1348 prop = variadic(prop)
46720279 1349 if name is None:
b070564e
S
1350 name = 'OpenGraph %s' % prop[0]
1351 og_regexes = []
1352 for p in prop:
1353 og_regexes.extend(self._og_regexes(p))
1354 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1355 if escaped is None:
1356 return None
1357 return unescapeHTML(escaped)
46720279
JMF
1358
1359 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1360 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1361
1362 def _og_search_description(self, html, **kargs):
1363 return self._og_search_property('description', html, fatal=False, **kargs)
1364
04f3fd2c 1365 def _og_search_title(self, html, *, fatal=False, **kargs):
1366 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1367
8ffa13e0 1368 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1369 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1370 if secure:
1371 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1372 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1373
78338f71
JMF
1374 def _og_search_url(self, html, **kargs):
1375 return self._og_search_property('url', html, **kargs)
1376
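# Illustrative sketch (webpage and video_id are assumed from the caller): the
# OpenGraph helpers above are commonly combined into the returned info dict:
#
#     return {
#         'id': video_id,
#         'title': self._og_search_title(webpage),
#         'description': self._og_search_description(webpage),
#         'thumbnail': self._og_search_thumbnail(webpage),
#     }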
04f3fd2c 1377 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1378 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1379
40c696e5 1380 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
6606817a 1381 name = variadic(name)
59040888 1382 if display_name is None:
88d9f6c0 1383 display_name = name[0]
59040888 1384 return self._html_search_regex(
88d9f6c0 1385 [self._meta_regex(n) for n in name],
711ede6e 1386 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1387
1388 def _dc_search_uploader(self, html):
1389 return self._html_search_meta('dc.creator', html, 'uploader')
1390
8f97a15d 1391 @staticmethod
1392 def _rta_search(html):
8dbe9899
PH
1393 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1394 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1395 r' content="RTA-5042-1996-1400-1577-RTA"',
1396 html):
1397 return 18
8f97a15d 1398
1399 # And then there are the jokers who advertise that they use RTA, but actually don't.
1400 AGE_LIMIT_MARKERS = [
1401 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1402 ]
1403 if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1404 return 18
8dbe9899
PH
1405 return 0
1406
59040888
PH
1407 def _media_rating_search(self, html):
1408 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1409 rating = self._html_search_meta('rating', html)
1410
1411 if not rating:
1412 return None
1413
1414 RATING_TABLE = {
1415 'safe for kids': 0,
1416 'general': 8,
1417 '14 years': 14,
1418 'mature': 17,
1419 'restricted': 19,
1420 }
d800609c 1421 return RATING_TABLE.get(rating.lower())
59040888 1422
69319969 1423 def _family_friendly_search(self, html):
6ca7732d 1424 # See http://schema.org/VideoObject
ac8491fc
S
1425 family_friendly = self._html_search_meta(
1426 'isFamilyFriendly', html, default=None)
69319969
NJ
1427
1428 if not family_friendly:
1429 return None
1430
1431 RATING_TABLE = {
1432 '1': 0,
1433 'true': 0,
1434 '0': 18,
1435 'false': 18,
1436 }
d800609c 1437 return RATING_TABLE.get(family_friendly.lower())
69319969 1438
0c708f11
JMF
1439 def _twitter_search_player(self, html):
1440 return self._html_search_meta('twitter:player', html,
9e1a5b84 1441 'twitter card player')
0c708f11 1442
0c36dc00 1443 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1444 """Yield all json ld objects in the html"""
1445 if default is not NO_DEFAULT:
1446 fatal = False
1447 for mobj in re.finditer(JSON_LD_RE, html):
1448 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1449 for json_ld in variadic(json_ld_item):
1450 if isinstance(json_ld, dict):
1451 yield json_ld
1452
1453 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1454 """Search for a video in any json ld in the html"""
1455 if default is not NO_DEFAULT:
1456 fatal = False
1457 info = self._json_ld(
1458 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1459 video_id, fatal=fatal, expected_type=expected_type)
1460 if info:
1461 return info
4433bb02
S
1462 if default is not NO_DEFAULT:
1463 return default
1464 elif fatal:
1465 raise RegexNotFoundError('Unable to extract JSON-LD')
1466 else:
6a39ee13 1467 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
4433bb02 1468 return {}
4ca2a3cf 1469
95b31e26 1470 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
14f25df2 1471 if isinstance(json_ld, str):
4ca2a3cf
S
1472 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1473 if not json_ld:
1474 return {}
1475 info = {}
bae14048 1476
e7e4a6e0
S
1477 INTERACTION_TYPE_MAP = {
1478 'CommentAction': 'comment',
1479 'AgreeAction': 'like',
1480 'DisagreeAction': 'dislike',
1481 'LikeAction': 'like',
1482 'DislikeAction': 'dislike',
1483 'ListenAction': 'view',
1484 'WatchAction': 'view',
1485 'ViewAction': 'view',
1486 }
1487
f3c0c773 1488 def is_type(e, *expected_types):
1489 type = variadic(traverse_obj(e, '@type'))
1490 return any(x in type for x in expected_types)
1491
29f7c58a 1492 def extract_interaction_type(e):
1493 interaction_type = e.get('interactionType')
1494 if isinstance(interaction_type, dict):
1495 interaction_type = interaction_type.get('@type')
1496 return str_or_none(interaction_type)
1497
e7e4a6e0
S
1498 def extract_interaction_statistic(e):
1499 interaction_statistic = e.get('interactionStatistic')
29f7c58a 1500 if isinstance(interaction_statistic, dict):
1501 interaction_statistic = [interaction_statistic]
e7e4a6e0
S
1502 if not isinstance(interaction_statistic, list):
1503 return
1504 for is_e in interaction_statistic:
f3c0c773 1505 if not is_type(is_e, 'InteractionCounter'):
e7e4a6e0 1506 continue
29f7c58a 1507 interaction_type = extract_interaction_type(is_e)
1508 if not interaction_type:
e7e4a6e0 1509 continue
ce5b9040
S
1510 # For the interaction count some sites provide a string instead of
1511 # an integer (as per spec), with non-digit characters (e.g. ","),
1512 # so extract the count with the more relaxed str_to_int
1513 interaction_count = str_to_int(is_e.get('userInteractionCount'))
e7e4a6e0
S
1514 if interaction_count is None:
1515 continue
1516 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1517 if not count_kind:
1518 continue
1519 count_key = '%s_count' % count_kind
1520 if info.get(count_key) is not None:
1521 continue
1522 info[count_key] = interaction_count
1523
f5225737 1524 def extract_chapter_information(e):
1525 chapters = [{
1526 'title': part.get('name'),
1527 'start_time': part.get('startOffset'),
1528 'end_time': part.get('endOffset'),
85553414 1529 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
f5225737 1530 for idx, (last_c, current_c, next_c) in enumerate(zip(
1531 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1532 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1533 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1534 if None in current_c.values():
1535 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1536 return
1537 if chapters:
1538 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1539 info['chapters'] = chapters
1540
bae14048 1541 def extract_video_object(e):
f7ad7160 1542 author = e.get('author')
bae14048 1543 info.update({
0c36dc00 1544 'url': url_or_none(e.get('contentUrl')),
0f60ba6e 1545 'ext': mimetype2ext(e.get('encodingFormat')),
bae14048
S
1546 'title': unescapeHTML(e.get('name')),
1547 'description': unescapeHTML(e.get('description')),
eb2333bc 1548 'thumbnails': [{'url': unescapeHTML(url)}
21633673 1549 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1550 if url_or_none(url)],
bae14048
S
1551 'duration': parse_duration(e.get('duration')),
1552 'timestamp': unified_timestamp(e.get('uploadDate')),
f7ad7160 1553 # author can be an instance of the 'Organization' or 'Person' types.
1554 # Both types can have a 'name' property (inherited from the 'Thing' type). [1]
1555 # However, some websites use the 'Text' type instead.
1556 # 1. https://schema.org/VideoObject
14f25df2 1557 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
0f60ba6e 1558 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
56ba69e4 1559 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
bae14048
S
1560 'tbr': int_or_none(e.get('bitrate')),
1561 'width': int_or_none(e.get('width')),
1562 'height': int_or_none(e.get('height')),
33a81c2c 1563 'view_count': int_or_none(e.get('interactionCount')),
0f60ba6e 1564 'tags': try_call(lambda: e.get('keywords').split(',')),
bae14048 1565 })
0f60ba6e 1566 if is_type(e, 'AudioObject'):
1567 info.update({
1568 'vcodec': 'none',
1569 'abr': int_or_none(e.get('bitrate')),
1570 })
e7e4a6e0 1571 extract_interaction_statistic(e)
f5225737 1572 extract_chapter_information(e)
bae14048 1573
d5c32548 1574 def traverse_json_ld(json_ld, at_top_level=True):
1d55ebab
SS
1575 for e in variadic(json_ld):
1576 if not isinstance(e, dict):
1577 continue
d5c32548
ZM
1578 if at_top_level and '@context' not in e:
1579 continue
1580 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1d55ebab 1581 traverse_json_ld(e['@graph'], at_top_level=False)
c13a301a 1582 continue
f3c0c773 1583 if expected_type is not None and not is_type(e, expected_type):
4433bb02 1584 continue
8f122fa0 1585 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1586 if rating is not None:
1587 info['average_rating'] = rating
f3c0c773 1588 if is_type(e, 'TVEpisode', 'Episode'):
440863ad 1589 episode_name = unescapeHTML(e.get('name'))
46933a15 1590 info.update({
440863ad 1591 'episode': episode_name,
46933a15
S
1592 'episode_number': int_or_none(e.get('episodeNumber')),
1593 'description': unescapeHTML(e.get('description')),
1594 })
440863ad
S
1595 if not info.get('title') and episode_name:
1596 info['title'] = episode_name
46933a15 1597 part_of_season = e.get('partOfSeason')
f3c0c773 1598 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
458fd30f
S
1599 info.update({
1600 'season': unescapeHTML(part_of_season.get('name')),
1601 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1602 })
d16b3c66 1603 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
f3c0c773 1604 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1605 info['series'] = unescapeHTML(part_of_series.get('name'))
f3c0c773 1606 elif is_type(e, 'Movie'):
391256dc
S
1607 info.update({
1608 'title': unescapeHTML(e.get('name')),
1609 'description': unescapeHTML(e.get('description')),
1610 'duration': parse_duration(e.get('duration')),
1611 'timestamp': unified_timestamp(e.get('dateCreated')),
1612 })
f3c0c773 1613 elif is_type(e, 'Article', 'NewsArticle'):
46933a15
S
1614 info.update({
1615 'timestamp': parse_iso8601(e.get('datePublished')),
1616 'title': unescapeHTML(e.get('headline')),
d5c32548 1617 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
46933a15 1618 })
f3c0c773 1619 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
2edb38e8 1620 extract_video_object(e['video'][0])
f3c0c773 1621 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
e50c3500 1622 extract_video_object(e['subjectOf'][0])
0f60ba6e 1623 elif is_type(e, 'VideoObject', 'AudioObject'):
bae14048 1624 extract_video_object(e)
4433bb02
S
1625 if expected_type is None:
1626 continue
1627 else:
1628 break
c69701c6 1629 video = e.get('video')
f3c0c773 1630 if is_type(video, 'VideoObject'):
c69701c6 1631 extract_video_object(video)
4433bb02
S
1632 if expected_type is None:
1633 continue
1634 else:
1635 break
d5c32548 1636
1d55ebab 1637 traverse_json_ld(json_ld)
90137ca4 1638 return filter_dict(info)
4ca2a3cf 1639
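# Illustrative sketch (webpage, video_id and formats are assumed from the
# surrounding extractor): a caller typically merges the JSON-LD metadata into
# its own info dict, letting explicitly extracted fields take precedence:
#
#     info = self._search_json_ld(webpage, video_id,
#                                 expected_type='VideoObject', default={})
#     return {**info, 'id': video_id, 'formats': formats}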
135dfa2c 1640 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1641 return self._parse_json(
1642 self._search_regex(
1643 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1644 webpage, 'next.js data', fatal=fatal, **kw),
1645 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1646
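# Illustrative sketch (the property path is hypothetical and site-specific):
# the returned structure mirrors the page's __NEXT_DATA__ JSON, so callers
# usually drill into it with traverse_obj:
#
#     nextjs_data = self._search_nextjs_data(webpage, video_id)
#     video_data = traverse_obj(nextjs_data, ('props', 'pageProps', 'video'))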
8072ef2b 1647 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1648 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
66f4c04e 1649 rectx = re.escape(context_name)
8072ef2b 1650 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
66f4c04e 1651 js, arg_keys, arg_vals = self._search_regex(
8072ef2b 1652 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
f7fc8d39 1653 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1654 default=NO_DEFAULT if fatal else (None, None, None))
1655 if js is None:
1656 return {}
66f4c04e
THD
1657
1658 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1659
1660 for key, val in args.items():
1661 if val in ('undefined', 'void 0'):
1662 args[key] = 'null'
1663
8072ef2b 1664 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1665 return traverse_obj(ret, traverse) or {}
66f4c04e 1666
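# Illustrative sketch (the key names are hypothetical): with the default
# traverse=('data', 0) the first data entry is already unwrapped, so a caller
# might simply do:
#
#     nuxt_data = self._search_nuxt_data(webpage, video_id)
#     title = nuxt_data.get('title')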
27713812 1667 @staticmethod
f8da79f8 1668 def _hidden_inputs(html):
586f1cc5 1669 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1670 hidden_inputs = {}
c8498368
S
1671 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1672 attrs = extract_attributes(input)
1673 if not attrs:
201ea3ee 1674 continue
c8498368 1675 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1676 continue
c8498368
S
1677 name = attrs.get('name') or attrs.get('id')
1678 value = attrs.get('value')
1679 if name and value is not None:
1680 hidden_inputs[name] = value
201ea3ee 1681 return hidden_inputs
27713812 1682
cf61d96d
S
1683 def _form_hidden_inputs(self, form_id, html):
1684 form = self._search_regex(
73eb13df 1685 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1686 html, '%s form' % form_id, group='form')
1687 return self._hidden_inputs(form)
1688
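# Illustrative sketch (the form id and field names are hypothetical): login
# flows commonly seed their POST data from a form's hidden inputs and then
# overwrite the user-visible fields:
#
#     login_form = self._form_hidden_inputs('login-form', webpage)
#     login_form.update({'username': username, 'password': password})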
eb8a4433 1689 class FormatSort:
b050d210 1690 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
eb8a4433 1691
8326b00a 1692 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
7e798d72 1693 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1694 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
198e3a04 1695 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
53ed7066 1696 'height', 'width', 'proto', 'vext', 'abr', 'aext',
f304da8a 1697 'fps', 'fs_approx', 'source', 'id')
eb8a4433 1698
1699 settings = {
1700 'vcodec': {'type': 'ordered', 'regex': True,
155d2b48 1701 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
eb8a4433 1702 'acodec': {'type': 'ordered', 'regex': True,
a10aa588 1703 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
176f1866 1704 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1705 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
f137c99e 1706 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
f304da8a 1707 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
eb8a4433 1708 'vext': {'type': 'ordered', 'field': 'video_ext',
91ebc640 1709 'order': ('mp4', 'webm', 'flv', '', 'none'),
eb8a4433 1710 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1711 'aext': {'type': 'ordered', 'field': 'audio_ext',
1712 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
f2e9fa3e 1713 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
eb8a4433 1714 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
f5510afe 1715 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
8326b00a 1716 'field': ('vcodec', 'acodec'),
1717 'function': lambda it: int(any(v != 'none' for v in it))},
f983b875 1718 'ie_pref': {'priority': True, 'type': 'extractor'},
63be1aab 1719 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1720 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
10beccc9 1721 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1722 'quality': {'convert': 'float', 'default': -1},
eb8a4433 1723 'filesize': {'convert': 'bytes'},
f137c99e 1724 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1725 'id': {'convert': 'string', 'field': 'format_id'},
eb8a4433 1726 'height': {'convert': 'float_none'},
1727 'width': {'convert': 'float_none'},
1728 'fps': {'convert': 'float_none'},
b8ed0f15 1729 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
eb8a4433 1730 'tbr': {'convert': 'float_none'},
1731 'vbr': {'convert': 'float_none'},
1732 'abr': {'convert': 'float_none'},
1733 'asr': {'convert': 'float_none'},
10beccc9 1734 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
63be1aab 1735
eb8a4433 1736 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
63be1aab 1737 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1738 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1739 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
f5510afe 1740 'res': {'type': 'multiple', 'field': ('height', 'width'),
dbf5416a 1741 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
63be1aab 1742
b8ed0f15 1743 # Actual field names
19188702 1744 'format_id': {'type': 'alias', 'field': 'id'},
1745 'preference': {'type': 'alias', 'field': 'ie_pref'},
1746 'language_preference': {'type': 'alias', 'field': 'lang'},
63be1aab 1747 'source_preference': {'type': 'alias', 'field': 'source'},
08d30158 1748 'protocol': {'type': 'alias', 'field': 'proto'},
63be1aab 1749 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
b8ed0f15 1750 'audio_channels': {'type': 'alias', 'field': 'channels'},
08d30158 1751
1752 # Deprecated
1753 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1754 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1755 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1756 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1757 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1758 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1759 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1760 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1761 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1762 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1763 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1764 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1765 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1766 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1767 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1768 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1769 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1770 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1771 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1772 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
63be1aab 1773 }
eb8a4433 1774
f304da8a 1775 def __init__(self, ie, field_preference):
1776 self._order = []
1777 self.ydl = ie._downloader
1778 self.evaluate_params(self.ydl.params, field_preference)
1779 if ie.get_param('verbose'):
1780 self.print_verbose_info(self.ydl.write_debug)
eb8a4433 1781
1782 def _get_field_setting(self, field, key):
1783 if field not in self.settings:
ee8dd27a 1784 if key in ('forced', 'priority'):
1785 return False
da4db748 1786 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1787 'deprecated and may be removed in a future version')
eb8a4433 1788 self.settings[field] = {}
1789 propObj = self.settings[field]
1790 if key not in propObj:
1791 type = propObj.get('type')
1792 if key == 'field':
1793 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1794 elif key == 'convert':
1795 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
4bcc7bd1 1796 else:
f5510afe 1797 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
eb8a4433 1798 propObj[key] = default
1799 return propObj[key]
1800
1801 def _resolve_field_value(self, field, value, convertNone=False):
1802 if value is None:
1803 if not convertNone:
1804 return None
4bcc7bd1 1805 else:
eb8a4433 1806 value = value.lower()
1807 conversion = self._get_field_setting(field, 'convert')
1808 if conversion == 'ignore':
1809 return None
1810 if conversion == 'string':
1811 return value
1812 elif conversion == 'float_none':
1813 return float_or_none(value)
1814 elif conversion == 'bytes':
1815 return FileDownloader.parse_bytes(value)
1816 elif conversion == 'order':
da9be05e 1817 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
eb8a4433 1818 use_regex = self._get_field_setting(field, 'regex')
1819 list_length = len(order_list)
1820 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1821 if use_regex and value is not None:
da9be05e 1822 for i, regex in enumerate(order_list):
eb8a4433 1823 if regex and re.match(regex, value):
1824 return list_length - i
1825 return list_length - empty_pos # not in list
1826 else: # not regex or value = None
1827 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1828 else:
1829 if value.isnumeric():
1830 return float(value)
4bcc7bd1 1831 else:
eb8a4433 1832 self.settings[field]['convert'] = 'string'
1833 return value
1834
1835 def evaluate_params(self, params, sort_extractor):
1836 self._use_free_order = params.get('prefer_free_formats', False)
1837 self._sort_user = params.get('format_sort', [])
1838 self._sort_extractor = sort_extractor
1839
1840 def add_item(field, reverse, closest, limit_text):
1841 field = field.lower()
1842 if field in self._order:
1843 return
1844 self._order.append(field)
1845 limit = self._resolve_field_value(field, limit_text)
1846 data = {
1847 'reverse': reverse,
1848 'closest': False if limit is None else closest,
1849 'limit_text': limit_text,
1850 'limit': limit}
1851 if field in self.settings:
1852 self.settings[field].update(data)
1853 else:
1854 self.settings[field] = data
1855
1856 sort_list = (
1857 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1858 + (tuple() if params.get('format_sort_force', False)
1859 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1860 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1861
1862 for item in sort_list:
1863 match = re.match(self.regex, item)
1864 if match is None:
1865 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1866 field = match.group('field')
1867 if field is None:
1868 continue
1869 if self._get_field_setting(field, 'type') == 'alias':
ee8dd27a 1870 alias, field = field, self._get_field_setting(field, 'field')
08d30158 1871 if self._get_field_setting(alias, 'deprecated'):
da4db748 1872 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
a057779d 1873 f'be removed in a future version. Please use {field} instead')
eb8a4433 1874 reverse = match.group('reverse') is not None
b050d210 1875 closest = match.group('separator') == '~'
eb8a4433 1876 limit_text = match.group('limit')
1877
1878 has_limit = limit_text is not None
1879 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1880 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1881
1882 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
b5ae35ee 1883 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
eb8a4433 1884 limit_count = len(limits)
1885 for (i, f) in enumerate(fields):
1886 add_item(f, reverse, closest,
1887 limits[i] if i < limit_count
1888 else limits[0] if has_limit and not has_multiple_limits
1889 else None)
1890
0760b0a7 1891 def print_verbose_info(self, write_debug):
b31fdeed 1892 if self._sort_user:
0760b0a7 1893 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
eb8a4433 1894 if self._sort_extractor:
0760b0a7 1895 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1896 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
eb8a4433 1897 '+' if self._get_field_setting(field, 'reverse') else '', field,
1898 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1899 self._get_field_setting(field, 'limit_text'),
1900 self._get_field_setting(field, 'limit'))
1901 if self._get_field_setting(field, 'limit_text') is not None else '')
1902 for field in self._order if self._get_field_setting(field, 'visible')]))
1903
1904 def _calculate_field_preference_from_value(self, format, field, type, value):
1905 reverse = self._get_field_setting(field, 'reverse')
1906 closest = self._get_field_setting(field, 'closest')
1907 limit = self._get_field_setting(field, 'limit')
1908
1909 if type == 'extractor':
1910 maximum = self._get_field_setting(field, 'max')
1911 if value is None or (maximum is not None and value >= maximum):
f983b875 1912 value = -1
eb8a4433 1913 elif type == 'boolean':
1914 in_list = self._get_field_setting(field, 'in_list')
1915 not_in_list = self._get_field_setting(field, 'not_in_list')
1916 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1917 elif type == 'ordered':
1918 value = self._resolve_field_value(field, value, True)
1919
1920 # try to convert to number
6a04a74e 1921 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
eb8a4433 1922 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1923 if is_num:
1924 value = val_num
1925
1926 return ((-10, 0) if value is None
1927 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1928 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1929 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1930 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1931 else (-1, value, 0))
1932
1933 def _calculate_field_preference(self, format, field):
1934 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1935 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1936 if type == 'multiple':
1937 type = 'field' # Only 'field' is allowed in multiple for now
1938 actual_fields = self._get_field_setting(field, 'field')
1939
f5510afe 1940 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
eb8a4433 1941 else:
1942 value = get_value(field)
1943 return self._calculate_field_preference_from_value(format, field, type, value)
1944
1945 def calculate_preference(self, format):
1946 # Determine missing protocol
1947 if not format.get('protocol'):
1948 format['protocol'] = determine_protocol(format)
1949
1950 # Determine missing ext
1951 if not format.get('ext') and 'url' in format:
1952 format['ext'] = determine_ext(format['url'])
1953 if format.get('vcodec') == 'none':
8326b00a 1954 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
eb8a4433 1955 format['video_ext'] = 'none'
1956 else:
1957 format['video_ext'] = format['ext']
1958 format['audio_ext'] = 'none'
1959 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1960 # format['preference'] = -1000
1961
1962 # Determine missing bitrates
1963 if format.get('tbr') is None:
1964 if format.get('vbr') is not None and format.get('abr') is not None:
1965 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1966 else:
b5ae35ee 1967 if format.get('vcodec') != 'none' and format.get('vbr') is None:
eb8a4433 1968 format['vbr'] = format.get('tbr') - format.get('abr', 0)
b5ae35ee 1969 if format.get('acodec') != 'none' and format.get('abr') is None:
eb8a4433 1970 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1971
1972 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1973
1974 def _sort_formats(self, formats, field_preference=[]):
1975 if not formats:
88acdbc2 1976 return
1d485a1a 1977 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
59040888 1978
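# Illustrative sketch (the ordering below is only an example, not a
# recommendation): an extractor can bias the sort for its site by passing a
# field preference understood by FormatSort, e.g. preferring bitrate over
# resolution:
#
#     self._sort_formats(formats, field_preference=('tbr', 'res', 'proto'))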
96a53167
S
1979 def _check_formats(self, formats, video_id):
1980 if formats:
1981 formats[:] = filter(
1982 lambda f: self._is_valid_url(
1983 f['url'], video_id,
1984 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1985 formats)
1986
f5bdb444
S
1987 @staticmethod
1988 def _remove_duplicate_formats(formats):
1989 format_urls = set()
1990 unique_formats = []
1991 for f in formats:
1992 if f['url'] not in format_urls:
1993 format_urls.add(f['url'])
1994 unique_formats.append(f)
1995 formats[:] = unique_formats
1996
45024183 1997 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1998 url = self._proto_relative_url(url, scheme='http:')
1999 # For now assume non HTTP(S) URLs always valid
2000 if not (url.startswith('http://') or url.startswith('https://')):
2001 return True
96a53167 2002 try:
45024183 2003 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 2004 return True
8bdd16b4 2005 except ExtractorError as e:
25e911a9 2006 self.to_screen(
8bdd16b4 2007 '%s: %s URL is invalid, skipping: %s'
2008 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 2009 return False
96a53167 2010
20991253 2011 def http_scheme(self):
1ede5b24 2012 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
2013 return (
2014 'http:'
a06916d9 2015 if self.get_param('prefer_insecure', False)
20991253
PH
2016 else 'https:')
2017
57c7411f 2018 def _proto_relative_url(self, url, scheme=None):
8f97a15d 2019 scheme = scheme or self.http_scheme()
2020 assert scheme.endswith(':')
2021 return sanitize_url(url, scheme=scheme[:-1])
57c7411f 2022
4094b6e3
PH
2023 def _sleep(self, timeout, video_id, msg_template=None):
2024 if msg_template is None:
f1a9d64e 2025 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
2026 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2027 self.to_screen(msg)
2028 time.sleep(timeout)
2029
f983b875 2030 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 2031 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 2032 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
a076c1f9 2033 res = self._download_xml_handle(
f036a632 2034 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
2035 'Unable to download f4m manifest',
2036 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 2037 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 2038 transform_source=transform_source,
7360c06f 2039 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 2040 if res is False:
8d29e47f 2041 return []
31bb8d3f 2042
a076c1f9
E
2043 manifest, urlh = res
2044 manifest_url = urlh.geturl()
2045
0fdbb332 2046 return self._parse_f4m_formats(
f983b875 2047 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 2048 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 2049
f983b875 2050 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
0fdbb332 2051 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 2052 fatal=True, m3u8_id=None):
f9934b96 2053 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
d9eb580a
S
2054 return []
2055
7a5c1cfe 2056 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
fb72ec58 2057 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
2058 if akamai_pv is not None and ';' in akamai_pv.text:
2059 playerVerificationChallenge = akamai_pv.text.split(';')[0]
2060 if playerVerificationChallenge.strip() != '':
2061 return []
2062
31bb8d3f 2063 formats = []
7a47d07c 2064 manifest_version = '1.0'
b2527359 2065 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 2066 if not media_nodes:
7a47d07c 2067 manifest_version = '2.0'
34e48bed 2068 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762 2069 # Remove unsupported DRM protected media from final formats
067aa17e 2070 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
b22ca762
S
2071 media_nodes = remove_encrypted_media(media_nodes)
2072 if not media_nodes:
2073 return formats
48107c19
S
2074
2075 manifest_base_url = get_base_url(manifest)
0a5685b2 2076
a6571f10 2077 bootstrap_info = xpath_element(
0a5685b2
YCH
2078 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
2079 'bootstrap info', default=None)
2080
edd6074c
RA
2081 vcodec = None
2082 mime_type = xpath_text(
2083 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
2084 'base URL', default=None)
2085 if mime_type and mime_type.startswith('audio/'):
2086 vcodec = 'none'
2087
b2527359 2088 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
2089 tbr = int_or_none(media_el.attrib.get('bitrate'))
2090 width = int_or_none(media_el.attrib.get('width'))
2091 height = int_or_none(media_el.attrib.get('height'))
34921b43 2092 format_id = join_nonempty(f4m_id, tbr or i)
448bb5f3
YCH
2093 # If <bootstrapInfo> is present, the specified f4m is a
2094 # stream-level manifest, and only set-level manifests may refer to
2095 # external resources. See section 11.4 and section 4 of F4M spec
2096 if bootstrap_info is None:
2097 media_url = None
2098 # @href is introduced in 2.0, see section 11.6 of F4M spec
2099 if manifest_version == '2.0':
2100 media_url = media_el.attrib.get('href')
2101 if media_url is None:
2102 media_url = media_el.attrib.get('url')
31c746e5
S
2103 if not media_url:
2104 continue
cc357c4d
S
2105 manifest_url = (
2106 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 2107 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
2108 # If media_url is itself an f4m manifest, do the recursive extraction,
2109 # since bitrates in the parent manifest (this one) and the media_url manifest
2110 # may differ, making it impossible to resolve the format by the requested
2111 # bitrate in the f4m downloader
240b6045
YCH
2112 ext = determine_ext(manifest_url)
2113 if ext == 'f4m':
77b8b4e6 2114 f4m_formats = self._extract_f4m_formats(
f983b875 2115 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
77b8b4e6
S
2116 transform_source=transform_source, fatal=fatal)
2117 # Sometimes a stream-level manifest contains a single media entry that
2118 # does not carry any quality metadata (e.g. http://matchtv.ru/#live-player).
2119 # At the same time, the parent's media entry in the set-level manifest may
2120 # contain it. We copy it from the parent in such cases.
2121 if len(f4m_formats) == 1:
2122 f = f4m_formats[0]
2123 f.update({
2124 'tbr': f.get('tbr') or tbr,
2125 'width': f.get('width') or width,
2126 'height': f.get('height') or height,
2127 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 2128 'vcodec': vcodec,
77b8b4e6
S
2129 })
2130 formats.extend(f4m_formats)
70f0f5a8 2131 continue
240b6045
YCH
2132 elif ext == 'm3u8':
2133 formats.extend(self._extract_m3u8_formats(
2134 manifest_url, video_id, 'mp4', preference=preference,
f983b875 2135 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
240b6045 2136 continue
31bb8d3f 2137 formats.append({
77b8b4e6 2138 'format_id': format_id,
31bb8d3f 2139 'url': manifest_url,
30d0b549 2140 'manifest_url': manifest_url,
a6571f10 2141 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 2142 'protocol': 'f4m',
b2527359 2143 'tbr': tbr,
77b8b4e6
S
2144 'width': width,
2145 'height': height,
edd6074c 2146 'vcodec': vcodec,
60ca389c 2147 'preference': preference,
f983b875 2148 'quality': quality,
31bb8d3f 2149 })
31bb8d3f
JMF
2150 return formats
2151
f983b875 2152 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 2153 return {
34921b43 2154 'format_id': join_nonempty(m3u8_id, 'meta'),
704df56d
PH
2155 'url': m3u8_url,
2156 'ext': ext,
2157 'protocol': 'm3u8',
37768f92 2158 'preference': preference - 100 if preference else -100,
f983b875 2159 'quality': quality,
704df56d
PH
2160 'resolution': 'multiple',
2161 'format_note': 'Quality selection URL',
16da9bbc
YCH
2162 }
2163
b5ae35ee 2164 def _report_ignoring_subs(self, name):
2165 self.report_warning(bug_reports_message(
2166 f'Ignoring subtitle tracks found in the {name} manifest; '
2167 'if any subtitle tracks are missing,'
2168 ), only_once=True)
2169
a0c3b2d5
F
2170 def _extract_m3u8_formats(self, *args, **kwargs):
2171 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2172 if subs:
b5ae35ee 2173 self._report_ignoring_subs('HLS')
a0c3b2d5
F
2174 return fmts
2175
2176 def _extract_m3u8_formats_and_subtitles(
177877c5 2177 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2178 preference=None, quality=None, m3u8_id=None, note=None,
2179 errnote=None, fatal=True, live=False, data=None, headers={},
2180 query={}):
2181
dbd82a1d 2182 res = self._download_webpage_handle(
81515ad9 2183 m3u8_url, video_id,
37a3bb66 2184 note='Downloading m3u8 information' if note is None else note,
2185 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 2186 fatal=fatal, data=data, headers=headers, query=query)
cb252080 2187
dbd82a1d 2188 if res is False:
a0c3b2d5 2189 return [], {}
cb252080 2190
dbd82a1d 2191 m3u8_doc, urlh = res
37113045 2192 m3u8_url = urlh.geturl()
9cdffeeb 2193
a0c3b2d5 2194 return self._parse_m3u8_formats_and_subtitles(
cb252080 2195 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 2196 preference=preference, quality=quality, m3u8_id=m3u8_id,
2197 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2198 headers=headers, query=query, video_id=video_id)
cb252080 2199
a0c3b2d5 2200 def _parse_m3u8_formats_and_subtitles(
42676437 2201 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2202 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2203 errnote=None, fatal=True, data=None, headers={}, query={},
2204 video_id=None):
60755938 2205 formats, subtitles = [], {}
a0c3b2d5 2206
6b993ca7 2207 has_drm = re.search('|'.join([
2208 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2209 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2210 ]), m3u8_doc)
a0c3b2d5 2211
60755938 2212 def format_url(url):
14f25df2 2213 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
60755938 2214
2215 if self.get_param('hls_split_discontinuity', False):
2216 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2217 if not m3u8_doc:
2218 if not manifest_url:
2219 return []
2220 m3u8_doc = self._download_webpage(
2221 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2222 note=False, errnote='Failed to download m3u8 playlist information')
2223 if m3u8_doc is False:
2224 return []
2225 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 2226
60755938 2227 else:
2228 def _extract_m3u8_playlist_indices(*args, **kwargs):
2229 return [None]
310c2ed2 2230
cb252080
S
2231 # References:
2232 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2233 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2234 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2235
2236 # We should try extracting formats only from master playlists [1, 4.3.4],
2237 # i.e. playlists that describe available qualities. On the other hand
2238 # media playlists [1, 4.3.3] should be returned as is since they contain
2239 # just the media without quality renditions.
9cdffeeb 2240 # Fortunately, a master playlist can be easily distinguished from a media
cb252080 2241 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
a0566bbf 2242 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2243 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
2244 # media playlist and MUST NOT appear in a master playlist, thus we can
2245 # clearly detect a media playlist with this criterion.
2246
9cdffeeb 2247 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2248 formats = [{
34921b43 2249 'format_id': join_nonempty(m3u8_id, idx),
60755938 2250 'format_index': idx,
42676437 2251 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2252 'ext': ext,
2253 'protocol': entry_protocol,
2254 'preference': preference,
2255 'quality': quality,
88acdbc2 2256 'has_drm': has_drm,
60755938 2257 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2258
a0c3b2d5 2259 return formats, subtitles
cb252080
S
2260
2261 groups = {}
2262 last_stream_inf = {}
2263
2264 def extract_media(x_media_line):
2265 media = parse_m3u8_attributes(x_media_line)
2266 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2267 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2268 if not (media_type and group_id and name):
2269 return
2270 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2271 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2272 if media_type == 'SUBTITLES':
3907333c 2273 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2274 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2275 # However, lack of URI has been spotted in the wild.
2276 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2277 if not media.get('URI'):
2278 return
a0c3b2d5
F
2279 url = format_url(media['URI'])
2280 sub_info = {
2281 'url': url,
2282 'ext': determine_ext(url),
2283 }
4a2f19ab
F
2284 if sub_info['ext'] == 'm3u8':
2285 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2286 # files may contain is WebVTT:
2287 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2288 sub_info['ext'] = 'vtt'
2289 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2290 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2291 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2292 if media_type not in ('VIDEO', 'AUDIO'):
2293 return
2294 media_url = media.get('URI')
2295 if media_url:
310c2ed2 2296 manifest_url = format_url(media_url)
60755938 2297 formats.extend({
34921b43 2298 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2299 'format_note': name,
2300 'format_index': idx,
2301 'url': manifest_url,
2302 'manifest_url': m3u8_url,
2303 'language': media.get('LANGUAGE'),
2304 'ext': ext,
2305 'protocol': entry_protocol,
2306 'preference': preference,
2307 'quality': quality,
2308 'vcodec': 'none' if media_type == 'AUDIO' else None,
2309 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2310
2311 def build_stream_name():
2312 # Although the specification does not mention a NAME attribute for
3019cb0c
S
2313 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2314 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2315 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2316 stream_name = last_stream_inf.get('NAME')
2317 if stream_name:
2318 return stream_name
2319 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2320 # from corresponding rendition group
2321 stream_group_id = last_stream_inf.get('VIDEO')
2322 if not stream_group_id:
2323 return
2324 stream_group = groups.get(stream_group_id)
2325 if not stream_group:
2326 return stream_group_id
2327 rendition = stream_group[0]
2328 return rendition.get('NAME') or stream_group_id
2329
379306ef 2330 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2331 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2332 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2333 for line in m3u8_doc.splitlines():
2334 if line.startswith('#EXT-X-MEDIA:'):
2335 extract_media(line)
2336
704df56d
PH
2337 for line in m3u8_doc.splitlines():
2338 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2339 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2340 elif line.startswith('#') or not line.strip():
2341 continue
2342 else:
9c99bef7 2343 tbr = float_or_none(
3089bc74
S
2344 last_stream_inf.get('AVERAGE-BANDWIDTH')
2345 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2346 manifest_url = format_url(line.strip())
5ef62fc4 2347
60755938 2348 for idx in _extract_m3u8_playlist_indices(manifest_url):
2349 format_id = [m3u8_id, None, idx]
310c2ed2 2350 # Bandwidth of live streams may differ over time, thus making
2351 # format_id unpredictable. So it's better to keep the provided
2352 # format_id intact.
2353 if not live:
60755938 2354 stream_name = build_stream_name()
34921b43 2355 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2356 f = {
34921b43 2357 'format_id': join_nonempty(*format_id),
60755938 2358 'format_index': idx,
310c2ed2 2359 'url': manifest_url,
2360 'manifest_url': m3u8_url,
2361 'tbr': tbr,
2362 'ext': ext,
2363 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2364 'protocol': entry_protocol,
2365 'preference': preference,
2366 'quality': quality,
2367 }
2368 resolution = last_stream_inf.get('RESOLUTION')
2369 if resolution:
2370 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2371 if mobj:
2372 f['width'] = int(mobj.group('width'))
2373 f['height'] = int(mobj.group('height'))
2374 # Unified Streaming Platform
2375 mobj = re.search(
2376 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2377 if mobj:
2378 abr, vbr = mobj.groups()
2379 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2380 f.update({
2381 'vbr': vbr,
2382 'abr': abr,
2383 })
2384 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2385 f.update(codecs)
2386 audio_group_id = last_stream_inf.get('AUDIO')
2387 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2388 # references a rendition group MUST have a CODECS attribute.
62b58c09 2389 # However, this is not always respected. E.g. [2]
310c2ed2 2390 # contains EXT-X-STREAM-INF tag which references AUDIO
2391 # rendition group but does not have CODECS and despite
2392 # referencing an audio group it represents a complete
2393 # (with audio and video) format. So, for such cases we will
2394 # ignore references to rendition groups and treat them
2395 # as complete formats.
2396 if audio_group_id and codecs and f.get('vcodec') != 'none':
2397 audio_group = groups.get(audio_group_id)
2398 if audio_group and audio_group[0].get('URI'):
2399 # TODO: update acodec for audio only formats with
2400 # the same GROUP-ID
2401 f['acodec'] = 'none'
fc21af50 2402 if not f.get('ext'):
2403 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2404 formats.append(f)
2405
2406 # for DailyMotion
2407 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2408 if progressive_uri:
2409 http_f = f.copy()
2410 del http_f['manifest_url']
2411 http_f.update({
2412 'format_id': f['format_id'].replace('hls-', 'http-'),
2413 'protocol': 'http',
2414 'url': progressive_uri,
2415 })
2416 formats.append(http_f)
5ef62fc4 2417
cb252080 2418 last_stream_inf = {}
a0c3b2d5 2419 return formats, subtitles
704df56d 2420
3cf4b91d
C
2421 def _extract_m3u8_vod_duration(
2422 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2423
2424 m3u8_vod = self._download_webpage(
2425 m3u8_vod_url, video_id,
2426 note='Downloading m3u8 VOD manifest' if note is None else note,
2427 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2428 fatal=False, data=data, headers=headers, query=query)
2429
2430 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2431
2432 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2433 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2434 return None
2435
2436 return int(sum(
2437 float(line[len('#EXTINF:'):].split(',')[0])
2438 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2439
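# Illustrative sketch of what the EXTINF summing above amounts to: for a
# hypothetical VOD playlist containing
#
#     #EXT-X-PLAYLIST-TYPE:VOD
#     #EXTINF:10.0,
#     seg1.ts
#     #EXTINF:4.5,
#     seg2.ts
#
# the parsed duration would be int(10.0 + 4.5) == 14.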
a107193e
S
2440 @staticmethod
2441 def _xpath_ns(path, namespace=None):
2442 if not namespace:
2443 return path
2444 out = []
2445 for c in path.split('/'):
2446 if not c or c == '.':
2447 out.append(c)
2448 else:
2449 out.append('{%s}%s' % (namespace, c))
2450 return '/'.join(out)
2451
da1c94ee 2452 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
a076c1f9
E
2453 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2454 if res is False:
995029a1 2455 assert not fatal
774a46c5 2456 return [], {}
e89a2aab 2457
a076c1f9
E
2458 smil, urlh = res
2459 smil_url = urlh.geturl()
2460
17712eeb 2461 namespace = self._parse_smil_namespace(smil)
a107193e 2462
da1c94ee 2463 fmts = self._parse_smil_formats(
a107193e 2464 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2465 subs = self._parse_smil_subtitles(
2466 smil, namespace=namespace)
2467
2468 return fmts, subs
2469
2470 def _extract_smil_formats(self, *args, **kwargs):
2471 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2472 if subs:
b5ae35ee 2473 self._report_ignoring_subs('SMIL')
da1c94ee 2474 return fmts
a107193e
S
2475
2476 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
a076c1f9
E
2477 res = self._download_smil(smil_url, video_id, fatal=fatal)
2478 if res is False:
a107193e 2479 return {}
a076c1f9
E
2480
2481 smil, urlh = res
2482 smil_url = urlh.geturl()
2483
a107193e
S
2484 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2485
09f572fb 2486 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a076c1f9 2487 return self._download_xml_handle(
a107193e 2488 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2489 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2490
2491 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2492 namespace = self._parse_smil_namespace(smil)
a107193e
S
2493
2494 formats = self._parse_smil_formats(
2495 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2496 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2497
2498 video_id = os.path.splitext(url_basename(smil_url))[0]
2499 title = None
2500 description = None
647eab45 2501 upload_date = None
a107193e
S
2502 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2503 name = meta.attrib.get('name')
2504 content = meta.attrib.get('content')
2505 if not name or not content:
2506 continue
2507 if not title and name == 'title':
2508 title = content
2509 elif not description and name in ('description', 'abstract'):
2510 description = content
647eab45
S
2511 elif not upload_date and name == 'date':
2512 upload_date = unified_strdate(content)
a107193e 2513
1e5bcdec
S
2514 thumbnails = [{
2515 'id': image.get('type'),
2516 'url': image.get('src'),
2517 'width': int_or_none(image.get('width')),
2518 'height': int_or_none(image.get('height')),
2519 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2520
a107193e
S
2521 return {
2522 'id': video_id,
2523 'title': title or video_id,
2524 'description': description,
647eab45 2525 'upload_date': upload_date,
1e5bcdec 2526 'thumbnails': thumbnails,
a107193e
S
2527 'formats': formats,
2528 'subtitles': subtitles,
2529 }
2530
17712eeb
S
2531 def _parse_smil_namespace(self, smil):
2532 return self._search_regex(
2533 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
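# Illustrative editor's note (hypothetical document, not part of the original source):
# for a root tag '{http://www.w3.org/2001/SMIL20/Language}smil' the regex above
# captures 'http://www.w3.org/2001/SMIL20/Language'; for an un-namespaced <smil>
# root it falls back to the default (None).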
2534
f877c6ae 2535 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
2536 base = smil_url
2537 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2538 b = meta.get('base') or meta.get('httpBase')
2539 if b:
2540 base = b
2541 break
e89a2aab
S
2542
2543 formats = []
2544 rtmp_count = 0
a107193e 2545 http_count = 0
7f32e5dc 2546 m3u8_count = 0
9359f3d4 2547 imgs_count = 0
a107193e 2548
9359f3d4 2549 srcs = set()
ad96b4c8
YCH
2550 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2551 for medium in media:
2552 src = medium.get('src')
81e1c4e2 2553 if not src or src in srcs:
a107193e 2554 continue
9359f3d4 2555 srcs.add(src)
a107193e 2556
ad96b4c8
YCH
2557 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2558 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2559 width = int_or_none(medium.get('width'))
2560 height = int_or_none(medium.get('height'))
2561 proto = medium.get('proto')
2562 ext = medium.get('ext')
a107193e 2563 src_ext = determine_ext(src)
ad96b4c8 2564 streamer = medium.get('streamer') or base
a107193e
S
2565
2566 if proto == 'rtmp' or streamer.startswith('rtmp'):
2567 rtmp_count += 1
2568 formats.append({
2569 'url': streamer,
2570 'play_path': src,
2571 'ext': 'flv',
2572 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2573 'tbr': bitrate,
2574 'filesize': filesize,
2575 'width': width,
2576 'height': height,
2577 })
f877c6ae
YCH
2578 if transform_rtmp_url:
2579 streamer, src = transform_rtmp_url(streamer, src)
2580 formats[-1].update({
2581 'url': streamer,
2582 'play_path': src,
2583 })
a107193e
S
2584 continue
2585
14f25df2 2586 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
c349456e 2587 src_url = src_url.strip()
a107193e
S
2588
2589 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 2590 m3u8_formats = self._extract_m3u8_formats(
2591 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2592 if len(m3u8_formats) == 1:
2593 m3u8_count += 1
2594 m3u8_formats[0].update({
2595 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2596 'tbr': bitrate,
2597 'width': width,
2598 'height': height,
2599 })
2600 formats.extend(m3u8_formats)
bd21ead2 2601 elif src_ext == 'f4m':
a107193e
S
2602 f4m_url = src_url
2603 if not f4m_params:
2604 f4m_params = {
2605 'hdcore': '3.2.0',
2606 'plugin': 'flowplayer-3.2.0.1',
2607 }
2608 f4m_url += '&' if '?' in f4m_url else '?'
14f25df2 2609 f4m_url += urllib.parse.urlencode(f4m_params)
7e5edcfd 2610 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
bd21ead2
RA
2611 elif src_ext == 'mpd':
2612 formats.extend(self._extract_mpd_formats(
2613 src_url, video_id, mpd_id='dash', fatal=False))
2614 elif re.search(r'\.ism/[Mm]anifest', src_url):
2615 formats.extend(self._extract_ism_formats(
2616 src_url, video_id, ism_id='mss', fatal=False))
2617 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
2618 http_count += 1
2619 formats.append({
2620 'url': src_url,
2621 'ext': ext or src_ext or 'flv',
2622 'format_id': 'http-%d' % (bitrate or http_count),
2623 'tbr': bitrate,
2624 'filesize': filesize,
2625 'width': width,
2626 'height': height,
2627 })
63757032 2628
9359f3d4
F
2629 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2630 src = medium.get('src')
2631 if not src or src in srcs:
2632 continue
2633 srcs.add(src)
2634
2635 imgs_count += 1
2636 formats.append({
 2637 'format_id': 'imagestream-%d' % imgs_count,
2638 'url': src,
2639 'ext': mimetype2ext(medium.get('type')),
2640 'acodec': 'none',
2641 'vcodec': 'none',
2642 'width': int_or_none(medium.get('width')),
2643 'height': int_or_none(medium.get('height')),
2644 'format_note': 'SMIL storyboards',
2645 })
2646
e89a2aab
S
2647 return formats
2648
ce00af87 2649 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2650 urls = []
a107193e
S
2651 subtitles = {}
2652 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2653 src = textstream.get('src')
d413095f 2654 if not src or src in urls:
a107193e 2655 continue
d413095f 2656 urls.append(src)
df634be2 2657 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2658 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2659 subtitles.setdefault(lang, []).append({
2660 'url': src,
2661 'ext': ext,
2662 })
2663 return subtitles
63757032 2664
47a5cb77 2665 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
a076c1f9 2666 res = self._download_xml_handle(
47a5cb77 2667 xspf_url, playlist_id, 'Downloading xspf playlist',
942acef5 2668 'Unable to download xspf manifest', fatal=fatal)
a076c1f9 2669 if res is False:
942acef5 2670 return []
a076c1f9
E
2671
2672 xspf, urlh = res
2673 xspf_url = urlh.geturl()
2674
47a5cb77
S
2675 return self._parse_xspf(
2676 xspf, playlist_id, xspf_url=xspf_url,
2677 xspf_base_url=base_url(xspf_url))
8d6765cf 2678
47a5cb77 2679 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2680 NS_MAP = {
2681 'xspf': 'http://xspf.org/ns/0/',
2682 's1': 'http://static.streamone.nl/player/ns/0',
2683 }
2684
2685 entries = []
47a5cb77 2686 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2687 title = xpath_text(
98044462 2688 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2689 description = xpath_text(
2690 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2691 thumbnail = xpath_text(
2692 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2693 duration = float_or_none(
2694 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2695
47a5cb77
S
2696 formats = []
2697 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2698 format_url = urljoin(xspf_base_url, location.text)
2699 if not format_url:
2700 continue
2701 formats.append({
2702 'url': format_url,
2703 'manifest_url': xspf_url,
2704 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2705 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2706 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2707 })
8d6765cf
S
2708 self._sort_formats(formats)
2709
2710 entries.append({
2711 'id': playlist_id,
2712 'title': title,
2713 'description': description,
2714 'thumbnail': thumbnail,
2715 'duration': duration,
2716 'formats': formats,
2717 })
2718 return entries
2719
171e59ed
F
2720 def _extract_mpd_formats(self, *args, **kwargs):
2721 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2722 if subs:
b5ae35ee 2723 self._report_ignoring_subs('DASH')
171e59ed
F
2724 return fmts
2725
2726 def _extract_mpd_formats_and_subtitles(
2727 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2728 fatal=True, data=None, headers={}, query={}):
47a5cb77 2729 res = self._download_xml_handle(
1bac3455 2730 mpd_url, video_id,
37a3bb66 2731 note='Downloading MPD manifest' if note is None else note,
2732 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2733 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2734 if res is False:
171e59ed 2735 return [], {}
47a5cb77 2736 mpd_doc, urlh = res
c25720ef 2737 if mpd_doc is None:
171e59ed 2738 return [], {}
779da8e3
E
2739
 2740 # We could have been redirected to a new URL when we retrieved our MPD file.
2741 mpd_url = urlh.geturl()
2742 mpd_base_url = base_url(mpd_url)
1bac3455 2743
171e59ed 2744 return self._parse_mpd_formats_and_subtitles(
545cc85d 2745 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2746
171e59ed
F
2747 def _parse_mpd_formats(self, *args, **kwargs):
2748 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2749 if subs:
b5ae35ee 2750 self._report_ignoring_subs('DASH')
171e59ed
F
2751 return fmts
2752
2753 def _parse_mpd_formats_and_subtitles(
2754 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2755 """
2756 Parse formats from MPD manifest.
2757 References:
2758 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2759 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2760 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2761 """
a06916d9 2762 if not self.get_param('dynamic_mpd', True):
78895bd3 2763 if mpd_doc.get('type') == 'dynamic':
171e59ed 2764 return [], {}
2d2fa82d 2765
91cb6b50 2766 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2767
2768 def _add_ns(path):
2769 return self._xpath_ns(path, namespace)
2770
675d0016 2771 def is_drm_protected(element):
2772 return element.find(_add_ns('ContentProtection')) is not None
2773
1bac3455 2774 def extract_multisegment_info(element, ms_parent_info):
2775 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2776
2777 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 2778 # common attributes and elements. We will only extract what is
 2779 # relevant for us.
2780 def extract_common(source):
2781 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2782 if segment_timeline is not None:
2783 s_e = segment_timeline.findall(_add_ns('S'))
2784 if s_e:
2785 ms_info['total_number'] = 0
2786 ms_info['s'] = []
2787 for s in s_e:
2788 r = int(s.get('r', 0))
2789 ms_info['total_number'] += 1 + r
2790 ms_info['s'].append({
2791 't': int(s.get('t', 0)),
2792 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2793 'd': int(s.attrib['d']),
2794 'r': r,
2795 })
2796 start_number = source.get('startNumber')
2797 if start_number:
2798 ms_info['start_number'] = int(start_number)
2799 timescale = source.get('timescale')
2800 if timescale:
2801 ms_info['timescale'] = int(timescale)
2802 segment_duration = source.get('duration')
2803 if segment_duration:
48504785 2804 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2805
2806 def extract_Initialization(source):
2807 initialization = source.find(_add_ns('Initialization'))
2808 if initialization is not None:
2809 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2810
f14be228 2811 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2812 if segment_list is not None:
b4c1d6e8
S
2813 extract_common(segment_list)
2814 extract_Initialization(segment_list)
f14be228 2815 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2816 if segment_urls_e:
2817 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2818 else:
f14be228 2819 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2820 if segment_template is not None:
b4c1d6e8 2821 extract_common(segment_template)
e228616c
S
2822 media = segment_template.get('media')
2823 if media:
2824 ms_info['media'] = media
1bac3455 2825 initialization = segment_template.get('initialization')
2826 if initialization:
e228616c 2827 ms_info['initialization'] = initialization
1bac3455 2828 else:
b4c1d6e8 2829 extract_Initialization(segment_template)
1bac3455 2830 return ms_info
b323e170 2831
1bac3455 2832 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
6251555f 2833 formats, subtitles = [], {}
234416e4 2834 stream_numbers = collections.defaultdict(int)
f14be228 2835 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2836 period_duration = parse_duration(period.get('duration')) or mpd_duration
2837 period_ms_info = extract_multisegment_info(period, {
2838 'start_number': 1,
2839 'timescale': 1,
2840 })
f14be228 2841 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2842 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2843 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2844 representation_attrib = adaptation_set.attrib.copy()
2845 representation_attrib.update(representation.attrib)
f0948348 2846 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2847 mime_type = representation_attrib['mimeType']
171e59ed
F
2848 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2849
21633673 2850 codec_str = representation_attrib.get('codecs', '')
2851 # Some kind of binary subtitle found in some youtube livestreams
2852 if mime_type == 'application/x-rawcc':
2853 codecs = {'scodec': codec_str}
2854 else:
2855 codecs = parse_codecs(codec_str)
be2fc5b2 2856 if content_type not in ('video', 'audio', 'text'):
2857 if mime_type == 'image/jpeg':
a8731fcc 2858 content_type = mime_type
21633673 2859 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2860 content_type = 'video'
21633673 2861 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2862 content_type = 'audio'
3fe75fdc 2863 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2864 content_type = 'text'
6993f78d 2865 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2866 content_type = 'text'
cdb19aa4 2867 else:
be2fc5b2 2868 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2869 continue
2870
2871 base_url = ''
2872 for element in (representation, adaptation_set, period, mpd_doc):
2873 base_url_e = element.find(_add_ns('BaseURL'))
47046464 2874 if try_call(lambda: base_url_e.text) is not None:
be2fc5b2 2875 base_url = base_url_e.text + base_url
2876 if re.match(r'^https?://', base_url):
2877 break
f9cc0161 2878 if mpd_base_url and base_url.startswith('/'):
14f25df2 2879 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
f9cc0161
D
2880 elif mpd_base_url and not re.match(r'^https?://', base_url):
2881 if not mpd_base_url.endswith('/'):
be2fc5b2 2882 mpd_base_url += '/'
2883 base_url = mpd_base_url + base_url
2884 representation_id = representation_attrib.get('id')
2885 lang = representation_attrib.get('lang')
2886 url_el = representation.find(_add_ns('BaseURL'))
2887 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2888 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2889 if representation_id is not None:
2890 format_id = representation_id
2891 else:
2892 format_id = content_type
2893 if mpd_id:
2894 format_id = mpd_id + '-' + format_id
2895 if content_type in ('video', 'audio'):
2896 f = {
2897 'format_id': format_id,
2898 'manifest_url': mpd_url,
2899 'ext': mimetype2ext(mime_type),
2900 'width': int_or_none(representation_attrib.get('width')),
2901 'height': int_or_none(representation_attrib.get('height')),
2902 'tbr': float_or_none(bandwidth, 1000),
2903 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2904 'fps': int_or_none(representation_attrib.get('frameRate')),
2905 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2906 'format_note': 'DASH %s' % content_type,
2907 'filesize': filesize,
2908 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2909 **codecs
be2fc5b2 2910 }
be2fc5b2 2911 elif content_type == 'text':
2912 f = {
2913 'ext': mimetype2ext(mime_type),
2914 'manifest_url': mpd_url,
2915 'filesize': filesize,
2916 }
2917 elif content_type == 'image/jpeg':
2918 # See test case in VikiIE
2919 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2920 f = {
2921 'format_id': format_id,
2922 'ext': 'mhtml',
2923 'manifest_url': mpd_url,
2924 'format_note': 'DASH storyboards (jpeg)',
2925 'acodec': 'none',
2926 'vcodec': 'none',
2927 }
88acdbc2 2928 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2929 f['has_drm'] = True
be2fc5b2 2930 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2931
2932 def prepare_template(template_name, identifiers):
2933 tmpl = representation_ms_info[template_name]
0cb0fdbb 2934 if representation_id is not None:
2935 tmpl = tmpl.replace('$RepresentationID$', representation_id)
be2fc5b2 2936 # First off, % characters outside $...$ templates
 2937 # must be escaped by doubling for proper processing
 2938 # by the % operator string formatting used further below (see
2939 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2940 t = ''
2941 in_template = False
2942 for c in tmpl:
2943 t += c
2944 if c == '$':
2945 in_template = not in_template
2946 elif c == '%' and not in_template:
eca1f0d1 2947 t += c
be2fc5b2 2948 # Next, $...$ templates are translated to their
 2949 # %(...) counterparts to be used with the % operator
be2fc5b2 2950 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2951 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
 2952 t = t.replace('$$', '$')
2953 return t
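# Illustrative editor's note (hypothetical template, not part of the original source):
# with representation_id == 'video=1', a media template such as
#   '$RepresentationID$/seg-$Number%05d$.m4s'
# becomes 'video=1/seg-%(Number)05d.m4s', so that
#   template % {'Number': 5}  ->  'video=1/seg-00005.m4s'
# Literal '%' characters outside $...$ are doubled first so they survive the
# later % formatting.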
2954
2955 # @initialization is a regular template like @media one
2956 # so it should be handled just the same way (see
2957 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2958 if 'initialization' in representation_ms_info:
2959 initialization_template = prepare_template(
2960 'initialization',
2961 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2962 # $Time$ shall not be included for @initialization thus
2963 # only $Bandwidth$ remains
2964 ('Bandwidth', ))
2965 representation_ms_info['initialization_url'] = initialization_template % {
2966 'Bandwidth': bandwidth,
2967 }
2968
2969 def location_key(location):
2970 return 'url' if re.match(r'^https?://', location) else 'path'
2971
2972 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2973
2974 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2975 media_location_key = location_key(media_template)
2976
2977 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2978 # can't be used at the same time
2979 if '%(Number' in media_template and 's' not in representation_ms_info:
2980 segment_duration = None
2981 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2982 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2983 representation_ms_info['total_number'] = int(math.ceil(
2984 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2985 representation_ms_info['fragments'] = [{
2986 media_location_key: media_template % {
2987 'Number': segment_number,
2988 'Bandwidth': bandwidth,
2989 },
2990 'duration': segment_duration,
2991 } for segment_number in range(
2992 representation_ms_info['start_number'],
2993 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
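# Illustrative editor's note (hypothetical values, not part of the original source):
# for a 60 s period with segment_duration == 5.0 and start_number == 1,
# total_number == ceil(60 / 5) == 12, yielding fragments numbered 1..12,
# each 5.0 s long.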
2994 else:
2995 # $Number*$ or $Time$ in media template with S list available
2996 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2997 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2998 representation_ms_info['fragments'] = []
2999 segment_time = 0
3000 segment_d = None
3001 segment_number = representation_ms_info['start_number']
3002
3003 def add_segment_url():
3004 segment_url = media_template % {
3005 'Time': segment_time,
3006 'Bandwidth': bandwidth,
3007 'Number': segment_number,
3008 }
3009 representation_ms_info['fragments'].append({
3010 media_location_key: segment_url,
3011 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
3012 })
3013
3014 for num, s in enumerate(representation_ms_info['s']):
3015 segment_time = s.get('t') or segment_time
3016 segment_d = s['d']
3017 add_segment_url()
3018 segment_number += 1
3019 for r in range(s.get('r', 0)):
3020 segment_time += segment_d
f0948348 3021 add_segment_url()
b4c1d6e8 3022 segment_number += 1
be2fc5b2 3023 segment_time += segment_d
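# Illustrative editor's note (hypothetical values, not part of the original source):
# a SegmentTimeline entry <S t="0" d="5000" r="2"/> with timescale 1000 expands
# to three fragments (Time 0, 5000 and 10000), each 5.0 s long; $Time$ and/or
# $Number$ in the media template are filled in per fragment.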
3024 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
62b58c09
L
3025 # No media template,
3026 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
be2fc5b2 3027 # or any YouTube dashsegments video
3028 fragments = []
3029 segment_index = 0
3030 timescale = representation_ms_info['timescale']
3031 for s in representation_ms_info['s']:
3032 duration = float_or_none(s['d'], timescale)
3033 for r in range(s.get('r', 0) + 1):
3034 segment_uri = representation_ms_info['segment_urls'][segment_index]
3035 fragments.append({
3036 location_key(segment_uri): segment_uri,
3037 'duration': duration,
3038 })
3039 segment_index += 1
3040 representation_ms_info['fragments'] = fragments
3041 elif 'segment_urls' in representation_ms_info:
3042 # Segment URLs with no SegmentTimeline
62b58c09 3043 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
be2fc5b2 3044 # https://github.com/ytdl-org/youtube-dl/pull/14844
3045 fragments = []
3046 segment_duration = float_or_none(
3047 representation_ms_info['segment_duration'],
3048 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3049 for segment_url in representation_ms_info['segment_urls']:
3050 fragment = {
3051 location_key(segment_url): segment_url,
3052 }
3053 if segment_duration:
3054 fragment['duration'] = segment_duration
3055 fragments.append(fragment)
3056 representation_ms_info['fragments'] = fragments
3057 # If there is a fragments key available then we correctly recognized fragmented media.
 3058 # Otherwise we will assume unfragmented media with direct access. Technically, such
 3059 # an assumption is not necessarily correct since we may simply have no support for
3060 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3061 if 'fragments' in representation_ms_info:
3062 f.update({
3063 # NB: mpd_url may be empty when MPD manifest is parsed from a string
3064 'url': mpd_url or base_url,
3065 'fragment_base_url': base_url,
3066 'fragments': [],
3067 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3068 })
3069 if 'initialization_url' in representation_ms_info:
3070 initialization_url = representation_ms_info['initialization_url']
3071 if not f.get('url'):
3072 f['url'] = initialization_url
3073 f['fragments'].append({location_key(initialization_url): initialization_url})
3074 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 3075 if not period_duration:
3076 period_duration = try_get(
3077 representation_ms_info,
3078 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 3079 else:
be2fc5b2 3080 # Assuming direct URL to unfragmented media.
3081 f['url'] = base_url
234416e4 3082 if content_type in ('video', 'audio', 'image/jpeg'):
3083 f['manifest_stream_number'] = stream_numbers[f['url']]
3084 stream_numbers[f['url']] += 1
be2fc5b2 3085 formats.append(f)
3086 elif content_type == 'text':
3087 subtitles.setdefault(lang or 'und', []).append(f)
3088
171e59ed 3089 return formats, subtitles
17b598d3 3090
fd76a142
F
3091 def _extract_ism_formats(self, *args, **kwargs):
3092 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3093 if subs:
b5ae35ee 3094 self._report_ignoring_subs('ISM')
fd76a142
F
3095 return fmts
3096
3097 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 3098 res = self._download_xml_handle(
b2758123 3099 ism_url, video_id,
37a3bb66 3100 note='Downloading ISM manifest' if note is None else note,
3101 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 3102 fatal=fatal, data=data, headers=headers, query=query)
b2758123 3103 if res is False:
fd76a142 3104 return [], {}
47a5cb77 3105 ism_doc, urlh = res
13b08034 3106 if ism_doc is None:
fd76a142 3107 return [], {}
b2758123 3108
fd76a142 3109 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 3110
fd76a142 3111 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
3112 """
3113 Parse formats from ISM manifest.
3114 References:
3115 1. [MS-SSTR]: Smooth Streaming Protocol,
3116 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3117 """
06869367 3118 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 3119 return [], {}
b2758123 3120
b2758123
RA
3121 duration = int(ism_doc.attrib['Duration'])
3122 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3123
3124 formats = []
fd76a142 3125 subtitles = {}
b2758123
RA
3126 for stream in ism_doc.findall('StreamIndex'):
3127 stream_type = stream.get('Type')
fd76a142 3128 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
3129 continue
3130 url_pattern = stream.attrib['Url']
3131 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3132 stream_name = stream.get('Name')
fd76a142 3133 stream_language = stream.get('Language', 'und')
b2758123 3134 for track in stream.findall('QualityLevel'):
81b6102d 3135 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3136 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
b2758123 3137 # TODO: add support for WVC1 and WMAP
81b6102d 3138 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
b2758123
RA
3139 self.report_warning('%s is not a supported codec' % fourcc)
3140 continue
3141 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
3142 # [1] does not mention Width and Height attributes. However,
3143 # they're often present while MaxWidth and MaxHeight are
3144 # missing, so should be used as fallbacks
3145 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3146 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
3147 sampling_rate = int_or_none(track.get('SamplingRate'))
3148
3149 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
14f25df2 3150 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
b2758123
RA
3151
3152 fragments = []
3153 fragment_ctx = {
3154 'time': 0,
3155 }
3156 stream_fragments = stream.findall('c')
3157 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3158 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3159 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3160 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3161 if not fragment_ctx['duration']:
3162 try:
3163 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3164 except IndexError:
3165 next_fragment_time = duration
1616f9b4 3166 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
3167 for _ in range(fragment_repeat):
3168 fragments.append({
14f25df2 3169 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
3170 'duration': fragment_ctx['duration'] / stream_timescale,
3171 })
3172 fragment_ctx['time'] += fragment_ctx['duration']
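# Illustrative editor's note (hypothetical values, not part of the original source):
# a stream chunk <c d="20000000" r="3"/> with a TimeScale of 10000000 expands to
# three fragments of 2.0 s each, whose URLs substitute {start time} with
# 0, 20000000 and 40000000 respectively.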
3173
fd76a142
F
3174 if stream_type == 'text':
3175 subtitles.setdefault(stream_language, []).append({
3176 'ext': 'ismt',
3177 'protocol': 'ism',
3178 'url': ism_url,
3179 'manifest_url': ism_url,
3180 'fragments': fragments,
3181 '_download_params': {
3182 'stream_type': stream_type,
3183 'duration': duration,
3184 'timescale': stream_timescale,
3185 'fourcc': fourcc,
3186 'language': stream_language,
3187 'codec_private_data': track.get('CodecPrivateData'),
3188 }
3189 })
3190 elif stream_type in ('video', 'audio'):
3191 formats.append({
34921b43 3192 'format_id': join_nonempty(ism_id, stream_name, tbr),
fd76a142
F
3193 'url': ism_url,
3194 'manifest_url': ism_url,
3195 'ext': 'ismv' if stream_type == 'video' else 'isma',
3196 'width': width,
3197 'height': height,
3198 'tbr': tbr,
3199 'asr': sampling_rate,
3200 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3201 'acodec': 'none' if stream_type == 'video' else fourcc,
3202 'protocol': 'ism',
3203 'fragments': fragments,
88acdbc2 3204 'has_drm': ism_doc.find('Protection') is not None,
fd76a142
F
3205 '_download_params': {
3206 'stream_type': stream_type,
3207 'duration': duration,
3208 'timescale': stream_timescale,
3209 'width': width or 0,
3210 'height': height or 0,
3211 'fourcc': fourcc,
3212 'language': stream_language,
3213 'codec_private_data': track.get('CodecPrivateData'),
3214 'sampling_rate': sampling_rate,
3215 'channels': int_or_none(track.get('Channels', 2)),
3216 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3217 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3218 },
3219 })
3220 return formats, subtitles
b2758123 3221
079a7cfc 3222 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
6780154e
S
3223 def absolute_url(item_url):
3224 return urljoin(base_url, item_url)
59bbe491 3225
3226 def parse_content_type(content_type):
3227 if not content_type:
3228 return {}
3229 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3230 if ctr:
3231 mimetype, codecs = ctr.groups()
3232 f = parse_codecs(codecs)
3233 f['ext'] = mimetype2ext(mimetype)
3234 return f
3235 return {}
3236
222a2308
L
3237 def _media_formats(src, cur_media_type, type_info=None):
3238 type_info = type_info or {}
520251c0 3239 full_url = absolute_url(src)
82889d4a 3240 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 3241 if ext == 'm3u8':
520251c0
YCH
3242 is_plain_url = False
3243 formats = self._extract_m3u8_formats(
ad120ae1 3244 full_url, video_id, ext='mp4',
eeb0a956 3245 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
f983b875 3246 preference=preference, quality=quality, fatal=False)
87a449c1
S
3247 elif ext == 'mpd':
3248 is_plain_url = False
3249 formats = self._extract_mpd_formats(
b359e977 3250 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
3251 else:
3252 is_plain_url = True
3253 formats = [{
3254 'url': full_url,
3255 'vcodec': 'none' if cur_media_type == 'audio' else None,
222a2308 3256 'ext': ext,
520251c0
YCH
3257 }]
3258 return is_plain_url, formats
3259
59bbe491 3260 entries = []
4328ddf8 3261 # amp-video and amp-audio are very similar to their HTML5 counterparts
962ffcf8 3262 # so we will include them right here (see
4328ddf8 3263 # https://www.ampproject.org/docs/reference/components/amp-video)
29f7c58a 3264 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3265 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3266 media_tags = [(media_tag, media_tag_name, media_type, '')
3267 for media_tag, media_tag_name, media_type
3268 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2aec7256
S
3269 media_tags.extend(re.findall(
3270 # We only allow video|audio followed by a whitespace or '>'.
 3271 # Allowing more characters may result in a significant slowdown (see
62b58c09
L
3272 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3273 # e.g. http://www.porntrex.com/maps/videositemap.xml).
29f7c58a 3274 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3275 for media_tag, _, media_type, media_content in media_tags:
59bbe491 3276 media_info = {
3277 'formats': [],
3278 'subtitles': {},
3279 }
3280 media_attributes = extract_attributes(media_tag)
bfbecd11 3281 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
59bbe491 3282 if src:
222a2308
L
3283 f = parse_content_type(media_attributes.get('type'))
3284 _, formats = _media_formats(src, media_type, f)
520251c0 3285 media_info['formats'].extend(formats)
6780154e 3286 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 3287 if media_content:
3288 for source_tag in re.findall(r'<source[^>]+>', media_content):
d493f15c
S
3289 s_attr = extract_attributes(source_tag)
 3290 # data-video-src and data-src are non-standard but seen
3291 # several times in the wild
bfbecd11 3292 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
59bbe491 3293 if not src:
3294 continue
d493f15c 3295 f = parse_content_type(s_attr.get('type'))
868f79db 3296 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 3297 if is_plain_url:
d493f15c
S
3298 # width, height, res, label and title attributes are
 3299 # all non-standard but seen several times in the wild
3300 labels = [
3301 s_attr.get(lbl)
3302 for lbl in ('label', 'title')
3303 if str_or_none(s_attr.get(lbl))
3304 ]
3305 width = int_or_none(s_attr.get('width'))
3089bc74
S
3306 height = (int_or_none(s_attr.get('height'))
3307 or int_or_none(s_attr.get('res')))
d493f15c
S
3308 if not width or not height:
3309 for lbl in labels:
3310 resolution = parse_resolution(lbl)
3311 if not resolution:
3312 continue
3313 width = width or resolution.get('width')
3314 height = height or resolution.get('height')
3315 for lbl in labels:
3316 tbr = parse_bitrate(lbl)
3317 if tbr:
3318 break
3319 else:
3320 tbr = None
1ed45499 3321 f.update({
d493f15c
S
3322 'width': width,
3323 'height': height,
3324 'tbr': tbr,
3325 'format_id': s_attr.get('label') or s_attr.get('title'),
1ed45499 3326 })
520251c0
YCH
3327 f.update(formats[0])
3328 media_info['formats'].append(f)
3329 else:
3330 media_info['formats'].extend(formats)
59bbe491 3331 for track_tag in re.findall(r'<track[^>]+>', media_content):
3332 track_attributes = extract_attributes(track_tag)
3333 kind = track_attributes.get('kind')
5968d7d2 3334 if not kind or kind in ('subtitles', 'captions'):
f856816b 3335 src = strip_or_none(track_attributes.get('src'))
59bbe491 3336 if not src:
3337 continue
3338 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3339 media_info['subtitles'].setdefault(lang, []).append({
3340 'url': absolute_url(src),
3341 })
5e8e2fa5
S
3342 for f in media_info['formats']:
3343 f.setdefault('http_headers', {})['Referer'] = base_url
5968d7d2 3344 if media_info['formats'] or media_info['subtitles']:
59bbe491 3345 entries.append(media_info)
3346 return entries
3347
f6a1d69a
F
3348 def _extract_akamai_formats(self, *args, **kwargs):
3349 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3350 if subs:
b5ae35ee 3351 self._report_ignoring_subs('akamai')
f6a1d69a
F
3352 return fmts
3353
3354 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3355 signed = 'hdnea=' in manifest_url
3356 if not signed:
3357 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3358 manifest_url = re.sub(
3359 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3360 '', manifest_url).strip('?')
3361
c7c43a93 3362 formats = []
f6a1d69a 3363 subtitles = {}
70c5802b 3364
e71a4509 3365 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3366 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3367 hds_host = hosts.get('hds')
3368 if hds_host:
3369 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3370 if 'hdcore=' not in f4m_url:
3371 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3372 f4m_formats = self._extract_f4m_formats(
3373 f4m_url, video_id, f4m_id='hds', fatal=False)
3374 for entry in f4m_formats:
3375 entry.update({'extra_param_to_segment_url': hdcore_sign})
3376 formats.extend(f4m_formats)
70c5802b 3377
c4251b9a
RA
3378 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3379 hls_host = hosts.get('hls')
3380 if hls_host:
3381 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3382 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3383 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3384 m3u8_id='hls', fatal=False)
3385 formats.extend(m3u8_formats)
f6a1d69a 3386 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3387
3388 http_host = hosts.get('http')
29f7c58a 3389 if http_host and m3u8_formats and not signed:
3390 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3391 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3392 qualities_length = len(qualities)
29f7c58a 3393 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3394 i = 0
29f7c58a 3395 for f in m3u8_formats:
3396 if f['vcodec'] != 'none':
70c5802b 3397 for protocol in ('http', 'https'):
3398 http_f = f.copy()
3399 del http_f['manifest_url']
3400 http_url = re.sub(
86e5f3ed 3401 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3402 http_f.update({
3403 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3404 'url': http_url,
3405 'protocol': protocol,
3406 })
29f7c58a 3407 formats.append(http_f)
70c5802b 3408 i += 1
70c5802b 3409
f6a1d69a 3410 return formats, subtitles
c7c43a93 3411
6ad02195 3412 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3413 query = urllib.parse.urlparse(url).query
6ad02195 3414 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3415 mobj = re.search(
3416 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3417 url_base = mobj.group('url')
3418 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3419 formats = []
044eeb14
S
3420
3421 def manifest_url(manifest):
86e5f3ed 3422 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3423 if query:
3424 m_url += '?%s' % query
3425 return m_url
3426
6ad02195
RA
3427 if 'm3u8' not in skip_protocols:
3428 formats.extend(self._extract_m3u8_formats(
044eeb14 3429 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3430 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3431 if 'f4m' not in skip_protocols:
3432 formats.extend(self._extract_f4m_formats(
044eeb14 3433 manifest_url('manifest.f4m'),
6ad02195 3434 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3435 if 'dash' not in skip_protocols:
3436 formats.extend(self._extract_mpd_formats(
044eeb14 3437 manifest_url('manifest.mpd'),
0384932e 3438 video_id, mpd_id='dash', fatal=False))
6ad02195 3439 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3440 if 'smil' not in skip_protocols:
3441 rtmp_formats = self._extract_smil_formats(
044eeb14 3442 manifest_url('jwplayer.smil'),
6ad02195
RA
3443 video_id, fatal=False)
3444 for rtmp_format in rtmp_formats:
3445 rtsp_format = rtmp_format.copy()
3446 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3447 del rtsp_format['play_path']
3448 del rtsp_format['ext']
3449 rtsp_format.update({
3450 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3451 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3452 'protocol': 'rtsp',
3453 })
3454 formats.extend([rtmp_format, rtsp_format])
3455 else:
3456 for protocol in ('rtmp', 'rtsp'):
3457 if protocol not in skip_protocols:
3458 formats.append({
86e5f3ed 3459 'url': f'{protocol}:{url_base}',
6ad02195
RA
3460 'format_id': protocol,
3461 'protocol': protocol,
3462 })
3463 return formats
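# Illustrative editor's note (hypothetical URL, not part of the original source):
# for 'https://example.com/vod/mp4:clip.mp4/playlist.m3u8' the manifest suffix is
# stripped and, unless listed in skip_protocols, HLS ('playlist.m3u8'),
# HDS ('manifest.f4m') and DASH ('manifest.mpd') manifests are probed from the
# same base URL; RTMP/RTSP formats are derived from 'jwplayer.smil' when the URL
# contains a SMIL hint, and are otherwise added as direct rtmp:/rtsp: URLs.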
3464
c73e330e 3465 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3466 mobj = re.search(
ac9c69ac 3467 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3468 webpage)
3469 if mobj:
c73e330e
RU
3470 try:
3471 jwplayer_data = self._parse_json(mobj.group('options'),
3472 video_id=video_id,
3473 transform_source=transform_source)
3474 except ExtractorError:
3475 pass
3476 else:
3477 if isinstance(jwplayer_data, dict):
3478 return jwplayer_data
a4a554a7
YCH
3479
3480 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3481 jwplayer_data = self._find_jwplayer_data(
3482 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3483 return self._parse_jwplayer_data(
3484 jwplayer_data, video_id, *args, **kwargs)
3485
3486 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3487 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3488 # JWPlayer backward compatibility: flattened playlists
3489 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3490 if 'playlist' not in jwplayer_data:
3491 jwplayer_data = {'playlist': [jwplayer_data]}
3492
3493 entries = []
3494
3495 # JWPlayer backward compatibility: single playlist item
3496 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3497 if not isinstance(jwplayer_data['playlist'], list):
3498 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3499
3500 for video_data in jwplayer_data['playlist']:
3501 # JWPlayer backward compatibility: flattened sources
3502 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3503 if 'sources' not in video_data:
3504 video_data['sources'] = [video_data]
3505
3506 this_video_id = video_id or video_data['mediaid']
3507
1a2192cb
S
3508 formats = self._parse_jwplayer_formats(
3509 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3510 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
3511
3512 subtitles = {}
3513 tracks = video_data.get('tracks')
3514 if tracks and isinstance(tracks, list):
3515 for track in tracks:
96a2daa1
S
3516 if not isinstance(track, dict):
3517 continue
f4b74272 3518 track_kind = track.get('kind')
14f25df2 3519 if not track_kind or not isinstance(track_kind, str):
f4b74272
S
3520 continue
3521 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
3522 continue
3523 track_url = urljoin(base_url, track.get('file'))
3524 if not track_url:
3525 continue
3526 subtitles.setdefault(track.get('label') or 'en', []).append({
3527 'url': self._proto_relative_url(track_url)
3528 })
3529
50d808f5 3530 entry = {
a4a554a7 3531 'id': this_video_id,
50d808f5 3532 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
f81dd65b 3533 'description': clean_html(video_data.get('description')),
6945b9e7 3534 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
a4a554a7
YCH
3535 'timestamp': int_or_none(video_data.get('pubdate')),
3536 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3537 'subtitles': subtitles,
50d808f5
RA
3538 }
3539 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3540 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3541 entry.update({
3542 '_type': 'url_transparent',
3543 'url': formats[0]['url'],
3544 })
3545 else:
3546 self._sort_formats(formats)
3547 entry['formats'] = formats
3548 entries.append(entry)
a4a554a7
YCH
3549 if len(entries) == 1:
3550 return entries[0]
3551 else:
3552 return self.playlist_result(entries)
3553
ed0cf9b3
S
3554 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3555 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
bf1b87cd 3556 urls = []
ed0cf9b3 3557 formats = []
1a2192cb 3558 for source in jwplayer_sources_data:
0a268c6e
S
3559 if not isinstance(source, dict):
3560 continue
6945b9e7
RA
3561 source_url = urljoin(
3562 base_url, self._proto_relative_url(source.get('file')))
3563 if not source_url or source_url in urls:
bf1b87cd
RA
3564 continue
3565 urls.append(source_url)
ed0cf9b3
S
3566 source_type = source.get('type') or ''
3567 ext = mimetype2ext(source_type) or determine_ext(source_url)
3568 if source_type == 'hls' or ext == 'm3u8':
3569 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
3570 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3571 m3u8_id=m3u8_id, fatal=False))
0d9c48de 3572 elif source_type == 'dash' or ext == 'mpd':
ed0cf9b3
S
3573 formats.extend(self._extract_mpd_formats(
3574 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
3575 elif ext == 'smil':
3576 formats.extend(self._extract_smil_formats(
3577 source_url, video_id, fatal=False))
ed0cf9b3 3578 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
3579 elif source_type.startswith('audio') or ext in (
3580 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
3581 formats.append({
3582 'url': source_url,
3583 'vcodec': 'none',
3584 'ext': ext,
3585 })
3586 else:
3587 height = int_or_none(source.get('height'))
3588 if height is None:
3589 # Often no height is provided but there is a label in
0236cd0d 3590 # format like "1080p", "720p SD", or 1080.
ed0cf9b3 3591 height = int_or_none(self._search_regex(
14f25df2 3592 r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
ed0cf9b3
S
3593 'height', default=None))
3594 a_format = {
3595 'url': source_url,
3596 'width': int_or_none(source.get('width')),
3597 'height': height,
d3a3d7f0 3598 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3599 'filesize': int_or_none(source.get('filesize')),
ed0cf9b3
S
3600 'ext': ext,
3601 }
3602 if source_url.startswith('rtmp'):
3603 a_format['ext'] = 'flv'
ed0cf9b3
S
3604 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3605 # of jwplayer.flash.swf
3606 rtmp_url_parts = re.split(
3607 r'((?:mp4|mp3|flv):)', source_url, 1)
3608 if len(rtmp_url_parts) == 3:
3609 rtmp_url, prefix, play_path = rtmp_url_parts
3610 a_format.update({
3611 'url': rtmp_url,
3612 'play_path': prefix + play_path,
3613 })
3614 if rtmp_params:
3615 a_format.update(rtmp_params)
3616 formats.append(a_format)
3617 return formats
3618
f4b1c7ad 3619 def _live_title(self, name):
39ca3b5c 3620 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3621 return name
f4b1c7ad 3622
b14f3a4c
PH
3623 def _int(self, v, name, fatal=False, **kwargs):
3624 res = int_or_none(v, **kwargs)
b14f3a4c 3625 if res is None:
86e5f3ed 3626 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3627 if fatal:
3628 raise ExtractorError(msg)
3629 else:
6a39ee13 3630 self.report_warning(msg)
b14f3a4c
PH
3631 return res
3632
3633 def _float(self, v, name, fatal=False, **kwargs):
3634 res = float_or_none(v, **kwargs)
3635 if res is None:
86e5f3ed 3636 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3637 if fatal:
3638 raise ExtractorError(msg)
3639 else:
6a39ee13 3640 self.report_warning(msg)
b14f3a4c
PH
3641 return res
3642
40e41780
TF
3643 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3644 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3645 cookie = http.cookiejar.Cookie(
4ed2d7b7 3646 0, name, value, port, port is not None, domain, True,
40e41780
TF
3647 domain.startswith('.'), path, True, secure, expire_time,
3648 discard, None, None, rest)
9809740b 3649 self.cookiejar.set_cookie(cookie)
42939b61 3650
799207e8 3651 def _get_cookies(self, url):
ac668111 3652 """ Return a http.cookies.SimpleCookie with the cookies for the url """
8817a80d 3653 return LenientSimpleCookie(self._downloader._calc_cookies(url))
799207e8 3654
e3c1266f 3655 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3656 """
3657 Apply first Set-Cookie header instead of the last. Experimental.
3658
3659 Some sites (e.g. [1-3]) may serve two cookies under the same name
 3660 in the Set-Cookie header and expect the first (old) one to be set rather
 3661 than the second (new) one. However, as per RFC 6265, the newer cookie
 3662 should be set into the cookie store, which is what actually happens.
 3663 We work around this issue by resetting the cookie to
 3664 the first one manually.
3665 1. https://new.vk.com/
3666 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3667 3. https://learning.oreilly.com/
3668 """
e3c1266f
S
3669 for header, cookies in url_handle.headers.items():
3670 if header.lower() != 'set-cookie':
3671 continue
cfb0511d 3672 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3673 cookie_value = re.search(
3674 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3675 if cookie_value:
3676 value, domain = cookie_value.groups()
3677 self._set_cookie(domain, cookie, value)
3678 break
3679
82d02080 3680 @classmethod
3681 def get_testcases(cls, include_onlymatching=False):
6368e2e6 3682 # Do not look in super classes
3683 t = vars(cls).get('_TEST')
05900629 3684 if t:
82d02080 3685 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
05900629
PH
3686 tests = [t]
3687 else:
6368e2e6 3688 tests = vars(cls).get('_TESTS', [])
05900629
PH
3689 for t in tests:
3690 if not include_onlymatching and t.get('only_matching', False):
3691 continue
82d02080 3692 t['name'] = cls.ie_key()
05900629
PH
3693 yield t
3694
f2e8dbcc 3695 @classmethod
3696 def get_webpage_testcases(cls):
6368e2e6 3697 tests = vars(cls).get('_WEBPAGE_TESTS', [])
f2e8dbcc 3698 for t in tests:
3699 t['name'] = cls.ie_key()
3700 return tests
3701
6368e2e6 3702 @classproperty(cache=True)
24146491 3703 def age_limit(cls):
3704 """Get age limit from the testcases"""
3705 return max(traverse_obj(
f2e8dbcc 3706 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
24146491 3707 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3708
171a31db 3709 @classproperty(cache=True)
3710 def _RETURN_TYPE(cls):
3711 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3712 tests = tuple(cls.get_testcases(include_onlymatching=False))
3713 if not tests:
3714 return None
3715 elif not any(k.startswith('playlist') for test in tests for k in test):
3716 return 'video'
3717 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3718 return 'playlist'
3719 return 'any'
3720
3721 @classmethod
3722 def is_single_video(cls, url):
3723 """Returns whether the URL is of a single video, None if unknown"""
3724 assert cls.suitable(url), 'The URL must be suitable for the extractor'
3725 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3726
82d02080 3727 @classmethod
3728 def is_suitable(cls, age_limit):
24146491 3729 """Test whether the extractor is generally suitable for the given age limit"""
3730 return not age_restricted(cls.age_limit, age_limit)
05900629 3731
82d02080 3732 @classmethod
3733 def description(cls, *, markdown=True, search_examples=None):
8dcce6a8 3734 """Description of the extractor"""
3735 desc = ''
82d02080 3736 if cls._NETRC_MACHINE:
8dcce6a8 3737 if markdown:
82d02080 3738 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
8dcce6a8 3739 else:
82d02080 3740 desc += f' [{cls._NETRC_MACHINE}]'
3741 if cls.IE_DESC is False:
8dcce6a8 3742 desc += ' [HIDDEN]'
82d02080 3743 elif cls.IE_DESC:
3744 desc += f' {cls.IE_DESC}'
3745 if cls.SEARCH_KEY:
3746 desc += f'; "{cls.SEARCH_KEY}:" prefix'
8dcce6a8 3747 if search_examples:
3748 _COUNTS = ('', '5', '10', 'all')
62b58c09 3749 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
82d02080 3750 if not cls.working():
8dcce6a8 3751 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3752
46d09f87 3753 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3754 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
8dcce6a8 3755 return f'{name}:{desc}' if desc else name
3756
a504ced0 3757 def extract_subtitles(self, *args, **kwargs):
a06916d9 3758 if (self.get_param('writesubtitles', False)
3759 or self.get_param('listsubtitles')):
9868ea49
JMF
3760 return self._get_subtitles(*args, **kwargs)
3761 return {}
a504ced0
JMF
3762
3763 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3764 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3765
0cf643b2
M
3766 class CommentsDisabled(Exception):
3767 """Raise in _get_comments if comments are disabled for the video"""
3768
a2160aa4 3769 def extract_comments(self, *args, **kwargs):
3770 if not self.get_param('getcomments'):
3771 return None
3772 generator = self._get_comments(*args, **kwargs)
3773
3774 def extractor():
3775 comments = []
d2b2fca5 3776 interrupted = True
a2160aa4 3777 try:
3778 while True:
3779 comments.append(next(generator))
a2160aa4 3780 except StopIteration:
3781 interrupted = False
d2b2fca5 3782 except KeyboardInterrupt:
3783 self.to_screen('Interrupted by user')
0cf643b2
M
3784 except self.CommentsDisabled:
3785 return {'comments': None, 'comment_count': None}
d2b2fca5 3786 except Exception as e:
3787 if self.get_param('ignoreerrors') is not True:
3788 raise
3789 self._downloader.report_error(e)
a2160aa4 3790 comment_count = len(comments)
3791 self.to_screen(f'Extracted {comment_count} comments')
3792 return {
3793 'comments': comments,
3794 'comment_count': None if interrupted else comment_count
3795 }
3796 return extractor
3797
3798 def _get_comments(self, *args, **kwargs):
3799 raise NotImplementedError('This method must be implemented by subclasses')
3800
912e0b7e
YCH
3801 @staticmethod
3802 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3803 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3804 will be dropped. """
86e5f3ed 3805 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3806 ret = list(subtitle_list1)
a44ca5a4 3807 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3808 return ret
3809
3810 @classmethod
46890374 3811 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3812 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3813 if target is None:
3814 target = {}
3815 for d in dicts:
3816 for lang, subs in d.items():
3817 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3818 return target
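# Illustrative editor's note (hypothetical values, not part of the original source):
#   _merge_subtitles({'en': [a]}, {'en': [b], 'fr': [c]})
# produces {'en': [a, b], 'fr': [c]}; within each language, entries whose
# (url, data) pair already occurs in the earlier list are dropped.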
912e0b7e 3819
360e1ca5 3820 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3821 if (self.get_param('writeautomaticsub', False)
3822 or self.get_param('listsubtitles')):
9868ea49
JMF
3823 return self._get_automatic_captions(*args, **kwargs)
3824 return {}
360e1ca5
JMF
3825
3826 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3827 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3828
2762dbb1 3829 @functools.cached_property
24146491 3830 def _cookies_passed(self):
3831 """Whether cookies have been passed to YoutubeDL"""
3832 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3833
d77ab8e2 3834 def mark_watched(self, *args, **kwargs):
1813a6cc 3835 if not self.get_param('mark_watched', False):
3836 return
24146491 3837 if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
d77ab8e2
S
3838 self._mark_watched(*args, **kwargs)
3839
3840 def _mark_watched(self, *args, **kwargs):
3841 raise NotImplementedError('This method must be implemented by subclasses')
3842
38cce791
YCH
3843 def geo_verification_headers(self):
3844 headers = {}
a06916d9 3845 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791 3846 if geo_verification_proxy:
3847 headers['Ytdl-request-proxy'] = geo_verification_proxy
3848 return headers
3849
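# Usage sketch, inside a hypothetical _real_extract(): extractors typically attach these
# headers to the initial page or API request so that a configured geo-verification proxy
# is used for that request.
webpage = self._download_webpage(
    url, video_id, headers=self.geo_verification_headers())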
8f97a15d 3850 @staticmethod
3851 def _generic_id(url):
14f25df2 3852 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3853
62b8dac4 3854 def _generic_title(self, url='', webpage='', *, default=None):
3855 return (self._og_search_title(webpage, default=None)
3856 or self._html_extract_title(webpage, default=None)
3857 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3858 or default)
98763ee3 3859
c224251a 3860 @staticmethod
b0089e89 3861 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a 3862 all_known = all(map(
3863 lambda x: x is not None,
3864 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3865 return (
3866 'private' if is_private
3867 else 'premium_only' if needs_premium
3868 else 'subscriber_only' if needs_subscription
3869 else 'needs_auth' if needs_auth
3870 else 'unlisted' if is_unlisted
3871 else 'public' if all_known
3872 else None)
3873
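# Worked examples for _availability(): the first truthy restriction wins, in the order
# shown above, and 'public' is returned only when every flag is explicitly known (not None).
InfoExtractor._availability(needs_auth=True)                        # -> 'needs_auth'
InfoExtractor._availability(is_private=False, needs_premium=False,
                            needs_subscription=False, needs_auth=False,
                            is_unlisted=True)                       # -> 'unlisted'
InfoExtractor._availability(is_private=False, needs_premium=False,
                            needs_subscription=False, needs_auth=False,
                            is_unlisted=False)                      # -> 'public'
InfoExtractor._availability(is_private=False)                       # -> None (not all known)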
d43de682 3874 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
4bb6b02f 3875 '''
3876 @returns A list of values for the extractor argument given by "key"
3877 or "default" if no such key is present
3878 @param default The default value to return when the key is not present (default: [])
3879 @param casesense When false, the values are converted to lower case
3880 '''
5225df50 3881 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3882 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
4bb6b02f 3883 if val is None:
3884 return [] if default is NO_DEFAULT else default
3885 return list(val) if casesense else [x.lower() for x in val]
5d3a0e79 3886
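# Usage sketch: with `--extractor-args "foo:quality=HD,SD"` on the command line, an
# extractor whose ie_key() is 'Foo' would observe the following (names are hypothetical):
quality = self._configuration_arg('quality', ['auto'])
# -> ['hd', 'sd'] (lower-cased because casesense=False); ['auto'] if the arg was not given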
f40ee5e9 3887 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3888 if not playlist_id or not video_id:
3889 return not video_id
3890
3891 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3892 if no_playlist is not None:
3893 return not no_playlist
3894
3895 video_id = '' if video_id is True else f' {video_id}'
3896 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3897 if self.get_param('noplaylist'):
3898 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3899 return False
3900 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3901 return True
3902
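# Usage sketch inside a hypothetical _real_extract() of an extractor whose URLs carry both
# a playlist id and a video id; the regex group names and _playlist_entries helper are
# illustrative only.
playlist_id, video_id = self._match_valid_url(url).group('playlist_id', 'video_id')
if self._yes_playlist(playlist_id, video_id):
    return self.playlist_result(self._playlist_entries(playlist_id), playlist_id)
return self.url_result(f'https://example.com/watch/{video_id}', video_id=video_id)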
be5c1ae8 3903 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
8ca48a1a 3904 RetryManager.report_retry(
3905 err, _count or int(fatal), _retries,
3906 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3907 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
be5c1ae8 3908
3909 def RetryManager(self, **kwargs):
3910 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3911
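# Usage sketch of the retry helper above: assigning to retry.error reports the failure
# and, while attempts remain, runs the loop body again; once retries are exhausted the
# error is re-raised (or downgraded to a warning with fatal=False). The URL is hypothetical.
for retry in self.RetryManager():
    try:
        data = self._download_json(f'https://example.com/api/videos/{video_id}', video_id)
    except ExtractorError as e:
        retry.error = e
        continue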
ade1fa70 3912 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3913 display_id = traverse_obj(info_dict, 'display_id', 'id')
3914 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3915 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3916 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3917
8f97a15d 3918 @classmethod
3919 def extract_from_webpage(cls, ydl, url, webpage):
3920 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3921 else ydl.get_info_extractor(cls.ie_key()))
f2e8dbcc 3922 for info in ie._extract_from_webpage(url, webpage) or []:
3923 # url = None since we do not want to set (webpage/original)_url
3924 ydl.add_default_extra_info(info, ie, None)
3925 yield info
8f97a15d 3926
3927 @classmethod
3928 def _extract_from_webpage(cls, url, webpage):
3929 for embed_url in orderedSet(
3930 cls._extract_embed_urls(url, webpage) or [], lazy=True):
d2c8aadf 3931 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
8f97a15d 3932
3933 @classmethod
3934 def _extract_embed_urls(cls, url, webpage):
3935 """@returns all the embed urls on the webpage"""
3936 if '_EMBED_URL_RE' not in cls.__dict__:
3937 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3938 for idx, regex in enumerate(cls._EMBED_REGEX):
3939 assert regex.count('(?P<url>') == 1, \
3940 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3941 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3942
3943 for regex in cls._EMBED_URL_RE:
3944 for mobj in regex.finditer(webpage):
3945 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3946 if cls._VALID_URL is False or cls.suitable(embed_url):
3947 yield embed_url
3948
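# Minimal sketch of how an extractor advertises its embeds: each pattern in _EMBED_REGEX
# must contain exactly one (?P<url>...) group, as asserted above. The iframe host is
# hypothetical.
class FooEmbedIE(InfoExtractor):
    _VALID_URL = r'https?://player\.example\.com/embed/(?P<id>\w+)'
    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://player\.example\.com/embed/\w+)']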
3949 class StopExtraction(Exception):
3950 pass
3951
bfd973ec 3952 @classmethod
3953 def _extract_url(cls, webpage): # TODO: Remove
3954 """Only for compatibility with some older extractors"""
3955 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3956
2314b4d8 3957 @classmethod
3958 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3959 if plugin_name:
3960 mro = inspect.getmro(cls)
3961 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3962 cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
3963 while getattr(super_class, '__wrapped__', None):
3964 super_class = super_class.__wrapped__
3965 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3966
3967 return super().__init_subclass__(**kwargs)
3968
8dbe9899 3969
d6983cb4 3970class SearchInfoExtractor(InfoExtractor):
3971 """
 3972 Base class for paged search query extractors.
10952eb2 3973 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
96565c7e 3974 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
d6983cb4 3975 """
3976
96565c7e 3977 _MAX_RESULTS = float('inf')
171a31db 3978 _RETURN_TYPE = 'playlist'
96565c7e 3979
8f97a15d 3980 @classproperty
3981 def _VALID_URL(cls):
d6983cb4 3982 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3983
d6983cb4 3984 def _real_extract(self, query):
2c4aaadd 3985 prefix, query = self._match_valid_url(query).group('prefix', 'query')
d6983cb4 3986 if prefix == '':
3987 return self._get_n_results(query, 1)
3988 elif prefix == 'all':
3989 return self._get_n_results(query, self._MAX_RESULTS)
3990 else:
3991 n = int(prefix)
3992 if n <= 0:
86e5f3ed 3993 raise ExtractorError(f'invalid download number {n} for query "{query}"')
d6983cb4 3994 elif n > self._MAX_RESULTS:
6a39ee13 3995 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4 3996 n = self._MAX_RESULTS
3997 return self._get_n_results(query, n)
3998
3999 def _get_n_results(self, query, n):
cc16383f 4000 """Get a specified number of results for a query.
 4001 Either this function or _search_results must be overridden by subclasses."""
4002 return self.playlist_result(
4003 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
4004 query, query)
4005
4006 def _search_results(self, query):
4007 """Returns an iterator of search results"""
611c1dd9 4008 raise NotImplementedError('This method must be implemented by subclasses')
0f818663 4009
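# Minimal sketch of a search extractor built on this base class; the API endpoint and
# response fields are hypothetical. It would be invoked e.g. as "foosearch5:cute cats".
class FooSearchIE(SearchInfoExtractor):
    IE_NAME = 'foo:search'
    _SEARCH_KEY = 'foosearch'
    _MAX_RESULTS = 100

    def _search_results(self, query):
        for page in itertools.count(1):
            results = self._download_json(
                'https://example.com/api/search', query,
                note=f'Downloading results page {page}', query={'q': query, 'page': page})
            for hit in results.get('items') or []:
                yield self.url_result(hit['url'])
            if not results.get('has_more'):
                break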
82d02080 4010 @classproperty
4011 def SEARCH_KEY(cls):
4012 return cls._SEARCH_KEY
fe7866d0 4013
4014
4015class UnsupportedURLIE(InfoExtractor):
4016 _VALID_URL = '.*'
4017 _ENABLED = False
4018 IE_DESC = False
4019
4020 def _real_extract(self, url):
4021 raise UnsupportedError(url)