yt_dlp/extractor/common.py (git blame at "[extractor] Add dev option `--load-pages`")
d6983cb4 1import base64
234416e4 2import collections
3ec05685 3import hashlib
cc16383f 4import itertools
3d3538e4 5import json
f8271158 6import math
4094b6e3 7import netrc
d6983cb4 8import os
773f291d 9import random
d6983cb4 10import sys
4094b6e3 11import time
f8271158 12import xml.etree.ElementTree
d6983cb4 13
c487cf00 14from ..compat import functools, re # isort: split
8c25f81b 15from ..compat import (
6c22cee6 16 compat_cookiejar_Cookie,
f7ad7160 17 compat_cookies_SimpleCookie,
e9c0cdd3 18 compat_etree_fromstring,
0001fcb5 19 compat_expanduser,
e64b7569 20 compat_getpass,
d6983cb4 21 compat_http_client,
22 compat_os_name,
23 compat_str,
d6983cb4 24 compat_urllib_error,
98763ee3 25 compat_urllib_parse_unquote,
15707c7e 26 compat_urllib_parse_urlencode,
41d06b04 27 compat_urllib_request,
f0b5d6af 28 compat_urlparse,
8c25f81b 29)
eb8a4433 30from ..downloader import FileDownloader
f8271158 31from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 32from ..utils import (
f8271158 33 JSON_LD_RE,
34 NO_DEFAULT,
35 ExtractorError,
36 GeoRestrictedError,
37 GeoUtils,
38 RegexNotFoundError,
39 UnsupportedError,
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
82d02080 43 classproperty,
d6983cb4 44 clean_html,
70f0f5a8 45 determine_ext,
46b18f23 46 determine_protocol,
d493f15c 47 dict_get,
42676437 48 encode_data_uri,
9b9c5355 49 error_to_compat_str,
46b18f23 50 extract_attributes,
90137ca4 51 filter_dict,
97f4aecf 52 fix_xml_ampersands,
b14f3a4c 53 float_or_none,
b868936c 54 format_field,
31bb8d3f 55 int_or_none,
34921b43 56 join_nonempty,
a4a554a7 57 js_to_json,
46b18f23 58 mimetype2ext,
3158150c 59 network_exceptions,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
46b18f23 67 sanitize_filename,
b868936c 68 sanitized_Request,
d493f15c 69 str_or_none,
ce5b9040 70 str_to_int,
f856816b 71 strip_or_none,
5d3a0e79 72 traverse_obj,
ffa89477 73 try_get,
f38de77f 74 unescapeHTML,
647eab45 75 unified_strdate,
6b3a3098 76 unified_timestamp,
46b18f23 77 update_Request,
a107193e 78 url_basename,
bebef109 79 url_or_none,
b868936c 80 urljoin,
6606817a 81 variadic,
a6571f10 82 xpath_element,
83 xpath_text,
84 xpath_with_ns,
d6983cb4 85)
c342041f 86
d6983cb4 87
86e5f3ed 88class InfoExtractor:
89 """Information Extractor class.
90
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
5d380852 95 passed to the YoutubeDL. The YoutubeDL processes this
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
98
cf0649f8 99 The type field determines the type of the result.
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
102
103 For a video, the dictionaries must include the following fields:
104
105 id: Video identifier.
d4736fdb 106 title: Video title, unescaped. Set to an empty string if video has
107 no title as opposed to "None" which signifies that the
108 extractor failed to obtain a title
d67b0b15 109
f49d89ee 110 Additionally, it must contain either a formats entry or a url one:
d67b0b15 111
112 formats: A list of dictionaries for each format available, ordered
113 from worst to best quality.
114
115 Potential fields:
116 * url The mandatory URL representing the media:
117 for plain file media - HTTP URL of this file,
118 for RTMP - RTMP URL,
119 for HLS - URL of the M3U8 media playlist,
120 for HDS - URL of the F4M manifest,
121 for DASH
122 - HTTP URL to plain file media (in case of
123 unfragmented media)
124 - URL of the MPD manifest or base URL
125 representing the media if MPD manifest
8ed7a233 126 is parsed from a string (in case of
79d2077e 127 fragmented media)
c790e93a 128 for MSS - URL of the ISM manifest.
129 * manifest_url
130 The URL of the manifest file in case of
131 fragmented media:
132 for HLS - URL of the M3U8 master playlist,
133 for HDS - URL of the F4M manifest,
134 for DASH - URL of the MPD manifest,
135 for MSS - URL of the ISM manifest.
a44ca5a4 136 * manifest_stream_number (For internal use only)
137 The index of the stream in the manifest file
10952eb2 138 * ext Will be calculated from URL if missing
139 * format A human-readable description of the format
140 ("mp4 container with h264/opus").
141 Calculated from the format_id, width, height.
142 and format_note fields if missing.
143 * format_id A short description of the format
144 ("mp4_h264_opus" or "19").
145 Technically optional, but strongly recommended.
146 * format_note Additional info about the format
147 ("3D" or "DASH video")
148 * width Width of the video, if known
149 * height Height of the video, if known
f49d89ee 150 * resolution Textual description of width and height
176f1866 151 * dynamic_range The dynamic range of the video. One of:
152 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
7217e148 153 * tbr Average bitrate of audio and video in KBit/s
154 * abr Average audio bitrate in KBit/s
155 * acodec Name of the audio codec in use
dd27fd17 156 * asr Audio sampling rate in Hertz
d67b0b15 157 * vbr Average video bitrate in KBit/s
fbb21cf5 158 * fps Frame rate
d67b0b15 159 * vcodec Name of the video codec in use
1394ce65 160 * container Name of the container format
d67b0b15 161 * filesize The number of bytes, if known in advance
9732d77e 162 * filesize_approx An estimate for the number of bytes
d67b0b15 163 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 164 * protocol The protocol that will be used for the actual
165 download, lower-case. One of "http", "https" or
166 one of the protocols defined in downloader.PROTOCOL_MAP
167 * fragment_base_url
168 Base URL for fragments. Each fragment's path
169 value (if present) will be relative to
170 this URL.
171 * fragments A list of fragments of a fragmented media.
 172                     Each fragment entry must contain either a URL
 173                     or a path. If a URL is present it should be
174 considered by a client. Otherwise both path and
175 fragment_base_url must be present. Here is
176 the list of all potential fields:
177 * "url" - fragment's URL
178 * "path" - fragment's path relative to
179 fragment_base_url
180 * "duration" (optional, int or float)
181 * "filesize" (optional, int)
182 * is_from_start Is a live format that can be downloaded
183 from the start. Boolean
f49d89ee 184 * preference Order number of this format. If this field is
08d13955 185 present and not None, the formats get sorted
38d63d84 186 by this field, regardless of all other values.
187 -1 for default (order by other properties),
188 -2 or smaller for less than default.
189 < -1000 to hide the format (if there is
190 another one which is strictly better)
191 * language Language code, e.g. "de" or "en-US".
192 * language_preference Is this in the language mentioned in
193 the URL?
194 10 if it's what the URL is about,
195 -1 for default (don't know),
196 -10 otherwise, other values reserved for now.
197 * quality Order number of the video quality of this
198 format, irrespective of the file format.
199 -1 for default (order by other properties),
200 -2 or smaller for less than default.
201 * source_preference Order number for this video source
202 (quality takes higher priority)
203 -1 for default (order by other properties),
204 -2 or smaller for less than default.
205 * http_headers A dictionary of additional HTTP headers
206 to add to the request.
6271f1ca 207 * stretched_ratio If given and not 1, indicates that the
208 video's pixels are not square.
209 width : height ratio as float.
210 * no_resume The server does not support resuming the
211 (HTTP or RTMP) download. Boolean.
88acdbc2 212 * has_drm The format has DRM and cannot be downloaded. Boolean
0a5a191a 213 * downloader_options A dictionary of downloader options
214 (For internal use only)
215 * http_chunk_size Chunk size for HTTP downloads
216 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 217 RTMP formats can also have the additional fields: page_url,
218 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
219 rtmp_protocol, rtmp_real_time
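                    For illustration, a single entry in "formats" might look like
                    the following (a hypothetical HLS rendition; apart from "url",
                    every field shown here is optional):

                        {
                            'format_id': 'hls-720p',
                            'url': 'https://example.com/media/720p/index.m3u8',
                            'manifest_url': 'https://example.com/media/master.m3u8',
                            'ext': 'mp4',
                            'protocol': 'm3u8_native',
                            'width': 1280,
                            'height': 720,
                            'fps': 25,
                            'vcodec': 'avc1.64001f',
                            'acodec': 'mp4a.40.2',
                            'tbr': 2400,
                            'http_headers': {'Referer': 'https://example.com/'},
                        }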
3dee7826 220
c0ba0f48 221 url: Final video URL.
d6983cb4 222 ext: Video filename extension.
223 format: The video format, defaults to ext (used for --get-format)
224 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 225
226 The following fields are optional:
227
08d30158 228 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 229 alt_title: A secondary title of the video.
230 display_id An alternative identifier for the video, not necessarily
231 unique, but available before title. Typically, id is
232 something like "4234987", title "Dancing naked mole rats",
233 and display_id "dancing-naked-mole-rats"
d5519808 234 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 235 * "id" (optional, string) - Thumbnail format ID
d5519808 236 * "url"
cfb56d1a 237 * "preference" (optional, int) - quality of the image
238 * "width" (optional, int)
239 * "height" (optional, int)
5e1c39ac 240 * "resolution" (optional, string "{width}x{height}",
d5519808 241 deprecated)
2de624fd 242 * "filesize" (optional, int)
297e9952 243 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 244 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 245 description: Full video description.
d6983cb4 246 uploader: Full name of the video uploader.
2bc0c46f 247 license: License name the video is licensed under.
8a92e51c 248 creator: The creator of the video.
10db0d2f 249 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 250 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 251 If not explicitly set, calculated from timestamp
252 release_timestamp: UNIX timestamp of the moment the video was released.
253 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 254 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 255 If not explicitly set, calculated from release_timestamp
256 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 257 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 258 If not explicitly set, calculated from modified_timestamp
d6983cb4 259 uploader_id: Nickname or id of the video uploader.
7bcd2830 260 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 261 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 262 Note that channel fields may or may not repeat uploader
263 fields. This depends on a particular extractor.
264 channel_id: Id of the channel.
265 channel_url: Full URL to a channel webpage.
6c73052c 266 channel_follower_count: Number of followers of the channel.
da9ec3b9 267 location: Physical location where the video was filmed.
a504ced0 268 subtitles: The available subtitles as a dictionary in the format
269 {tag: subformats}. "tag" is usually a language code, and
270 "subformats" is a list sorted from lower to higher
271 preference, each element is a dictionary with the "ext"
272 entry and one of:
a504ced0 273 * "data": The subtitles file contents
10952eb2 274 * "url": A URL pointing to the subtitles file
2412044c 275 It can optionally also have:
276 * "name": Name or description of the subtitles
08d30158 277 * "http_headers": A dictionary of additional HTTP headers
297e9952 278 to add to the request.
4bba3716 279 "ext" will be calculated from URL if missing
e167860c 280 automatic_captions: Like 'subtitles'; contains automatically generated
281 captions instead of normal subtitles
62d231c0 282 duration: Length of the video in seconds, as an integer or float.
f3d29461 283 view_count: How many users have watched the video on the platform.
284 like_count: Number of positive ratings of the video
285 dislike_count: Number of negative ratings of the video
02835c6b 286 repost_count: Number of reposts of the video
2d30521a 287     average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 288 comment_count: Number of comments on the video
289 comments: A list of comments, each with one or more of the following
290 properties (all but one of text or html optional):
291 * "author" - human-readable name of the comment author
292 * "author_id" - user ID of the comment author
a1c5d2ca 293 * "author_thumbnail" - The thumbnail of the comment author
294 * "id" - Comment ID
295 * "html" - Comment as HTML
296 * "text" - Plain text of the comment
297 * "timestamp" - UNIX timestamp of comment
298 * "parent" - ID of the comment this one is replying to.
299 Set to "root" to indicate that this is a
300 comment to the original video.
301 * "like_count" - Number of positive ratings of the comment
302 * "dislike_count" - Number of negative ratings of the comment
303 * "is_favorited" - Whether the comment is marked as
304 favorite by the video uploader
305 * "author_is_uploader" - Whether the comment is made by
306 the video uploader
8dbe9899 307 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 308 webpage_url: The URL to the video webpage, if given to yt-dlp it
 309                     should allow getting the same result again. (It will be set
310 by YoutubeDL if it's missing)
311 categories: A list of categories that the video falls in, for example
312 ["Sports", "Berlin"]
864f24bd 313 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 314 cast: A list of the video cast
315 is_live: True, False, or None (=unknown). Whether this video is a
316 live stream that goes on instead of a fixed-length video.
f76ede8e 317 was_live: True, False, or None (=unknown). Whether this video was
318 originally a live stream.
3dbb2a9d 319 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
ae30b840 320 If absent, automatically set from is_live, was_live
7c80519c 321 start_time: Time in seconds where the reproduction should start, as
10952eb2 322 specified in the URL.
297a564b 323 end_time: Time in seconds where the reproduction should end, as
10952eb2 324 specified in the URL.
55949fed 325 chapters: A list of dictionaries, with the following entries:
326 * "start_time" - The start time of the chapter in seconds
327 * "end_time" - The end time of the chapter in seconds
328 * "title" (optional, string)
6cfda058 329 playable_in_embed: Whether this video is allowed to play in embedded
330 players on other sites. Can be True (=always allowed),
331 False (=never allowed), None (=unknown), or a string
 332                     specifying the criteria for embeddability (e.g. 'whitelist')
333 availability: Under what condition the video is available. One of
334 'private', 'premium_only', 'subscriber_only', 'needs_auth',
335 'unlisted' or 'public'. Use 'InfoExtractor._availability'
336 to set it
277d6ff5 337 __post_extractor: A function to be called just before the metadata is
338 written to either disk, logger or console. The function
339 must return a dict which will be added to the info_dict.
 340                     This is useful for additional information that is
 341                     time-consuming to extract. Note that the fields thus
 342                     extracted will not be available to the output template and
343 match_filter. So, only "comments" and "comment_count" are
344 currently allowed to be extracted via this method.
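    As a concrete sketch (hypothetical values; the id/title/display_id triple
    mirrors the example given above), a minimal dict returned by _real_extract()
    for a single video could be:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'display_id': 'dancing-naked-mole-rats',
            'description': 'Full video description.',
            'uploader': 'Some Uploader',
            'timestamp': 1614556800,
            'duration': 342.5,
            'thumbnails': [{'url': 'https://example.com/thumb.jpg', 'width': 1280, 'height': 720}],
            'formats': [...],  # as documented above
        }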
d6983cb4 345
346 The following fields should only be used when the video belongs to some logical
347 chapter or section:
348
349 chapter: Name or title of the chapter the video belongs to.
350 chapter_number: Number of the chapter the video belongs to, as an integer.
351 chapter_id: Id of the chapter the video belongs to, as a unicode string.
352
353 The following fields should only be used when the video is an episode of some
8d76bdf1 354 series, programme or podcast:
355
356 series: Title of the series or programme the video episode belongs to.
9ac24e23 357 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 358 season: Title of the season the video episode belongs to.
359 season_number: Number of the season the video episode belongs to, as an integer.
360 season_id: Id of the season the video episode belongs to, as a unicode string.
361 episode: Title of the video episode. Unlike mandatory video title field,
362 this field should denote the exact title of the video episode
363 without any kind of decoration.
364 episode_number: Number of the video episode within a season, as an integer.
365 episode_id: Id of the video episode, as a unicode string.
7109903e 366
367 The following fields should only be used when the media is a track or a part of
368 a music album:
369
370 track: Title of the track.
371 track_number: Number of the track within an album or a disc, as an integer.
372 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
373 as a unicode string.
374 artist: Artist(s) of the track.
375 genre: Genre(s) of the track.
376 album: Title of the album the track belongs to.
377 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
378 album_artist: List of all artists appeared on the album (e.g.
379 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
380 and compilations).
381 disc_number: Number of the disc or other physical medium the track belongs to,
382 as an integer.
383 release_year: Year (YYYY) when the album was released.
8bcd4048 384 composer: Composer of the piece
7a93ab5f 385
deefc05b 386 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 387
388 Unless mentioned otherwise, None is equivalent to absence of information.
389
390
391 _type "playlist" indicates multiple videos.
392 There must be a key "entries", which is a list, an iterable, or a PagedList
393 object, each element of which is a valid dictionary by this specification.
fed5d032 394
b60419c5 395     Additionally, playlists can have "id", "title", and any other relevant
396 attributes with the same semantics as videos (see above).
fed5d032 397
f0d785d3 398 It can also have the following optional fields:
399
400 playlist_count: The total number of videos in a playlist. If not given,
401 YoutubeDL tries to calculate it from "entries"
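    A hypothetical playlist result following this specification (the "Example"
    ie_key and URLs are placeholders):

        {
            '_type': 'playlist',
            'id': 'PL1234567890',
            'title': 'Channel uploads',
            'playlist_count': 2,
            'entries': [
                {'_type': 'url', 'url': 'https://example.com/watch/1', 'ie_key': 'Example'},
                {'_type': 'url', 'url': 'https://example.com/watch/2', 'ie_key': 'Example'},
            ],
        }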
402
403
404 _type "multi_video" indicates that there are multiple videos that
 405     form a single show, for example multiple acts of an opera or TV episode.
406 It must have an entries key like a playlist and contain all the keys
407 required for a video at the same time.
408
409
410 _type "url" indicates that the video must be extracted from another
411 location, possibly by a different extractor. Its only required key is:
412 "url" - the next URL to extract.
413 The key "ie_key" can be set to the class name (minus the trailing "IE",
414 e.g. "Youtube") if the extractor class is known in advance.
415 Additionally, the dictionary may have any properties of the resolved entity
416 known in advance, for example "title" if the title of the referred video is
417 known ahead of time.
418
419
420 _type "url_transparent" entities have the same specification as "url", but
421 indicate that the given additional information is more precise than the one
422 associated with the resolved URL.
423 This is useful when a site employs a video service that hosts the video and
424 its technical metadata, but that video service does not embed a useful
425 title, description etc.
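    For instance (all values hypothetical), an extractor that knows the real
    title but delegates media extraction to another extractor might return:

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example/embed/abc123',
            'ie_key': 'VideoHost',   # hypothetical extractor class VideoHostIE
            'title': 'Title taken from the embedding page',
        }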
426
427
08d30158 428     Subclasses of this should define a _VALID_URL regexp and re-define the
429 _real_extract() and (optionally) _real_initialize() methods.
430 Probably, they should also be added to the list of extractors.
431
e6f21b3d 432 Subclasses may also override suitable() if necessary, but ensure the function
433 signature is preserved and that this function imports everything it needs
52efa4b3 434 (except other extractors), so that lazy_extractors works correctly.
435
436 To support username + password (or netrc) login, the extractor must define a
437 _NETRC_MACHINE and re-define _perform_login(username, password) and
438 (optionally) _initialize_pre_login() methods. The _perform_login method will
439 be called between _initialize_pre_login and _real_initialize if credentials
440 are passed by the user. In cases where it is necessary to have the login
441 process as part of the extraction rather than initialization, _perform_login
442 can be left undefined.
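    A minimal subclass sketch tying these pieces together (the site, URLs and
    field values are hypothetical; helpers such as _match_id, _download_webpage
    and _og_search_title are defined further down in this class):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
            _NETRC_MACHINE = 'example'

            def _perform_login(self, username, password):
                # POST data is passed as bytes, as documented for _download_webpage
                self._download_webpage(
                    'https://example.com/login', None, note='Logging in',
                    data=f'user={username}&pass={password}'.encode())

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': self._og_search_video_url(webpage),
                    'ext': 'mp4',
                }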
e6f21b3d 443
4248dad9 444 _GEO_BYPASS attribute may be set to False in order to disable
445 geo restriction bypass mechanisms for a particular extractor.
446 Though it won't disable explicit geo restriction bypass based on
504f20dd 447 country code provided with geo_bypass_country.
448
449 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
450 countries for this extractor. One of these countries will be used by
451 geo restriction bypass mechanism right away in order to bypass
504f20dd 452 geo restriction, of course, if the mechanism is not disabled.
773f291d 453
454 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
455 IP blocks in CIDR notation for this extractor. One of these IP blocks
456 will be used by geo restriction bypass mechanism similarly
504f20dd 457 to _GEO_COUNTRIES.
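    For example, a hypothetical geo-locked extractor could declare:

        class ExampleGeoIE(InfoExtractor):
            _GEO_BYPASS = True                    # the default; False disables bypassing
            _GEO_COUNTRIES = ['US', 'CA']
            _GEO_IP_BLOCKS = ['203.0.113.0/24']   # documentation (TEST-NET-3) range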
3ccdde8c 458
e6f21b3d 459 The _WORKING attribute should be set to False for broken IEs
460 in order to warn the users and skip the tests.
461 """
462
463 _ready = False
464 _downloader = None
773f291d 465 _x_forwarded_for_ip = None
466 _GEO_BYPASS = True
467 _GEO_COUNTRIES = None
5f95927a 468 _GEO_IP_BLOCKS = None
d6983cb4 469 _WORKING = True
52efa4b3 470 _NETRC_MACHINE = None
231025c4 471 IE_DESC = None
8dcce6a8 472 SEARCH_KEY = None
d6983cb4 473
8dcce6a8 474 def _login_hint(self, method=NO_DEFAULT, netrc=None):
475 password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
476 return {
477 None: '',
478 'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
479 'password': f'Use {password_hint}',
480 'cookies': (
481 'Use --cookies-from-browser or --cookies for the authentication. '
482 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
483 }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
9d5d4d64 484
d6983cb4 485 def __init__(self, downloader=None):
49a57e70 486 """Constructor. Receives an optional downloader (a YoutubeDL instance).
487 If a downloader is not passed during initialization,
488 it must be set using "set_downloader()" before "extract()" is called"""
d6983cb4 489 self._ready = False
773f291d 490 self._x_forwarded_for_ip = None
28f436ba 491 self._printed_messages = set()
492 self.set_downloader(downloader)
493
494 @classmethod
5ad28e7f 495 def _match_valid_url(cls, url):
496 # This does not use has/getattr intentionally - we want to know whether
497 # we have cached the regexp for *this* class, whereas getattr would also
498 # match the superclass
499 if '_VALID_URL_RE' not in cls.__dict__:
2c4aaadd 500 if '_VALID_URL' not in cls.__dict__:
501 cls._VALID_URL = cls._make_valid_url()
79cb2577 502 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 503 return cls._VALID_URL_RE.match(url)
504
505 @classmethod
506 def suitable(cls, url):
507 """Receives a URL and returns True if suitable for this IE."""
3fb4e21b 508 # This function must import everything it needs (except other extractors),
509 # so that lazy_extractors works correctly
5ad28e7f 510 return cls._match_valid_url(url) is not None
d6983cb4 511
512 @classmethod
513 def _match_id(cls, url):
5ad28e7f 514 return cls._match_valid_url(url).group('id')
ed9266db 515
1151c407 516 @classmethod
517 def get_temp_id(cls, url):
518 try:
519 return cls._match_id(url)
520 except (IndexError, AttributeError):
521 return None
522
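    # Usage sketch (hypothetical extractor): with a _VALID_URL that has a named
    # "id" group, such as
    #     _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    # these helpers behave as follows:
    #     ExampleIE._match_id('https://example.com/watch/42')     # -> '42'
    #     ExampleIE.get_temp_id('https://example.com/no-match')   # -> None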
523 @classmethod
524 def working(cls):
525 """Getter method for _WORKING."""
526 return cls._WORKING
527
52efa4b3 528 @classmethod
529 def supports_login(cls):
530 return bool(cls._NETRC_MACHINE)
531
532 def initialize(self):
533 """Initializes an instance (authentication, etc)."""
28f436ba 534 self._printed_messages = set()
535 self._initialize_geo_bypass({
536 'countries': self._GEO_COUNTRIES,
537 'ip_blocks': self._GEO_IP_BLOCKS,
538 })
4248dad9 539 if not self._ready:
52efa4b3 540 self._initialize_pre_login()
541 if self.supports_login():
542 username, password = self._get_login_info()
543 if username:
544 self._perform_login(username, password)
545 elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
8dcce6a8 546 self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
547 self._real_initialize()
548 self._ready = True
549
5f95927a 550 def _initialize_geo_bypass(self, geo_bypass_context):
551 """
552 Initialize geo restriction bypass mechanism.
553
554 This method is used to initialize geo bypass mechanism based on faking
555 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 556 is selected and a random IP belonging to this country is generated. This
557 IP will be passed as X-Forwarded-For HTTP header in all subsequent
558 HTTP requests.
559
560 This method will be used for initial geo bypass mechanism initialization
561 during the instance initialization with _GEO_COUNTRIES and
562 _GEO_IP_BLOCKS.
e39b5d4a 563
5f95927a 564 You may also manually call it from extractor's code if geo bypass
e39b5d4a 565 information is not available beforehand (e.g. obtained during
566 extraction) or due to some other reason. In this case you should pass
567 this information in geo bypass context passed as first argument. It may
568 contain following fields:
569
570 countries: List of geo unrestricted countries (similar
571 to _GEO_COUNTRIES)
572 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
573 (similar to _GEO_IP_BLOCKS)
574
e39b5d4a 575 """
773f291d 576 if not self._x_forwarded_for_ip:
577
578 # Geo bypass mechanism is explicitly disabled by user
a06916d9 579 if not self.get_param('geo_bypass', True):
580 return
581
582 if not geo_bypass_context:
583 geo_bypass_context = {}
584
585 # Backward compatibility: previously _initialize_geo_bypass
586 # expected a list of countries, some 3rd party code may still use
587 # it this way
588 if isinstance(geo_bypass_context, (list, tuple)):
589 geo_bypass_context = {
590 'countries': geo_bypass_context,
591 }
592
593 # The whole point of geo bypass mechanism is to fake IP
594 # as X-Forwarded-For HTTP header based on some IP block or
595 # country code.
596
597 # Path 1: bypassing based on IP block in CIDR notation
598
599 # Explicit IP block specified by user, use it right away
600 # regardless of whether extractor is geo bypassable or not
a06916d9 601 ip_block = self.get_param('geo_bypass_ip_block', None)
602
603 # Otherwise use random IP block from geo bypass context but only
604 # if extractor is known as geo bypassable
605 if not ip_block:
606 ip_blocks = geo_bypass_context.get('ip_blocks')
607 if self._GEO_BYPASS and ip_blocks:
608 ip_block = random.choice(ip_blocks)
609
610 if ip_block:
611 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 612 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
613 return
614
615 # Path 2: bypassing based on country code
616
617 # Explicit country code specified by user, use it right away
618 # regardless of whether extractor is geo bypassable or not
a06916d9 619 country = self.get_param('geo_bypass_country', None)
620
621 # Otherwise use random country code from geo bypass context but
622 # only if extractor is known as geo bypassable
623 if not country:
624 countries = geo_bypass_context.get('countries')
625 if self._GEO_BYPASS and countries:
626 country = random.choice(countries)
627
628 if country:
629 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 630 self._downloader.write_debug(
86e5f3ed 631 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
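    # As the docstring above notes, an extractor may also trigger the bypass
    # manually once the allowed countries become known during extraction,
    # e.g. (hypothetical country list):
    #     self._initialize_geo_bypass({'countries': ['US', 'GB']})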
632
633 def extract(self, url):
634 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 635 try:
636 for _ in range(2):
637 try:
638 self.initialize()
a06916d9 639 self.write_debug('Extracting URL: %s' % url)
0016b84e 640 ie_result = self._real_extract(url)
07cce701 641 if ie_result is None:
642 return None
643 if self._x_forwarded_for_ip:
644 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
53ed7066 645 subtitles = ie_result.get('subtitles')
646 if (subtitles and 'live_chat' in subtitles
a06916d9 647 and 'no-live-chat' in self.get_param('compat_opts', [])):
53ed7066 648 del subtitles['live_chat']
0016b84e 649 return ie_result
773f291d 650 except GeoRestrictedError as e:
651 if self.__maybe_fake_ip_and_retry(e.countries):
652 continue
773f291d 653 raise
0db3bae8 654 except UnsupportedError:
655 raise
1151c407 656 except ExtractorError as e:
0db3bae8 657 kwargs = {
658 'video_id': e.video_id or self.get_temp_id(url),
659 'ie': self.IE_NAME,
b69fd25c 660 'tb': e.traceback or sys.exc_info()[2],
0db3bae8 661 'expected': e.expected,
662 'cause': e.cause
663 }
664 if hasattr(e, 'countries'):
665 kwargs['countries'] = e.countries
7265a219 666 raise type(e)(e.orig_msg, **kwargs)
3a5bcd03 667 except compat_http_client.IncompleteRead as e:
1151c407 668 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
9650885b 669 except (KeyError, StopIteration) as e:
1151c407 670 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 671
4248dad9 672 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 673 if (not self.get_param('geo_bypass_country', None)
3089bc74 674 and self._GEO_BYPASS
a06916d9 675 and self.get_param('geo_bypass', True)
676 and not self._x_forwarded_for_ip
677 and countries):
678 country_code = random.choice(countries)
679 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
680 if self._x_forwarded_for_ip:
681 self.report_warning(
682 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
683 % (self._x_forwarded_for_ip, country_code.upper()))
684 return True
685 return False
686
d6983cb4 687 def set_downloader(self, downloader):
08d30158 688 """Sets a YoutubeDL instance as the downloader for this IE."""
689 self._downloader = downloader
690
52efa4b3 691 def _initialize_pre_login(self):
692 """ Intialization before login. Redefine in subclasses."""
693 pass
694
695 def _perform_login(self, username, password):
696 """ Login with username and password. Redefine in subclasses."""
697 pass
698
699 def _real_initialize(self):
700 """Real initialization process. Redefine in subclasses."""
701 pass
702
703 def _real_extract(self, url):
704 """Real extraction process. Redefine in subclasses."""
08d30158 705 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 706
707 @classmethod
708 def ie_key(cls):
709 """A string for getting the InfoExtractor with get_info_extractor"""
3fb4e21b 710 return cls.__name__[:-2]
56c73665 711
82d02080 712 @classproperty
713 def IE_NAME(cls):
714 return cls.__name__[:-2]
d6983cb4 715
716 @staticmethod
717 def __can_accept_status_code(err, expected_status):
718 assert isinstance(err, compat_urllib_error.HTTPError)
719 if expected_status is None:
720 return False
721 elif callable(expected_status):
722 return expected_status(err.code) is True
723 else:
6606817a 724 return err.code in variadic(expected_status)
d391b7e2 725
f95b9dee 726 def _create_request(self, url_or_request, data=None, headers={}, query={}):
727 if not isinstance(url_or_request, compat_urllib_request.Request):
728 url_or_request = sanitized_Request(url_or_request)
729 return update_Request(url_or_request, data=data, headers=headers, query=query)
730
731 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
732 """
733 Return the response handle.
734
735 See _download_webpage docstring for arguments specification.
736 """
1cf376f5 737 if not self._downloader._first_webpage_request:
49a57e70 738 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 739 if sleep_interval > 0:
5ef7d9bd 740 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 741 time.sleep(sleep_interval)
742 else:
743 self._downloader._first_webpage_request = False
744
745 if note is None:
746 self.report_download_webpage(video_id)
747 elif note is not False:
7cc3570e 748 if video_id is None:
86e5f3ed 749 self.to_screen(str(note))
7cc3570e 750 else:
86e5f3ed 751 self.to_screen(f'{video_id}: {note}')
752
753 # Some sites check X-Forwarded-For HTTP header in order to figure out
754 # the origin of the client behind proxy. This allows bypassing geo
755 # restriction by faking this header's value to IP that belongs to some
756 # geo unrestricted country. We will do so once we encounter any
757 # geo restriction error.
758 if self._x_forwarded_for_ip:
759 if 'X-Forwarded-For' not in headers:
760 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
761
d6983cb4 762 try:
f95b9dee 763 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 764 except network_exceptions as err:
765 if isinstance(err, compat_urllib_error.HTTPError):
766 if self.__can_accept_status_code(err, expected_status):
767 # Retain reference to error to prevent file object from
768 # being closed before it can be read. Works around the
769 # effects of <https://bugs.python.org/issue15002>
770 # introduced in Python 3.4.1.
771 err.fp._error = err
772 return err.fp
773
774 if errnote is False:
775 return False
d6983cb4 776 if errnote is None:
f1a9d64e 777 errnote = 'Unable to download webpage'
7f8b2714 778
86e5f3ed 779 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 780 if fatal:
497d2fab 781 raise ExtractorError(errmsg, cause=err)
7cc3570e 782 else:
6a39ee13 783 self.report_warning(errmsg)
7cc3570e 784 return False
d6983cb4 785
786 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
787 """
788 Return a tuple (page content as string, URL handle).
789
617f658b 790 Arguments:
791 url_or_request -- plain text URL as a string or
 792                             a compat_urllib_request.Request object
793 video_id -- Video/playlist/item identifier (string)
794
795 Keyword arguments:
796 note -- note printed before downloading (string)
797 errnote -- note printed in case of an error (string)
798 fatal -- flag denoting whether error should be considered fatal,
 799                   i.e. whether it should cause ExtractorError to be raised,
800 otherwise a warning will be reported and extraction continued
801 encoding -- encoding for a page content decoding, guessed automatically
802 when not explicitly specified
803 data -- POST data (bytes)
804 headers -- HTTP headers (dict)
805 query -- URL query (dict)
 806         expected_status -- allows accepting failed HTTP requests (non-2xx
807 status code) by explicitly specifying a set of accepted status
808 codes. Can be any of the following entities:
809 - an integer type specifying an exact failed status code to
810 accept
811 - a list or a tuple of integer types specifying a list of
812 failed status codes to accept
813 - a callable accepting an actual failed status code and
814 returning True if it should be accepted
815 Note that this argument does not affect success status codes (2xx)
816 which are always accepted.
d391b7e2 817 """
617f658b 818
819 # Strip hashes from the URL (#1038)
820 if isinstance(url_or_request, (compat_str, str)):
821 url_or_request = url_or_request.partition('#')[0]
822
d391b7e2 823 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
824 if urlh is False:
825 assert not fatal
826 return False
c9a77969 827 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
828 return (content, urlh)
829
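    # Illustration of expected_status (hypothetical calls): accept a single
    # failed status code, or any 4xx response, and still receive the page body:
    #     webpage, urlh = self._download_webpage_handle(
    #         url, video_id, expected_status=404)
    #     webpage, urlh = self._download_webpage_handle(
    #         url, video_id, expected_status=lambda status: 400 <= status < 500)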
830 @staticmethod
831 def _guess_encoding_from_content(content_type, webpage_bytes):
832 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
833 if m:
834 encoding = m.group(1)
835 else:
0d75ae2c 836 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
837 webpage_bytes[:1024])
838 if m:
839 encoding = m.group(1).decode('ascii')
840 elif webpage_bytes.startswith(b'\xff\xfe'):
841 encoding = 'utf-16'
842 else:
843 encoding = 'utf-8'
844
845 return encoding
846
847 def __check_blocked(self, content):
848 first_block = content[:512]
849 if ('<title>Access to this site is blocked</title>' in content
850 and 'Websense' in first_block):
851 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
852 blocked_iframe = self._html_search_regex(
853 r'<iframe src="([^"]+)"', content,
854 'Websense information URL', default=None)
855 if blocked_iframe:
856 msg += ' Visit %s for more details' % blocked_iframe
857 raise ExtractorError(msg, expected=True)
858 if '<title>The URL you requested has been blocked</title>' in first_block:
859 msg = (
860 'Access to this webpage has been blocked by Indian censorship. '
861 'Use a VPN or proxy server (with --proxy) to route around it.')
862 block_msg = self._html_search_regex(
863 r'</h1><p>(.*?)</p>',
864 content, 'block message', default=None)
865 if block_msg:
866 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
867 raise ExtractorError(msg, expected=True)
868 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
869 and 'blocklist.rkn.gov.ru' in content):
870 raise ExtractorError(
871 'Access to this webpage has been blocked by decision of the Russian government. '
872 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
873 expected=True)
874
f95b9dee 875 def _request_dump_filename(self, url, video_id):
876 basen = f'{video_id}_{url}'
877 trim_length = self.get_param('trim_file_name') or 240
878 if len(basen) > trim_length:
879 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
880 basen = basen[:trim_length - len(h)] + h
881 filename = sanitize_filename(f'{basen}.dump', restricted=True)
882 # Working around MAX_PATH limitation on Windows (see
883 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
884 if compat_os_name == 'nt':
885 absfilepath = os.path.abspath(filename)
886 if len(absfilepath) > 259:
887 filename = fR'\\?\{absfilepath}'
888 return filename
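    # Sketch of the round trip this filename enables (option names as used by
    # this commit; the exact name depends on sanitize_filename):
    #   --write-pages : _webpage_read_content() dumps '<video_id>_<url>.dump'
    #   --load-pages  : download_content() reads that dump back instead of
    #                   performing the HTTP request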
889
890 def __decode_webpage(self, webpage_bytes, encoding, headers):
891 if not encoding:
892 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
893 try:
894 return webpage_bytes.decode(encoding, 'replace')
895 except LookupError:
896 return webpage_bytes.decode('utf-8', 'replace')
897
c9a77969 898 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
899 webpage_bytes = urlh.read()
900 if prefix is not None:
901 webpage_bytes = prefix + webpage_bytes
a06916d9 902 if self.get_param('dump_intermediate_pages', False):
f610dbb0 903 self.to_screen('Dumping request to ' + urlh.geturl())
904 dump = base64.b64encode(webpage_bytes).decode('ascii')
905 self._downloader.to_screen(dump)
f95b9dee 906 if self.get_param('write_pages'):
 907             filename = self._request_dump_filename(urlh.geturl(), video_id)
908 self.to_screen(f'Saving request to {filename}')
909 with open(filename, 'wb') as outf:
910 outf.write(webpage_bytes)
911
f95b9dee 912 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
4457823d 913 self.__check_blocked(content)
2410c43d 914
23be51d8 915 return content
d6983cb4 916
e01c3d2e 917 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
918 if transform_source:
919 xml_string = transform_source(xml_string)
920 try:
921 return compat_etree_fromstring(xml_string.encode('utf-8'))
f9934b96 922 except xml.etree.ElementTree.ParseError as ve:
923 errmsg = '%s: Failed to parse XML ' % video_id
924 if fatal:
925 raise ExtractorError(errmsg, cause=ve)
926 else:
927 self.report_warning(errmsg + str(ve))
267ed0c5 928
ee27297f 929 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
930 if transform_source:
931 json_string = transform_source(json_string)
3d3538e4 932 try:
ee27297f 933 try:
934 return json.loads(json_string, strict=False)
935 except json.JSONDecodeError as e:
936 if not lenient:
937 raise
938 try:
939 return json.loads(json_string[:e.pos], strict=False)
940 except ValueError:
941 raise e
3d3538e4 942 except ValueError as ve:
943 errmsg = '%s: Failed to parse JSON ' % video_id
944 if fatal:
945 raise ExtractorError(errmsg, cause=ve)
946 else:
947 self.report_warning(errmsg + str(ve))
3d3538e4 948
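    # For example, lenient=True tolerates trailing garbage after a valid JSON
    # value by re-parsing up to the position reported by JSONDecodeError:
    #     self._parse_json('{"title": "x"} trailing junk', video_id, lenient=True)
    #     # -> {'title': 'x'}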
adddc50c 949 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
950 return self._parse_json(
951 data[data.find('{'):data.rfind('}') + 1],
952 video_id, transform_source, fatal)
953
617f658b 954 def __create_download_methods(name, parser, note, errnote, return_value):
955
956 def parse(ie, content, *args, **kwargs):
957 if parser is None:
958 return content
959 # parser is fetched by name so subclasses can override it
960 return getattr(ie, parser)(content, *args, **kwargs)
961
962 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote,
963 transform_source=None, fatal=True, *args, **kwargs):
964 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, *args, **kwargs)
965 if res is False:
966 return res
967 content, urlh = res
968 return parse(self, content, video_id, transform_source, fatal), urlh
969
f95b9dee 970 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
971 fatal=True, encoding=None, data=None, headers={}, query={}, *args, **kwargs):
972 if self.get_param('load_pages'):
973 url_or_request = self._create_request(url_or_request, data, headers, query)
974 filename = self._request_dump_filename(url_or_request.full_url, video_id)
975 self.to_screen(f'Loading request from {filename}')
976 try:
977 with open(filename, 'rb') as dumpf:
978 webpage_bytes = dumpf.read()
979 except OSError as e:
980 self.report_warning(f'Unable to load request from disk: {e}')
981 else:
982 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
983 return parse(self, content, video_id, transform_source, fatal)
984 args = [url_or_request, video_id, note, errnote, transform_source, fatal, encoding, data, headers, query, *args]
617f658b 985 if parser is None:
986 args.pop(4) # transform_source
987 # The method is fetched by name so subclasses can override _download_..._handle
988 res = getattr(self, download_handle.__name__)(*args, **kwargs)
989 return res if res is False else res[0]
990
991 def impersonate(func, name, return_value):
992 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
993 func.__doc__ = f'''
994 @param transform_source Apply this transformation before parsing
995 @returns {return_value}
996
997 See _download_webpage_handle docstring for other arguments specification
998 '''
999
1000 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1001 impersonate(download_content, f'_download_{name}', f'{return_value}')
1002 return download_handle, download_content
1003
1004 _download_xml_handle, _download_xml = __create_download_methods(
1005 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1006 _download_json_handle, _download_json = __create_download_methods(
1007 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1008 _download_socket_json_handle, _download_socket_json = __create_download_methods(
1009 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1010 __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
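    # Usage sketch for the generated helpers (hypothetical endpoint). Each
    # returns the parsed value; the *_handle variant also returns the URL handle:
    #     data = self._download_json(
    #         f'https://example.com/api/videos/{video_id}', video_id,
    #         note='Downloading video metadata', fatal=False) or {}
    #     title = data.get('title')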
adddc50c 1011
617f658b 1012 def _download_webpage(
1013 self, url_or_request, video_id, note=None, errnote=None,
1014 fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
adddc50c 1015 """
617f658b 1016 Return the data of the page as a string.
adddc50c 1017
617f658b 1018 Keyword arguments:
1019 tries -- number of tries
1020 timeout -- sleep interval between tries
1021
1022 See _download_webpage_handle docstring for other arguments specification.
adddc50c 1023 """
617f658b 1024
1025 R''' # NB: These are unused; should they be deprecated?
1026 if tries != 1:
1027 self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1028 if timeout is NO_DEFAULT:
1029 timeout = 5
1030 else:
1031 self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1032 '''
1033
1034 try_count = 0
1035 while True:
1036 try:
1037 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1038 except compat_http_client.IncompleteRead as e:
1039 try_count += 1
1040 if try_count >= tries:
1041 raise e
1042 self._sleep(timeout, video_id)
adddc50c 1043
28f436ba 1044 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
b868936c 1045 idstr = format_field(video_id, template='%s: ')
28f436ba 1046 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1047 if only_once:
1048 if f'WARNING: {msg}' in self._printed_messages:
1049 return
1050 self._printed_messages.add(f'WARNING: {msg}')
1051 self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1052
a06916d9 1053 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1054 """Print msg to screen, prefixing it with '[ie_name]'"""
86e5f3ed 1055 self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1056
1057 def write_debug(self, msg, *args, **kwargs):
86e5f3ed 1058 self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1059
1060 def get_param(self, name, default=None, *args, **kwargs):
1061 if self._downloader:
1062 return self._downloader.params.get(name, default, *args, **kwargs)
1063 return default
d6983cb4 1064
88acdbc2 1065 def report_drm(self, video_id, partial=False):
1066 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1067
1068 def report_extraction(self, id_or_name):
1069 """Report information extraction."""
f1a9d64e 1070 self.to_screen('%s: Extracting information' % id_or_name)
1071
1072 def report_download_webpage(self, video_id):
1073 """Report webpage download."""
f1a9d64e 1074 self.to_screen('%s: Downloading webpage' % video_id)
1075
1076 def report_age_confirmation(self):
1077 """Report attempt to confirm age."""
f1a9d64e 1078 self.to_screen('Confirming age')
d6983cb4 1079
1080 def report_login(self):
1081 """Report attempt to log in."""
f1a9d64e 1082 self.to_screen('Logging in')
fc79158d 1083
b7da73eb 1084 def raise_login_required(
9d5d4d64 1085 self, msg='This video is only available for registered users',
52efa4b3 1086 metadata_available=False, method=NO_DEFAULT):
f2ebc5c7 1087 if metadata_available and (
1088 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1089 self.report_warning(msg)
7265a219 1090 return
8dcce6a8 1091 msg += format_field(self._login_hint(method), template='. %s')
46890374 1092 raise ExtractorError(msg, expected=True)
43e7d3c9 1093
b7da73eb 1094 def raise_geo_restricted(
1095 self, msg='This video is not available from your location due to geo restriction',
1096 countries=None, metadata_available=False):
f2ebc5c7 1097 if metadata_available and (
1098 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1099 self.report_warning(msg)
1100 else:
1101 raise GeoRestrictedError(msg, countries=countries)
1102
1103 def raise_no_formats(self, msg, expected=False, video_id=None):
f2ebc5c7 1104 if expected and (
1105 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1106 self.report_warning(msg, video_id)
1107 elif isinstance(msg, ExtractorError):
1108 raise msg
b7da73eb 1109 else:
1110 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1111
5f6a1245 1112 # Methods for following #608
c0d0b01f 1113 @staticmethod
311b6615 1114 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1115 """Returns a URL that points to a page that should be processed"""
311b6615 1116 if ie is not None:
1117 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
7012b23c 1118 if video_id is not None:
311b6615 1119 kwargs['id'] = video_id
830d53bf 1120 if video_title is not None:
311b6615 1121 kwargs['title'] = video_title
1122 return {
1123 **kwargs,
1124 '_type': 'url_transparent' if url_transparent else 'url',
1125 'url': url,
1126 }
1127
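    # E.g. (hypothetical values):
    #     self.url_result('https://example.com/watch/42', ie='Example', video_id='42')
    #     # -> {'_type': 'url', 'url': 'https://example.com/watch/42',
    #     #     'ie_key': 'Example', 'id': '42'}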
1128 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
1129 urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
311b6615 1130 for m in orderedSet(map(getter, matches) if getter else matches))
1131 return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
46b18f23 1132
c0d0b01f 1133 @staticmethod
311b6615 1134 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
d6983cb4 1135 """Returns a playlist"""
d6983cb4 1136 if playlist_id:
311b6615 1137 kwargs['id'] = playlist_id
d6983cb4 1138 if playlist_title:
311b6615 1139 kwargs['title'] = playlist_title
ecc97af3 1140 if playlist_description is not None:
311b6615 1141 kwargs['description'] = playlist_description
1142 return {
1143 **kwargs,
1144 '_type': 'multi_video' if multi_video else 'playlist',
1145 'entries': entries,
1146 }
d6983cb4 1147
c342041f 1148 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1149 """
1150 Perform a regex search on the given string, using a single or a list of
1151 patterns returning the first matching group.
1152 In case of failure return a default value or raise a WARNING or a
55b3e45b 1153 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4 1154 """
61d3665d 1155 if string is None:
1156 mobj = None
77f90330 1157 elif isinstance(pattern, (str, re.Pattern)):
1158 mobj = re.search(pattern, string, flags)
1159 else:
1160 for p in pattern:
1161 mobj = re.search(p, string, flags)
1162 if mobj:
1163 break
d6983cb4 1164
ec11a9f4 1165 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1166
1167 if mobj:
1168 if group is None:
1169 # return the first matching group
1170 return next(g for g in mobj.groups() if g is not None)
198f7ea8 1171 elif isinstance(group, (list, tuple)):
1172 return tuple(mobj.group(g) for g in group)
1173 else:
1174 return mobj.group(group)
c342041f 1175 elif default is not NO_DEFAULT:
1176 return default
1177 elif fatal:
f1a9d64e 1178 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 1179 else:
6a39ee13 1180 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1181 return None
1182
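    # Typical call (hypothetical pattern): since a default is supplied, a failed
    # match returns None instead of raising RegexNotFoundError:
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title', default=None)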
c342041f 1183 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1184 """
1185 Like _search_regex, but strips HTML tags and unescapes entities.
1186 """
711ede6e 1187 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1188 if res:
1189 return clean_html(res).strip()
1190 else:
1191 return res
1192
1193 def _get_netrc_login_info(self, netrc_machine=None):
1194 username = None
1195 password = None
1196 netrc_machine = netrc_machine or self._NETRC_MACHINE
1197
a06916d9 1198 if self.get_param('usenetrc', False):
2118fdd1 1199 try:
0001fcb5 1200 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1201 if os.path.isdir(netrc_file):
1202 netrc_file = os.path.join(netrc_file, '.netrc')
1203 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1204 if info is not None:
1205 username = info[0]
1206 password = info[2]
1207 else:
1208 raise netrc.NetrcParseError(
1209 'No authenticators for %s' % netrc_machine)
86e5f3ed 1210 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1211 self.report_warning(
dcce092e 1212 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1213
dcce092e 1214 return username, password
2118fdd1 1215
1b6712ab 1216 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1217 """
cf0649f8 1218 Get the login info as (username, password)
1219 First look for the manually specified credentials using username_option
1220 and password_option as keys in params dictionary. If no such credentials
1221 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1222 value.
1223 If there's no info available, return (None, None)
1224 """
1225
1226 # Attempt to use provided username and password or .netrc data
a06916d9 1227 username = self.get_param(username_option)
1228 if username is not None:
1229 password = self.get_param(password_option)
2118fdd1 1230 else:
1b6712ab 1231 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1232
2133565c 1233 return username, password
fc79158d 1234
e64b7569 1235 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1236 """
1237 Get the two-factor authentication info
1238 TODO - asking the user will be required for sms/phone verify
1239 currently just uses the command line option
1240 If there's no info available, return None
1241 """
83317f69 1242
a06916d9 1243 tfa = self.get_param('twofactor')
1244 if tfa is not None:
1245 return tfa
83317f69 1246
e64b7569 1247 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1248
1249 # Helper functions for extracting OpenGraph info
1250 @staticmethod
ab2d5247 1251 def _og_regexes(prop):
448ef1f3 1252 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1253 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1254 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1255 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1256 return [
1257 template % (property_re, content_re),
1258 template % (content_re, property_re),
ab2d5247 1259 ]
46720279 1260
1261 @staticmethod
1262 def _meta_regex(prop):
1263 return r'''(?isx)<meta
8b9848ac 1264 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1265 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1266
3c4e6d83 1267 def _og_search_property(self, prop, html, name=None, **kargs):
6606817a 1268 prop = variadic(prop)
46720279 1269 if name is None:
b070564e
S
1270 name = 'OpenGraph %s' % prop[0]
1271 og_regexes = []
1272 for p in prop:
1273 og_regexes.extend(self._og_regexes(p))
1274 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1275 if escaped is None:
1276 return None
1277 return unescapeHTML(escaped)
46720279
JMF
1278
1279 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1280 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1281
1282 def _og_search_description(self, html, **kargs):
1283 return self._og_search_property('description', html, fatal=False, **kargs)
1284
04f3fd2c 1285 def _og_search_title(self, html, *, fatal=False, **kargs):
1286 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1287
8ffa13e0 1288 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1289 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1290 if secure:
1291 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1292 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1293
78338f71
JMF
1294 def _og_search_url(self, html, **kargs):
1295 return self._og_search_property('url', html, **kargs)
1296
04f3fd2c 1297 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1298 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1299
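# Illustrative sketch (not part of the original file): a minimal
# _real_extract() showing how the OpenGraph and <title> helpers above are
# typically combined. The URL handling is hypothetical; webpage and video_id
# come from the standard helpers.
def _hypothetical_og_extract_sketch(self, url):
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)
    return {
        'id': video_id,
        # Fall back from og:title to the <title> tag
        'title': self._og_search_title(webpage) or self._html_extract_title(webpage),
        'description': self._og_search_description(webpage),
        'thumbnail': self._og_search_thumbnail(webpage),
        'url': self._og_search_video_url(webpage, default=None),
    }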
40c696e5 1300 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
6606817a 1301 name = variadic(name)
59040888 1302 if display_name is None:
88d9f6c0 1303 display_name = name[0]
59040888 1304 return self._html_search_regex(
88d9f6c0 1305 [self._meta_regex(n) for n in name],
711ede6e 1306 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1307
1308 def _dc_search_uploader(self, html):
1309 return self._html_search_meta('dc.creator', html, 'uploader')
1310
8dbe9899
PH
1311 def _rta_search(self, html):
1312 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1313 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1314 r' content="RTA-5042-1996-1400-1577-RTA"',
1315 html):
1316 return 18
1317 return 0
1318
59040888
PH
1319 def _media_rating_search(self, html):
1320 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1321 rating = self._html_search_meta('rating', html)
1322
1323 if not rating:
1324 return None
1325
1326 RATING_TABLE = {
1327 'safe for kids': 0,
1328 'general': 8,
1329 '14 years': 14,
1330 'mature': 17,
1331 'restricted': 19,
1332 }
d800609c 1333 return RATING_TABLE.get(rating.lower())
59040888 1334
69319969 1335 def _family_friendly_search(self, html):
6ca7732d 1336 # See http://schema.org/VideoObject
ac8491fc
S
1337 family_friendly = self._html_search_meta(
1338 'isFamilyFriendly', html, default=None)
69319969
NJ
1339
1340 if not family_friendly:
1341 return None
1342
1343 RATING_TABLE = {
1344 '1': 0,
1345 'true': 0,
1346 '0': 18,
1347 'false': 18,
1348 }
d800609c 1349 return RATING_TABLE.get(family_friendly.lower())
69319969 1350
0c708f11
JMF
1351 def _twitter_search_player(self, html):
1352 return self._html_search_meta('twitter:player', html,
9e1a5b84 1353 'twitter card player')
0c708f11 1354
95b31e26 1355 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
4433bb02 1356 json_ld_list = list(re.finditer(JSON_LD_RE, html))
321b5e08 1357 default = kwargs.get('default', NO_DEFAULT)
321b5e08
S
1358 # JSON-LD may be malformed and thus `fatal` should be respected.
1359 # At the same time `default` may be passed that assumes `fatal=False`
1360 # for _search_regex. Let's simulate the same behavior here as well.
dbf5416a 1361 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
4433bb02
S
1362 json_ld = []
1363 for mobj in json_ld_list:
1364 json_ld_item = self._parse_json(
1365 mobj.group('json_ld'), video_id, fatal=fatal)
1366 if not json_ld_item:
1367 continue
1368 if isinstance(json_ld_item, dict):
1369 json_ld.append(json_ld_item)
1370 elif isinstance(json_ld_item, (list, tuple)):
1371 json_ld.extend(json_ld_item)
1372 if json_ld:
1373 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1374 if json_ld:
1375 return json_ld
1376 if default is not NO_DEFAULT:
1377 return default
1378 elif fatal:
1379 raise RegexNotFoundError('Unable to extract JSON-LD')
1380 else:
6a39ee13 1381 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
4433bb02 1382 return {}
4ca2a3cf 1383
95b31e26 1384 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
4ca2a3cf
S
1385 if isinstance(json_ld, compat_str):
1386 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1387 if not json_ld:
1388 return {}
1389 info = {}
46933a15
S
1390 if not isinstance(json_ld, (list, tuple, dict)):
1391 return info
1392 if isinstance(json_ld, dict):
1393 json_ld = [json_ld]
bae14048 1394
e7e4a6e0
S
1395 INTERACTION_TYPE_MAP = {
1396 'CommentAction': 'comment',
1397 'AgreeAction': 'like',
1398 'DisagreeAction': 'dislike',
1399 'LikeAction': 'like',
1400 'DislikeAction': 'dislike',
1401 'ListenAction': 'view',
1402 'WatchAction': 'view',
1403 'ViewAction': 'view',
1404 }
1405
29f7c58a 1406 def extract_interaction_type(e):
1407 interaction_type = e.get('interactionType')
1408 if isinstance(interaction_type, dict):
1409 interaction_type = interaction_type.get('@type')
1410 return str_or_none(interaction_type)
1411
e7e4a6e0
S
1412 def extract_interaction_statistic(e):
1413 interaction_statistic = e.get('interactionStatistic')
29f7c58a 1414 if isinstance(interaction_statistic, dict):
1415 interaction_statistic = [interaction_statistic]
e7e4a6e0
S
1416 if not isinstance(interaction_statistic, list):
1417 return
1418 for is_e in interaction_statistic:
1419 if not isinstance(is_e, dict):
1420 continue
1421 if is_e.get('@type') != 'InteractionCounter':
1422 continue
29f7c58a 1423 interaction_type = extract_interaction_type(is_e)
1424 if not interaction_type:
e7e4a6e0 1425 continue
ce5b9040
S
1426 # For the interaction count some sites provide a string instead of
1427 # an integer (as per spec), with non-digit characters (e.g. ","),
1428 # so extract the count with the more relaxed str_to_int
1429 interaction_count = str_to_int(is_e.get('userInteractionCount'))
e7e4a6e0
S
1430 if interaction_count is None:
1431 continue
1432 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1433 if not count_kind:
1434 continue
1435 count_key = '%s_count' % count_kind
1436 if info.get(count_key) is not None:
1437 continue
1438 info[count_key] = interaction_count
1439
f5225737 1440 def extract_chapter_information(e):
1441 chapters = [{
1442 'title': part.get('name'),
1443 'start_time': part.get('startOffset'),
1444 'end_time': part.get('endOffset'),
85553414 1445 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
f5225737 1446 for idx, (last_c, current_c, next_c) in enumerate(zip(
1447 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1448 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1449 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1450 if None in current_c.values():
1451 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1452 return
1453 if chapters:
1454 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1455 info['chapters'] = chapters
1456
bae14048
S
1457 def extract_video_object(e):
1458 assert e['@type'] == 'VideoObject'
f7ad7160 1459 author = e.get('author')
bae14048 1460 info.update({
bebef109 1461 'url': url_or_none(e.get('contentUrl')),
bae14048
S
1462 'title': unescapeHTML(e.get('name')),
1463 'description': unescapeHTML(e.get('description')),
21633673 1464 'thumbnails': [{'url': url}
1465 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1466 if url_or_none(url)],
bae14048
S
1467 'duration': parse_duration(e.get('duration')),
1468 'timestamp': unified_timestamp(e.get('uploadDate')),
f7ad7160 1469 # author can be an instance of the 'Organization' or 'Person' types;
1470 # both can have a 'name' property (inherited from the 'Thing' type). [1]
1471 # However, some websites use a plain 'Text' type instead.
1472 # 1. https://schema.org/VideoObject
1473 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
bae14048
S
1474 'filesize': float_or_none(e.get('contentSize')),
1475 'tbr': int_or_none(e.get('bitrate')),
1476 'width': int_or_none(e.get('width')),
1477 'height': int_or_none(e.get('height')),
33a81c2c 1478 'view_count': int_or_none(e.get('interactionCount')),
bae14048 1479 })
e7e4a6e0 1480 extract_interaction_statistic(e)
f5225737 1481 extract_chapter_information(e)
bae14048 1482
d5c32548
ZM
1483 def traverse_json_ld(json_ld, at_top_level=True):
1484 for e in json_ld:
1485 if at_top_level and '@context' not in e:
1486 continue
1487 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1488 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1489 break
46933a15
S
1490 item_type = e.get('@type')
1491 if expected_type is not None and expected_type != item_type:
4433bb02 1492 continue
8f122fa0 1493 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1494 if rating is not None:
1495 info['average_rating'] = rating
c69701c6 1496 if item_type in ('TVEpisode', 'Episode'):
440863ad 1497 episode_name = unescapeHTML(e.get('name'))
46933a15 1498 info.update({
440863ad 1499 'episode': episode_name,
46933a15
S
1500 'episode_number': int_or_none(e.get('episodeNumber')),
1501 'description': unescapeHTML(e.get('description')),
1502 })
440863ad
S
1503 if not info.get('title') and episode_name:
1504 info['title'] = episode_name
46933a15 1505 part_of_season = e.get('partOfSeason')
c69701c6 1506 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
458fd30f
S
1507 info.update({
1508 'season': unescapeHTML(part_of_season.get('name')),
1509 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1510 })
d16b3c66 1511 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
c69701c6 1512 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1513 info['series'] = unescapeHTML(part_of_series.get('name'))
391256dc
S
1514 elif item_type == 'Movie':
1515 info.update({
1516 'title': unescapeHTML(e.get('name')),
1517 'description': unescapeHTML(e.get('description')),
1518 'duration': parse_duration(e.get('duration')),
1519 'timestamp': unified_timestamp(e.get('dateCreated')),
1520 })
3931b845 1521 elif item_type in ('Article', 'NewsArticle'):
46933a15
S
1522 info.update({
1523 'timestamp': parse_iso8601(e.get('datePublished')),
1524 'title': unescapeHTML(e.get('headline')),
d5c32548 1525 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
46933a15 1526 })
2edb38e8 1527 if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject':
1528 extract_video_object(e['video'][0])
46933a15 1529 elif item_type == 'VideoObject':
bae14048 1530 extract_video_object(e)
4433bb02
S
1531 if expected_type is None:
1532 continue
1533 else:
1534 break
c69701c6
S
1535 video = e.get('video')
1536 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1537 extract_video_object(video)
4433bb02
S
1538 if expected_type is None:
1539 continue
1540 else:
1541 break
d5c32548
ZM
1542 traverse_json_ld(json_ld)
1543
90137ca4 1544 return filter_dict(info)
4ca2a3cf 1545
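# Illustrative sketch (not part of the original file): pulling structured
# metadata out of a page via the JSON-LD helpers above and merging it with
# extractor-specific fields. The URL pattern is hypothetical.
def _hypothetical_json_ld_sketch(self, url):
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)
    # With default={} this never raises; it returns a (possibly empty) info dict
    info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
    return {
        'id': video_id,
        'title': self._og_search_title(webpage, default=None),
        # JSON-LD fields take precedence over the OpenGraph fallback above
        **info,
    }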
135dfa2c 1546 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1547 return self._parse_json(
1548 self._search_regex(
1549 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1550 webpage, 'next.js data', fatal=fatal, **kw),
1551 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1552
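# Illustrative sketch (not part of the original file): digging video metadata
# out of the __NEXT_DATA__ blob returned above. The key path
# ('props', 'pageProps', 'video') and its fields are hypothetical and
# site-specific.
def _hypothetical_nextjs_sketch(self, url):
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)
    nextjs_data = self._search_nextjs_data(webpage, video_id)
    video = traverse_obj(nextjs_data, ('props', 'pageProps', 'video')) or {}
    return {
        'id': video_id,
        'title': video.get('title'),
        'url': url_or_none(video.get('streamUrl')),
    }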
66f4c04e
THD
1553 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1554 ''' Parses Nuxt.js metadata. This works as long as the function that __NUXT__ invokes is a pure function. '''
1555 # not all websites use this context name, but it can be changed
1556 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1557 rectx = re.escape(context_name)
1558 js, arg_keys, arg_vals = self._search_regex(
1559 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1560 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1561 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1562
1563 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1564
1565 for key, val in args.items():
1566 if val in ('undefined', 'void 0'):
1567 args[key] = 'null'
1568
1569 return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1570
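# Illustrative sketch (not part of the original file): _search_nuxt_data()
# returns the first entry of window.__NUXT__['data']; the 'item' key and its
# fields below are hypothetical.
def _hypothetical_nuxt_sketch(self, url):
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    data = self._search_nuxt_data(webpage, display_id)
    return {
        'id': str_or_none(traverse_obj(data, ('item', 'id'))) or display_id,
        'title': traverse_obj(data, ('item', 'title')),
        'url': url_or_none(traverse_obj(data, ('item', 'video_url'))),
    }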
27713812 1571 @staticmethod
f8da79f8 1572 def _hidden_inputs(html):
586f1cc5 1573 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1574 hidden_inputs = {}
c8498368
S
1575 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1576 attrs = extract_attributes(input)
1577 if not attrs:
201ea3ee 1578 continue
c8498368 1579 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1580 continue
c8498368
S
1581 name = attrs.get('name') or attrs.get('id')
1582 value = attrs.get('value')
1583 if name and value is not None:
1584 hidden_inputs[name] = value
201ea3ee 1585 return hidden_inputs
27713812 1586
cf61d96d
S
1587 def _form_hidden_inputs(self, form_id, html):
1588 form = self._search_regex(
73eb13df 1589 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1590 html, '%s form' % form_id, group='form')
1591 return self._hidden_inputs(form)
1592
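# Illustrative sketch (not part of the original file): a typical login flow
# built on _form_hidden_inputs(), carrying over hidden CSRF-style fields.
# The form id, field names and URLs are hypothetical.
def _hypothetical_form_login_sketch(self, username, password):
    login_page = self._download_webpage(
        'https://hypothetical.example/signin', None, 'Downloading login page')
    form = self._form_hidden_inputs('login_form', login_page)
    form.update({'email': username, 'password': password})
    self._download_webpage(
        'https://hypothetical.example/signin', None, 'Logging in',
        data=compat_urllib_parse_urlencode(form).encode())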
eb8a4433 1593 class FormatSort:
b050d210 1594 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
eb8a4433 1595
8326b00a 1596 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
176f1866 1597 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
f304da8a 1598 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
198e3a04 1599 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
53ed7066 1600 'height', 'width', 'proto', 'vext', 'abr', 'aext',
f304da8a 1601 'fps', 'fs_approx', 'source', 'id')
eb8a4433 1602
1603 settings = {
1604 'vcodec': {'type': 'ordered', 'regex': True,
155d2b48 1605 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
eb8a4433 1606 'acodec': {'type': 'ordered', 'regex': True,
a10aa588 1607 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
176f1866 1608 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1609 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
f137c99e 1610 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
f304da8a 1611 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
eb8a4433 1612 'vext': {'type': 'ordered', 'field': 'video_ext',
91ebc640 1613 'order': ('mp4', 'webm', 'flv', '', 'none'),
eb8a4433 1614 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1615 'aext': {'type': 'ordered', 'field': 'audio_ext',
1616 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1617 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1618 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
f5510afe 1619 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
8326b00a 1620 'field': ('vcodec', 'acodec'),
1621 'function': lambda it: int(any(v != 'none' for v in it))},
f983b875 1622 'ie_pref': {'priority': True, 'type': 'extractor'},
63be1aab 1623 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1624 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
10beccc9 1625 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1626 'quality': {'convert': 'float', 'default': -1},
eb8a4433 1627 'filesize': {'convert': 'bytes'},
f137c99e 1628 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1629 'id': {'convert': 'string', 'field': 'format_id'},
eb8a4433 1630 'height': {'convert': 'float_none'},
1631 'width': {'convert': 'float_none'},
1632 'fps': {'convert': 'float_none'},
1633 'tbr': {'convert': 'float_none'},
1634 'vbr': {'convert': 'float_none'},
1635 'abr': {'convert': 'float_none'},
1636 'asr': {'convert': 'float_none'},
10beccc9 1637 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
63be1aab 1638
eb8a4433 1639 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
63be1aab 1640 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1641 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1642 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
f5510afe 1643 'res': {'type': 'multiple', 'field': ('height', 'width'),
dbf5416a 1644 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
63be1aab 1645
19188702 1646 # For compatibility with youtube-dl
1647 'format_id': {'type': 'alias', 'field': 'id'},
1648 'preference': {'type': 'alias', 'field': 'ie_pref'},
1649 'language_preference': {'type': 'alias', 'field': 'lang'},
63be1aab 1650 'source_preference': {'type': 'alias', 'field': 'source'},
08d30158 1651 'protocol': {'type': 'alias', 'field': 'proto'},
63be1aab 1652 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
08d30158 1653
1654 # Deprecated
1655 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1656 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1657 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1658 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1659 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1660 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1661 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1662 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1663 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1664 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1665 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1666 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1667 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1668 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1669 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1670 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1671 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1672 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1673 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1674 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
63be1aab 1675 }
eb8a4433 1676
f304da8a 1677 def __init__(self, ie, field_preference):
1678 self._order = []
1679 self.ydl = ie._downloader
1680 self.evaluate_params(self.ydl.params, field_preference)
1681 if ie.get_param('verbose'):
1682 self.print_verbose_info(self.ydl.write_debug)
eb8a4433 1683
1684 def _get_field_setting(self, field, key):
1685 if field not in self.settings:
ee8dd27a 1686 if key in ('forced', 'priority'):
1687 return False
1688 self.ydl.deprecation_warning(
1689 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1690 'and may be removed in a future version')
eb8a4433 1691 self.settings[field] = {}
1692 propObj = self.settings[field]
1693 if key not in propObj:
1694 type = propObj.get('type')
1695 if key == 'field':
1696 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1697 elif key == 'convert':
1698 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
4bcc7bd1 1699 else:
f5510afe 1700 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
eb8a4433 1701 propObj[key] = default
1702 return propObj[key]
1703
1704 def _resolve_field_value(self, field, value, convertNone=False):
1705 if value is None:
1706 if not convertNone:
1707 return None
4bcc7bd1 1708 else:
eb8a4433 1709 value = value.lower()
1710 conversion = self._get_field_setting(field, 'convert')
1711 if conversion == 'ignore':
1712 return None
1713 if conversion == 'string':
1714 return value
1715 elif conversion == 'float_none':
1716 return float_or_none(value)
1717 elif conversion == 'bytes':
1718 return FileDownloader.parse_bytes(value)
1719 elif conversion == 'order':
da9be05e 1720 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
eb8a4433 1721 use_regex = self._get_field_setting(field, 'regex')
1722 list_length = len(order_list)
1723 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1724 if use_regex and value is not None:
da9be05e 1725 for i, regex in enumerate(order_list):
eb8a4433 1726 if regex and re.match(regex, value):
1727 return list_length - i
1728 return list_length - empty_pos # not in list
1729 else: # not regex or value = None
1730 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1731 else:
1732 if value.isnumeric():
1733 return float(value)
4bcc7bd1 1734 else:
eb8a4433 1735 self.settings[field]['convert'] = 'string'
1736 return value
1737
1738 def evaluate_params(self, params, sort_extractor):
1739 self._use_free_order = params.get('prefer_free_formats', False)
1740 self._sort_user = params.get('format_sort', [])
1741 self._sort_extractor = sort_extractor
1742
1743 def add_item(field, reverse, closest, limit_text):
1744 field = field.lower()
1745 if field in self._order:
1746 return
1747 self._order.append(field)
1748 limit = self._resolve_field_value(field, limit_text)
1749 data = {
1750 'reverse': reverse,
1751 'closest': False if limit is None else closest,
1752 'limit_text': limit_text,
1753 'limit': limit}
1754 if field in self.settings:
1755 self.settings[field].update(data)
1756 else:
1757 self.settings[field] = data
1758
1759 sort_list = (
1760 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1761 + (tuple() if params.get('format_sort_force', False)
1762 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1763 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1764
1765 for item in sort_list:
1766 match = re.match(self.regex, item)
1767 if match is None:
1768 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1769 field = match.group('field')
1770 if field is None:
1771 continue
1772 if self._get_field_setting(field, 'type') == 'alias':
ee8dd27a 1773 alias, field = field, self._get_field_setting(field, 'field')
08d30158 1774 if self._get_field_setting(alias, 'deprecated'):
19188702 1775 self.ydl.deprecation_warning(
1776 f'Format sorting alias {alias} is deprecated '
1777 f'and may be removed in a future version. Please use {field} instead')
eb8a4433 1778 reverse = match.group('reverse') is not None
b050d210 1779 closest = match.group('separator') == '~'
eb8a4433 1780 limit_text = match.group('limit')
1781
1782 has_limit = limit_text is not None
1783 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1784 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1785
1786 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
b5ae35ee 1787 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
eb8a4433 1788 limit_count = len(limits)
1789 for (i, f) in enumerate(fields):
1790 add_item(f, reverse, closest,
1791 limits[i] if i < limit_count
1792 else limits[0] if has_limit and not has_multiple_limits
1793 else None)
1794
0760b0a7 1795 def print_verbose_info(self, write_debug):
b31fdeed 1796 if self._sort_user:
0760b0a7 1797 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
eb8a4433 1798 if self._sort_extractor:
0760b0a7 1799 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1800 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
eb8a4433 1801 '+' if self._get_field_setting(field, 'reverse') else '', field,
1802 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1803 self._get_field_setting(field, 'limit_text'),
1804 self._get_field_setting(field, 'limit'))
1805 if self._get_field_setting(field, 'limit_text') is not None else '')
1806 for field in self._order if self._get_field_setting(field, 'visible')]))
1807
1808 def _calculate_field_preference_from_value(self, format, field, type, value):
1809 reverse = self._get_field_setting(field, 'reverse')
1810 closest = self._get_field_setting(field, 'closest')
1811 limit = self._get_field_setting(field, 'limit')
1812
1813 if type == 'extractor':
1814 maximum = self._get_field_setting(field, 'max')
1815 if value is None or (maximum is not None and value >= maximum):
f983b875 1816 value = -1
eb8a4433 1817 elif type == 'boolean':
1818 in_list = self._get_field_setting(field, 'in_list')
1819 not_in_list = self._get_field_setting(field, 'not_in_list')
1820 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1821 elif type == 'ordered':
1822 value = self._resolve_field_value(field, value, True)
1823
1824 # try to convert to number
6a04a74e 1825 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
eb8a4433 1826 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1827 if is_num:
1828 value = val_num
1829
1830 return ((-10, 0) if value is None
1831 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1832 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1833 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1834 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1835 else (-1, value, 0))
1836
1837 def _calculate_field_preference(self, format, field):
1838 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1839 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1840 if type == 'multiple':
1841 type = 'field' # Only 'field' is allowed in multiple for now
1842 actual_fields = self._get_field_setting(field, 'field')
1843
f5510afe 1844 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
eb8a4433 1845 else:
1846 value = get_value(field)
1847 return self._calculate_field_preference_from_value(format, field, type, value)
1848
1849 def calculate_preference(self, format):
1850 # Determine missing protocol
1851 if not format.get('protocol'):
1852 format['protocol'] = determine_protocol(format)
1853
1854 # Determine missing ext
1855 if not format.get('ext') and 'url' in format:
1856 format['ext'] = determine_ext(format['url'])
1857 if format.get('vcodec') == 'none':
8326b00a 1858 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
eb8a4433 1859 format['video_ext'] = 'none'
1860 else:
1861 format['video_ext'] = format['ext']
1862 format['audio_ext'] = 'none'
1863 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1864 # format['preference'] = -1000
1865
1866 # Determine missing bitrates
1867 if format.get('tbr') is None:
1868 if format.get('vbr') is not None and format.get('abr') is not None:
1869 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1870 else:
b5ae35ee 1871 if format.get('vcodec') != 'none' and format.get('vbr') is None:
eb8a4433 1872 format['vbr'] = format.get('tbr') - format.get('abr', 0)
b5ae35ee 1873 if format.get('acodec') != 'none' and format.get('abr') is None:
eb8a4433 1874 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1875
1876 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1877
1878 def _sort_formats(self, formats, field_preference=[]):
1879 if not formats:
88acdbc2 1880 return
1d485a1a 1881 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
59040888 1882
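# Illustrative sketch (not part of the original file): how an extractor
# typically finishes format extraction with the FormatSort machinery above.
# The second argument is an optional extractor-specific sort order; 'res' and
# 'tbr' are real FormatSort fields, the method name is hypothetical.
def _hypothetical_sort_sketch(self, formats):
    # Prefer higher resolution, then total bitrate, for this site only;
    # user-supplied --format-sort options still take precedence
    self._sort_formats(formats, field_preference=('res', 'tbr'))
    return formats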
96a53167
S
1883 def _check_formats(self, formats, video_id):
1884 if formats:
1885 formats[:] = filter(
1886 lambda f: self._is_valid_url(
1887 f['url'], video_id,
1888 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1889 formats)
1890
f5bdb444
S
1891 @staticmethod
1892 def _remove_duplicate_formats(formats):
1893 format_urls = set()
1894 unique_formats = []
1895 for f in formats:
1896 if f['url'] not in format_urls:
1897 format_urls.add(f['url'])
1898 unique_formats.append(f)
1899 formats[:] = unique_formats
1900
45024183 1901 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1902 url = self._proto_relative_url(url, scheme='http:')
1903 # For now assume non HTTP(S) URLs always valid
1904 if not (url.startswith('http://') or url.startswith('https://')):
1905 return True
96a53167 1906 try:
45024183 1907 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1908 return True
8bdd16b4 1909 except ExtractorError as e:
25e911a9 1910 self.to_screen(
8bdd16b4 1911 '%s: %s URL is invalid, skipping: %s'
1912 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1913 return False
96a53167 1914
20991253 1915 def http_scheme(self):
1ede5b24 1916 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1917 return (
1918 'http:'
a06916d9 1919 if self.get_param('prefer_insecure', False)
20991253
PH
1920 else 'https:')
1921
57c7411f
PH
1922 def _proto_relative_url(self, url, scheme=None):
1923 if url is None:
1924 return url
1925 if url.startswith('//'):
1926 if scheme is None:
1927 scheme = self.http_scheme()
1928 return scheme + url
1929 else:
1930 return url
1931
4094b6e3
PH
1932 def _sleep(self, timeout, video_id, msg_template=None):
1933 if msg_template is None:
f1a9d64e 1934 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1935 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1936 self.to_screen(msg)
1937 time.sleep(timeout)
1938
f983b875 1939 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1940 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1941 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
a076c1f9 1942 res = self._download_xml_handle(
f036a632 1943 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1944 'Unable to download f4m manifest',
1945 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1946 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1947 transform_source=transform_source,
7360c06f 1948 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 1949 if res is False:
8d29e47f 1950 return []
31bb8d3f 1951
a076c1f9
E
1952 manifest, urlh = res
1953 manifest_url = urlh.geturl()
1954
0fdbb332 1955 return self._parse_f4m_formats(
f983b875 1956 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1957 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1958
f983b875 1959 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
0fdbb332 1960 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1961 fatal=True, m3u8_id=None):
f9934b96 1962 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
d9eb580a
S
1963 return []
1964
7a5c1cfe 1965 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
fb72ec58 1966 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1967 if akamai_pv is not None and ';' in akamai_pv.text:
1968 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1969 if playerVerificationChallenge.strip() != '':
1970 return []
1971
31bb8d3f 1972 formats = []
7a47d07c 1973 manifest_version = '1.0'
b2527359 1974 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1975 if not media_nodes:
7a47d07c 1976 manifest_version = '2.0'
34e48bed 1977 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762 1978 # Remove unsupported DRM protected media from final formats
067aa17e 1979 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
b22ca762
S
1980 media_nodes = remove_encrypted_media(media_nodes)
1981 if not media_nodes:
1982 return formats
48107c19
S
1983
1984 manifest_base_url = get_base_url(manifest)
0a5685b2 1985
a6571f10 1986 bootstrap_info = xpath_element(
0a5685b2
YCH
1987 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1988 'bootstrap info', default=None)
1989
edd6074c
RA
1990 vcodec = None
1991 mime_type = xpath_text(
1992 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1993 'base URL', default=None)
1994 if mime_type and mime_type.startswith('audio/'):
1995 vcodec = 'none'
1996
b2527359 1997 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1998 tbr = int_or_none(media_el.attrib.get('bitrate'))
1999 width = int_or_none(media_el.attrib.get('width'))
2000 height = int_or_none(media_el.attrib.get('height'))
34921b43 2001 format_id = join_nonempty(f4m_id, tbr or i)
448bb5f3
YCH
2002 # If <bootstrapInfo> is present, the specified f4m is a
2003 # stream-level manifest, and only set-level manifests may refer to
2004 # external resources. See section 11.4 and section 4 of F4M spec
2005 if bootstrap_info is None:
2006 media_url = None
2007 # @href is introduced in 2.0, see section 11.6 of F4M spec
2008 if manifest_version == '2.0':
2009 media_url = media_el.attrib.get('href')
2010 if media_url is None:
2011 media_url = media_el.attrib.get('url')
31c746e5
S
2012 if not media_url:
2013 continue
cc357c4d
S
2014 manifest_url = (
2015 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 2016 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
2017 # If media_url is itself an f4m manifest, do the recursive extraction,
2018 # since bitrates in the parent manifest (this one) and the media_url manifest
2019 # may differ, leading to an inability to resolve the format by the requested
2020 # bitrate in the f4m downloader
240b6045
YCH
2021 ext = determine_ext(manifest_url)
2022 if ext == 'f4m':
77b8b4e6 2023 f4m_formats = self._extract_f4m_formats(
f983b875 2024 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
77b8b4e6
S
2025 transform_source=transform_source, fatal=fatal)
2026 # Sometimes a stream-level manifest contains a single media entry that
2027 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2028 # At the same time the parent's media entry in the set-level manifest may
2029 # contain it. We will copy it from the parent in such cases.
2030 if len(f4m_formats) == 1:
2031 f = f4m_formats[0]
2032 f.update({
2033 'tbr': f.get('tbr') or tbr,
2034 'width': f.get('width') or width,
2035 'height': f.get('height') or height,
2036 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 2037 'vcodec': vcodec,
77b8b4e6
S
2038 })
2039 formats.extend(f4m_formats)
70f0f5a8 2040 continue
240b6045
YCH
2041 elif ext == 'm3u8':
2042 formats.extend(self._extract_m3u8_formats(
2043 manifest_url, video_id, 'mp4', preference=preference,
f983b875 2044 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
240b6045 2045 continue
31bb8d3f 2046 formats.append({
77b8b4e6 2047 'format_id': format_id,
31bb8d3f 2048 'url': manifest_url,
30d0b549 2049 'manifest_url': manifest_url,
a6571f10 2050 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 2051 'protocol': 'f4m',
b2527359 2052 'tbr': tbr,
77b8b4e6
S
2053 'width': width,
2054 'height': height,
edd6074c 2055 'vcodec': vcodec,
60ca389c 2056 'preference': preference,
f983b875 2057 'quality': quality,
31bb8d3f 2058 })
31bb8d3f
JMF
2059 return formats
2060
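# Illustrative sketch (not part of the original file): HDS manifests are
# normally handled with a single call to the helper above; the manifest URL
# is hypothetical.
def _hypothetical_f4m_sketch(self, video_id):
    return self._extract_f4m_formats(
        'https://hypothetical.example/stream/manifest.f4m?hdcore=3.2.0',
        video_id, f4m_id='hds', fatal=False)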
f983b875 2061 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 2062 return {
34921b43 2063 'format_id': join_nonempty(m3u8_id, 'meta'),
704df56d
PH
2064 'url': m3u8_url,
2065 'ext': ext,
2066 'protocol': 'm3u8',
37768f92 2067 'preference': preference - 100 if preference else -100,
f983b875 2068 'quality': quality,
704df56d
PH
2069 'resolution': 'multiple',
2070 'format_note': 'Quality selection URL',
16da9bbc
YCH
2071 }
2072
b5ae35ee 2073 def _report_ignoring_subs(self, name):
2074 self.report_warning(bug_reports_message(
2075 f'Ignoring subtitle tracks found in the {name} manifest; '
2076 'if any subtitle tracks are missing,'
2077 ), only_once=True)
2078
a0c3b2d5
F
2079 def _extract_m3u8_formats(self, *args, **kwargs):
2080 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2081 if subs:
b5ae35ee 2082 self._report_ignoring_subs('HLS')
a0c3b2d5
F
2083 return fmts
2084
2085 def _extract_m3u8_formats_and_subtitles(
177877c5 2086 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2087 preference=None, quality=None, m3u8_id=None, note=None,
2088 errnote=None, fatal=True, live=False, data=None, headers={},
2089 query={}):
2090
dbd82a1d 2091 res = self._download_webpage_handle(
81515ad9 2092 m3u8_url, video_id,
37a3bb66 2093 note='Downloading m3u8 information' if note is None else note,
2094 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 2095 fatal=fatal, data=data, headers=headers, query=query)
cb252080 2096
dbd82a1d 2097 if res is False:
a0c3b2d5 2098 return [], {}
cb252080 2099
dbd82a1d 2100 m3u8_doc, urlh = res
37113045 2101 m3u8_url = urlh.geturl()
9cdffeeb 2102
a0c3b2d5 2103 return self._parse_m3u8_formats_and_subtitles(
cb252080 2104 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 2105 preference=preference, quality=quality, m3u8_id=m3u8_id,
2106 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2107 headers=headers, query=query, video_id=video_id)
cb252080 2108
a0c3b2d5 2109 def _parse_m3u8_formats_and_subtitles(
42676437 2110 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2111 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2112 errnote=None, fatal=True, data=None, headers={}, query={},
2113 video_id=None):
60755938 2114 formats, subtitles = [], {}
a0c3b2d5 2115
6b993ca7 2116 has_drm = re.search('|'.join([
2117 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2118 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2119 ]), m3u8_doc)
a0c3b2d5 2120
60755938 2121 def format_url(url):
2122 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2123
2124 if self.get_param('hls_split_discontinuity', False):
2125 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2126 if not m3u8_doc:
2127 if not manifest_url:
2128 return []
2129 m3u8_doc = self._download_webpage(
2130 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2131 note=False, errnote='Failed to download m3u8 playlist information')
2132 if m3u8_doc is False:
2133 return []
2134 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 2135
60755938 2136 else:
2137 def _extract_m3u8_playlist_indices(*args, **kwargs):
2138 return [None]
310c2ed2 2139
cb252080
S
2140 # References:
2141 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2142 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2143 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2144
2145 # We should try extracting formats only from master playlists [1, 4.3.4],
2146 # i.e. playlists that describe available qualities. On the other hand
2147 # media playlists [1, 4.3.3] should be returned as is, since they contain
2148 # just the media, without quality renditions.
9cdffeeb 2149 # Fortunately, a master playlist can be easily distinguished from a media
cb252080 2150 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
a0566bbf 2151 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2152 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
2153 # media playlist and MUST NOT appear in a master playlist, thus we can
2154 # clearly detect a media playlist with this criterion.
2155
9cdffeeb 2156 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2157 formats = [{
34921b43 2158 'format_id': join_nonempty(m3u8_id, idx),
60755938 2159 'format_index': idx,
42676437 2160 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2161 'ext': ext,
2162 'protocol': entry_protocol,
2163 'preference': preference,
2164 'quality': quality,
88acdbc2 2165 'has_drm': has_drm,
60755938 2166 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2167
a0c3b2d5 2168 return formats, subtitles
cb252080
S
2169
2170 groups = {}
2171 last_stream_inf = {}
2172
2173 def extract_media(x_media_line):
2174 media = parse_m3u8_attributes(x_media_line)
2175 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2176 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2177 if not (media_type and group_id and name):
2178 return
2179 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2180 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2181 if media_type == 'SUBTITLES':
3907333c 2182 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2183 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2184 # However, lack of URI has been spotted in the wild.
2185 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2186 if not media.get('URI'):
2187 return
a0c3b2d5
F
2188 url = format_url(media['URI'])
2189 sub_info = {
2190 'url': url,
2191 'ext': determine_ext(url),
2192 }
4a2f19ab
F
2193 if sub_info['ext'] == 'm3u8':
2194 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2195 # files may contain is WebVTT:
2196 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2197 sub_info['ext'] = 'vtt'
2198 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2199 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2200 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2201 if media_type not in ('VIDEO', 'AUDIO'):
2202 return
2203 media_url = media.get('URI')
2204 if media_url:
310c2ed2 2205 manifest_url = format_url(media_url)
60755938 2206 formats.extend({
34921b43 2207 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2208 'format_note': name,
2209 'format_index': idx,
2210 'url': manifest_url,
2211 'manifest_url': m3u8_url,
2212 'language': media.get('LANGUAGE'),
2213 'ext': ext,
2214 'protocol': entry_protocol,
2215 'preference': preference,
2216 'quality': quality,
2217 'vcodec': 'none' if media_type == 'AUDIO' else None,
2218 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2219
2220 def build_stream_name():
2221 # Although the specification does not mention a NAME attribute for the
3019cb0c
S
2222 # EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2223 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2224 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2225 stream_name = last_stream_inf.get('NAME')
2226 if stream_name:
2227 return stream_name
2228 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2229 # from corresponding rendition group
2230 stream_group_id = last_stream_inf.get('VIDEO')
2231 if not stream_group_id:
2232 return
2233 stream_group = groups.get(stream_group_id)
2234 if not stream_group:
2235 return stream_group_id
2236 rendition = stream_group[0]
2237 return rendition.get('NAME') or stream_group_id
2238
379306ef 2239 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2240 # chance to detect video only formats when EXT-X-STREAM-INF tags
2241 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2242 for line in m3u8_doc.splitlines():
2243 if line.startswith('#EXT-X-MEDIA:'):
2244 extract_media(line)
2245
704df56d
PH
2246 for line in m3u8_doc.splitlines():
2247 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2248 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2249 elif line.startswith('#') or not line.strip():
2250 continue
2251 else:
9c99bef7 2252 tbr = float_or_none(
3089bc74
S
2253 last_stream_inf.get('AVERAGE-BANDWIDTH')
2254 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2255 manifest_url = format_url(line.strip())
5ef62fc4 2256
60755938 2257 for idx in _extract_m3u8_playlist_indices(manifest_url):
2258 format_id = [m3u8_id, None, idx]
310c2ed2 2259 # Bandwidth of live streams may differ over time thus making
2260 # format_id unpredictable. So it's better to keep provided
2261 # format_id intact.
2262 if not live:
60755938 2263 stream_name = build_stream_name()
34921b43 2264 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2265 f = {
34921b43 2266 'format_id': join_nonempty(*format_id),
60755938 2267 'format_index': idx,
310c2ed2 2268 'url': manifest_url,
2269 'manifest_url': m3u8_url,
2270 'tbr': tbr,
2271 'ext': ext,
2272 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2273 'protocol': entry_protocol,
2274 'preference': preference,
2275 'quality': quality,
2276 }
2277 resolution = last_stream_inf.get('RESOLUTION')
2278 if resolution:
2279 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2280 if mobj:
2281 f['width'] = int(mobj.group('width'))
2282 f['height'] = int(mobj.group('height'))
2283 # Unified Streaming Platform
2284 mobj = re.search(
2285 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2286 if mobj:
2287 abr, vbr = mobj.groups()
2288 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2289 f.update({
2290 'vbr': vbr,
2291 'abr': abr,
2292 })
2293 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2294 f.update(codecs)
2295 audio_group_id = last_stream_inf.get('AUDIO')
2296 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2297 # references a rendition group MUST have a CODECS attribute.
2298 # However, this is not always respected, for example, [2]
2299 # contains EXT-X-STREAM-INF tag which references AUDIO
2300 # rendition group but does not have CODECS and despite
2301 # referencing an audio group it represents a complete
2302 # (with audio and video) format. So, for such cases we will
2303 # ignore references to rendition groups and treat them
2304 # as complete formats.
2305 if audio_group_id and codecs and f.get('vcodec') != 'none':
2306 audio_group = groups.get(audio_group_id)
2307 if audio_group and audio_group[0].get('URI'):
2308 # TODO: update acodec for audio only formats with
2309 # the same GROUP-ID
2310 f['acodec'] = 'none'
fc21af50 2311 if not f.get('ext'):
2312 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2313 formats.append(f)
2314
2315 # for DailyMotion
2316 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2317 if progressive_uri:
2318 http_f = f.copy()
2319 del http_f['manifest_url']
2320 http_f.update({
2321 'format_id': f['format_id'].replace('hls-', 'http-'),
2322 'protocol': 'http',
2323 'url': progressive_uri,
2324 })
2325 formats.append(http_f)
5ef62fc4 2326
cb252080 2327 last_stream_inf = {}
a0c3b2d5 2328 return formats, subtitles
704df56d 2329
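# Illustrative sketch (not part of the original file): the usual HLS call
# pattern built on the helpers above, returning both formats and subtitles.
# The master playlist URL is hypothetical.
def _hypothetical_hls_sketch(self, video_id):
    formats, subtitles = self._extract_m3u8_formats_and_subtitles(
        'https://hypothetical.example/master.m3u8', video_id, 'mp4',
        entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
    self._sort_formats(formats)
    return {
        'id': video_id,
        'title': video_id,
        'formats': formats,
        'subtitles': subtitles,
    }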
3cf4b91d
C
2330 def _extract_m3u8_vod_duration(
2331 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2332
2333 m3u8_vod = self._download_webpage(
2334 m3u8_vod_url, video_id,
2335 note='Downloading m3u8 VOD manifest' if note is None else note,
2336 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2337 fatal=False, data=data, headers=headers, query=query)
2338
2339 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2340
2341 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2342 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2343 return None
2344
2345 return int(sum(
2346 float(line[len('#EXTINF:'):].split(',')[0])
2347 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2348
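# Illustrative sketch (not part of the original file): what the EXTINF
# summation above computes for a small, made-up VOD playlist.
_HYPOTHETICAL_M3U8_VOD = (
    '#EXTM3U\n'
    '#EXT-X-PLAYLIST-TYPE:VOD\n'
    '#EXT-X-TARGETDURATION:10\n'
    '#EXTINF:9.009,\n1.ts\n'
    '#EXTINF:9.009,\n2.ts\n'
    '#EXTINF:3.003,\n3.ts\n'
    '#EXT-X-ENDLIST\n')
# self._parse_m3u8_vod_duration(self._HYPOTHETICAL_M3U8_VOD, video_id)
# would return int(9.009 + 9.009 + 3.003) == 21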
a107193e
S
2349 @staticmethod
2350 def _xpath_ns(path, namespace=None):
2351 if not namespace:
2352 return path
2353 out = []
2354 for c in path.split('/'):
2355 if not c or c == '.':
2356 out.append(c)
2357 else:
2358 out.append('{%s}%s' % (namespace, c))
2359 return '/'.join(out)
2360
da1c94ee 2361 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
a076c1f9
E
2362 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2363 if res is False:
995029a1 2364 assert not fatal
774a46c5 2365 return [], {}
e89a2aab 2366
a076c1f9
E
2367 smil, urlh = res
2368 smil_url = urlh.geturl()
2369
17712eeb 2370 namespace = self._parse_smil_namespace(smil)
a107193e 2371
da1c94ee 2372 fmts = self._parse_smil_formats(
a107193e 2373 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2374 subs = self._parse_smil_subtitles(
2375 smil, namespace=namespace)
2376
2377 return fmts, subs
2378
2379 def _extract_smil_formats(self, *args, **kwargs):
2380 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2381 if subs:
b5ae35ee 2382 self._report_ignoring_subs('SMIL')
da1c94ee 2383 return fmts
a107193e
S
2384
2385 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
a076c1f9
E
2386 res = self._download_smil(smil_url, video_id, fatal=fatal)
2387 if res is False:
a107193e 2388 return {}
a076c1f9
E
2389
2390 smil, urlh = res
2391 smil_url = urlh.geturl()
2392
a107193e
S
2393 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2394
09f572fb 2395 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a076c1f9 2396 return self._download_xml_handle(
a107193e 2397 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2398 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2399
2400 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2401 namespace = self._parse_smil_namespace(smil)
a107193e
S
2402
2403 formats = self._parse_smil_formats(
2404 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2405 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2406
2407 video_id = os.path.splitext(url_basename(smil_url))[0]
2408 title = None
2409 description = None
647eab45 2410 upload_date = None
a107193e
S
2411 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2412 name = meta.attrib.get('name')
2413 content = meta.attrib.get('content')
2414 if not name or not content:
2415 continue
2416 if not title and name == 'title':
2417 title = content
2418 elif not description and name in ('description', 'abstract'):
2419 description = content
647eab45
S
2420 elif not upload_date and name == 'date':
2421 upload_date = unified_strdate(content)
a107193e 2422
1e5bcdec
S
2423 thumbnails = [{
2424 'id': image.get('type'),
2425 'url': image.get('src'),
2426 'width': int_or_none(image.get('width')),
2427 'height': int_or_none(image.get('height')),
2428 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2429
a107193e
S
2430 return {
2431 'id': video_id,
2432 'title': title or video_id,
2433 'description': description,
647eab45 2434 'upload_date': upload_date,
1e5bcdec 2435 'thumbnails': thumbnails,
a107193e
S
2436 'formats': formats,
2437 'subtitles': subtitles,
2438 }
2439
17712eeb
S
2440 def _parse_smil_namespace(self, smil):
2441 return self._search_regex(
2442 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2443
f877c6ae 2444 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
2445 base = smil_url
2446 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2447 b = meta.get('base') or meta.get('httpBase')
2448 if b:
2449 base = b
2450 break
e89a2aab
S
2451
2452 formats = []
2453 rtmp_count = 0
a107193e 2454 http_count = 0
7f32e5dc 2455 m3u8_count = 0
9359f3d4 2456 imgs_count = 0
a107193e 2457
9359f3d4 2458 srcs = set()
ad96b4c8
YCH
2459 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2460 for medium in media:
2461 src = medium.get('src')
81e1c4e2 2462 if not src or src in srcs:
a107193e 2463 continue
9359f3d4 2464 srcs.add(src)
a107193e 2465
ad96b4c8
YCH
2466 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2467 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2468 width = int_or_none(medium.get('width'))
2469 height = int_or_none(medium.get('height'))
2470 proto = medium.get('proto')
2471 ext = medium.get('ext')
a107193e 2472 src_ext = determine_ext(src)
ad96b4c8 2473 streamer = medium.get('streamer') or base
a107193e
S
2474
2475 if proto == 'rtmp' or streamer.startswith('rtmp'):
2476 rtmp_count += 1
2477 formats.append({
2478 'url': streamer,
2479 'play_path': src,
2480 'ext': 'flv',
2481 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2482 'tbr': bitrate,
2483 'filesize': filesize,
2484 'width': width,
2485 'height': height,
2486 })
f877c6ae
YCH
2487 if transform_rtmp_url:
2488 streamer, src = transform_rtmp_url(streamer, src)
2489 formats[-1].update({
2490 'url': streamer,
2491 'play_path': src,
2492 })
a107193e
S
2493 continue
2494
2495 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 2496 src_url = src_url.strip()
a107193e
S
2497
2498 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 2499 m3u8_formats = self._extract_m3u8_formats(
2500 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2501 if len(m3u8_formats) == 1:
2502 m3u8_count += 1
2503 m3u8_formats[0].update({
2504 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2505 'tbr': bitrate,
2506 'width': width,
2507 'height': height,
2508 })
2509 formats.extend(m3u8_formats)
bd21ead2 2510 elif src_ext == 'f4m':
a107193e
S
2511 f4m_url = src_url
2512 if not f4m_params:
2513 f4m_params = {
2514 'hdcore': '3.2.0',
2515 'plugin': 'flowplayer-3.2.0.1',
2516 }
2517 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 2518 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 2519 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
bd21ead2
RA
2520 elif src_ext == 'mpd':
2521 formats.extend(self._extract_mpd_formats(
2522 src_url, video_id, mpd_id='dash', fatal=False))
2523 elif re.search(r'\.ism/[Mm]anifest', src_url):
2524 formats.extend(self._extract_ism_formats(
2525 src_url, video_id, ism_id='mss', fatal=False))
2526 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
2527 http_count += 1
2528 formats.append({
2529 'url': src_url,
2530 'ext': ext or src_ext or 'flv',
2531 'format_id': 'http-%d' % (bitrate or http_count),
2532 'tbr': bitrate,
2533 'filesize': filesize,
2534 'width': width,
2535 'height': height,
2536 })
63757032 2537
9359f3d4
F
2538 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2539 src = medium.get('src')
2540 if not src or src in srcs:
2541 continue
2542 srcs.add(src)
2543
2544 imgs_count += 1
2545 formats.append({
2546 'format_id': 'imagestream-%d' % (imgs_count),
2547 'url': src,
2548 'ext': mimetype2ext(medium.get('type')),
2549 'acodec': 'none',
2550 'vcodec': 'none',
2551 'width': int_or_none(medium.get('width')),
2552 'height': int_or_none(medium.get('height')),
2553 'format_note': 'SMIL storyboards',
2554 })
2555
e89a2aab
S
2556 return formats
2557
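    # Illustrative note (not part of the original source): a minimal sketch of how an
    # extractor might use the SMIL helpers above. "smil_url" is a hypothetical manifest
    # URL; the convenience wrapper _extract_smil_formats (defined earlier in this file)
    # downloads and parses in one step:
    #
    #     formats = self._extract_smil_formats(smil_url, video_id, fatal=False)
    #     self._sort_formats(formats)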
ce00af87 2558 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2559 urls = []
a107193e
S
2560 subtitles = {}
2561 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2562 src = textstream.get('src')
d413095f 2563 if not src or src in urls:
a107193e 2564 continue
d413095f 2565 urls.append(src)
df634be2 2566 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2567 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2568 subtitles.setdefault(lang, []).append({
2569 'url': src,
2570 'ext': ext,
2571 })
2572 return subtitles
63757032 2573
47a5cb77 2574 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
a076c1f9 2575 res = self._download_xml_handle(
47a5cb77 2576 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5 2577 'Unable to download xspf manifest', fatal=fatal)
a076c1f9 2578 if res is False:
942acef5 2579 return []
a076c1f9
E
2580
2581 xspf, urlh = res
2582 xspf_url = urlh.geturl()
2583
47a5cb77
S
2584 return self._parse_xspf(
2585 xspf, playlist_id, xspf_url=xspf_url,
2586 xspf_base_url=base_url(xspf_url))
8d6765cf 2587
47a5cb77 2588 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2589 NS_MAP = {
2590 'xspf': 'http://xspf.org/ns/0/',
2591 's1': 'http://static.streamone.nl/player/ns/0',
2592 }
2593
2594 entries = []
47a5cb77 2595 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2596 title = xpath_text(
98044462 2597 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2598 description = xpath_text(
2599 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2600 thumbnail = xpath_text(
2601 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2602 duration = float_or_none(
2603 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2604
47a5cb77
S
2605 formats = []
2606 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2607 format_url = urljoin(xspf_base_url, location.text)
2608 if not format_url:
2609 continue
2610 formats.append({
2611 'url': format_url,
2612 'manifest_url': xspf_url,
2613 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2614 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2615 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2616 })
8d6765cf
S
2617 self._sort_formats(formats)
2618
2619 entries.append({
2620 'id': playlist_id,
2621 'title': title,
2622 'description': description,
2623 'thumbnail': thumbnail,
2624 'duration': duration,
2625 'formats': formats,
2626 })
2627 return entries
2628
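    # Illustrative note (not part of the original source): a hypothetical extractor could
    # turn an XSPF playlist URL directly into a playlist result with the helpers above:
    #
    #     entries = self._extract_xspf_playlist(xspf_url, playlist_id, fatal=False)
    #     return self.playlist_result(entries, playlist_id)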
171e59ed
F
2629 def _extract_mpd_formats(self, *args, **kwargs):
2630 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2631 if subs:
b5ae35ee 2632 self._report_ignoring_subs('DASH')
171e59ed
F
2633 return fmts
2634
2635 def _extract_mpd_formats_and_subtitles(
2636 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2637 fatal=True, data=None, headers={}, query={}):
47a5cb77 2638 res = self._download_xml_handle(
1bac3455 2639 mpd_url, video_id,
37a3bb66 2640 note='Downloading MPD manifest' if note is None else note,
2641 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2642 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2643 if res is False:
171e59ed 2644 return [], {}
47a5cb77 2645 mpd_doc, urlh = res
c25720ef 2646 if mpd_doc is None:
171e59ed 2647 return [], {}
779da8e3
E
2648
2649 # We could have been redirected to a new url when we retrieved our mpd file.
2650 mpd_url = urlh.geturl()
2651 mpd_base_url = base_url(mpd_url)
1bac3455 2652
171e59ed 2653 return self._parse_mpd_formats_and_subtitles(
545cc85d 2654 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2655
171e59ed
F
2656 def _parse_mpd_formats(self, *args, **kwargs):
2657 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2658 if subs:
b5ae35ee 2659 self._report_ignoring_subs('DASH')
171e59ed
F
2660 return fmts
2661
2662 def _parse_mpd_formats_and_subtitles(
2663 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2664 """
2665 Parse formats from MPD manifest.
2666 References:
2667 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2668 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2669 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2670 """
a06916d9 2671 if not self.get_param('dynamic_mpd', True):
78895bd3 2672 if mpd_doc.get('type') == 'dynamic':
171e59ed 2673 return [], {}
2d2fa82d 2674
91cb6b50 2675 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2676
2677 def _add_ns(path):
2678 return self._xpath_ns(path, namespace)
2679
675d0016 2680 def is_drm_protected(element):
2681 return element.find(_add_ns('ContentProtection')) is not None
2682
1bac3455 2683 def extract_multisegment_info(element, ms_parent_info):
2684 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2685
2686 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 2687 # common attributes and elements. We will only extract what is
 2688 # relevant for us.
2689 def extract_common(source):
2690 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2691 if segment_timeline is not None:
2692 s_e = segment_timeline.findall(_add_ns('S'))
2693 if s_e:
2694 ms_info['total_number'] = 0
2695 ms_info['s'] = []
2696 for s in s_e:
2697 r = int(s.get('r', 0))
2698 ms_info['total_number'] += 1 + r
2699 ms_info['s'].append({
2700 't': int(s.get('t', 0)),
2701 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2702 'd': int(s.attrib['d']),
2703 'r': r,
2704 })
2705 start_number = source.get('startNumber')
2706 if start_number:
2707 ms_info['start_number'] = int(start_number)
2708 timescale = source.get('timescale')
2709 if timescale:
2710 ms_info['timescale'] = int(timescale)
2711 segment_duration = source.get('duration')
2712 if segment_duration:
48504785 2713 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2714
2715 def extract_Initialization(source):
2716 initialization = source.find(_add_ns('Initialization'))
2717 if initialization is not None:
2718 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2719
f14be228 2720 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2721 if segment_list is not None:
b4c1d6e8
S
2722 extract_common(segment_list)
2723 extract_Initialization(segment_list)
f14be228 2724 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2725 if segment_urls_e:
2726 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2727 else:
f14be228 2728 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2729 if segment_template is not None:
b4c1d6e8 2730 extract_common(segment_template)
e228616c
S
2731 media = segment_template.get('media')
2732 if media:
2733 ms_info['media'] = media
1bac3455 2734 initialization = segment_template.get('initialization')
2735 if initialization:
e228616c 2736 ms_info['initialization'] = initialization
1bac3455 2737 else:
b4c1d6e8 2738 extract_Initialization(segment_template)
1bac3455 2739 return ms_info
b323e170 2740
1bac3455 2741 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
6251555f 2742 formats, subtitles = [], {}
234416e4 2743 stream_numbers = collections.defaultdict(int)
f14be228 2744 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2745 period_duration = parse_duration(period.get('duration')) or mpd_duration
2746 period_ms_info = extract_multisegment_info(period, {
2747 'start_number': 1,
2748 'timescale': 1,
2749 })
f14be228 2750 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2751 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2752 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2753 representation_attrib = adaptation_set.attrib.copy()
2754 representation_attrib.update(representation.attrib)
f0948348 2755 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2756 mime_type = representation_attrib['mimeType']
171e59ed
F
2757 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2758
21633673 2759 codec_str = representation_attrib.get('codecs', '')
2760 # Some kind of binary subtitle found in some youtube livestreams
2761 if mime_type == 'application/x-rawcc':
2762 codecs = {'scodec': codec_str}
2763 else:
2764 codecs = parse_codecs(codec_str)
be2fc5b2 2765 if content_type not in ('video', 'audio', 'text'):
2766 if mime_type == 'image/jpeg':
a8731fcc 2767 content_type = mime_type
21633673 2768 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2769 content_type = 'video'
21633673 2770 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2771 content_type = 'audio'
3fe75fdc 2772 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2773 content_type = 'text'
6993f78d 2774 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2775 content_type = 'text'
cdb19aa4 2776 else:
be2fc5b2 2777 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2778 continue
2779
2780 base_url = ''
2781 for element in (representation, adaptation_set, period, mpd_doc):
2782 base_url_e = element.find(_add_ns('BaseURL'))
2783 if base_url_e is not None:
2784 base_url = base_url_e.text + base_url
2785 if re.match(r'^https?://', base_url):
2786 break
f9cc0161
D
2787 if mpd_base_url and base_url.startswith('/'):
2788 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2789 elif mpd_base_url and not re.match(r'^https?://', base_url):
2790 if not mpd_base_url.endswith('/'):
be2fc5b2 2791 mpd_base_url += '/'
2792 base_url = mpd_base_url + base_url
2793 representation_id = representation_attrib.get('id')
2794 lang = representation_attrib.get('lang')
2795 url_el = representation.find(_add_ns('BaseURL'))
2796 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2797 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2798 if representation_id is not None:
2799 format_id = representation_id
2800 else:
2801 format_id = content_type
2802 if mpd_id:
2803 format_id = mpd_id + '-' + format_id
2804 if content_type in ('video', 'audio'):
2805 f = {
2806 'format_id': format_id,
2807 'manifest_url': mpd_url,
2808 'ext': mimetype2ext(mime_type),
2809 'width': int_or_none(representation_attrib.get('width')),
2810 'height': int_or_none(representation_attrib.get('height')),
2811 'tbr': float_or_none(bandwidth, 1000),
2812 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2813 'fps': int_or_none(representation_attrib.get('frameRate')),
2814 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2815 'format_note': 'DASH %s' % content_type,
2816 'filesize': filesize,
2817 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2818 **codecs
be2fc5b2 2819 }
be2fc5b2 2820 elif content_type == 'text':
2821 f = {
2822 'ext': mimetype2ext(mime_type),
2823 'manifest_url': mpd_url,
2824 'filesize': filesize,
2825 }
2826 elif content_type == 'image/jpeg':
2827 # See test case in VikiIE
2828 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2829 f = {
2830 'format_id': format_id,
2831 'ext': 'mhtml',
2832 'manifest_url': mpd_url,
2833 'format_note': 'DASH storyboards (jpeg)',
2834 'acodec': 'none',
2835 'vcodec': 'none',
2836 }
88acdbc2 2837 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2838 f['has_drm'] = True
be2fc5b2 2839 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2840
2841 def prepare_template(template_name, identifiers):
2842 tmpl = representation_ms_info[template_name]
 2843 # First of all, % characters outside $...$ templates
2844 # must be escaped by doubling for proper processing
2845 # by % operator string formatting used further (see
2846 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2847 t = ''
2848 in_template = False
2849 for c in tmpl:
2850 t += c
2851 if c == '$':
2852 in_template = not in_template
2853 elif c == '%' and not in_template:
eca1f0d1 2854 t += c
be2fc5b2 2855 # Next, $...$ templates are translated to their
2856 # %(...) counterparts to be used with % operator
2857 if representation_id is not None:
2858 t = t.replace('$RepresentationID$', representation_id)
2859 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2860 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
 2861 t = t.replace('$$', '$')
2862 return t
2863
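                    # Worked example (illustrative, not part of the original source):
                    # a media template such as '$RepresentationID$/seg-$Number%05d$.m4s'
                    # with representation_id == 'video1' is rewritten by prepare_template()
                    # to 'video1/seg-%(Number)05d.m4s', so that
                    #     media_template % {'Number': 42, 'Bandwidth': bandwidth}
                    # yields 'video1/seg-00042.m4s'.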
2864 # @initialization is a regular template like @media one
2865 # so it should be handled just the same way (see
2866 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2867 if 'initialization' in representation_ms_info:
2868 initialization_template = prepare_template(
2869 'initialization',
2870 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2871 # $Time$ shall not be included for @initialization thus
2872 # only $Bandwidth$ remains
2873 ('Bandwidth', ))
2874 representation_ms_info['initialization_url'] = initialization_template % {
2875 'Bandwidth': bandwidth,
2876 }
2877
2878 def location_key(location):
2879 return 'url' if re.match(r'^https?://', location) else 'path'
2880
2881 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2882
2883 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2884 media_location_key = location_key(media_template)
2885
2886 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2887 # can't be used at the same time
2888 if '%(Number' in media_template and 's' not in representation_ms_info:
2889 segment_duration = None
2890 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2891 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2892 representation_ms_info['total_number'] = int(math.ceil(
2893 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2894 representation_ms_info['fragments'] = [{
2895 media_location_key: media_template % {
2896 'Number': segment_number,
2897 'Bandwidth': bandwidth,
2898 },
2899 'duration': segment_duration,
2900 } for segment_number in range(
2901 representation_ms_info['start_number'],
2902 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2903 else:
2904 # $Number*$ or $Time$ in media template with S list available
2905 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2906 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2907 representation_ms_info['fragments'] = []
2908 segment_time = 0
2909 segment_d = None
2910 segment_number = representation_ms_info['start_number']
2911
2912 def add_segment_url():
2913 segment_url = media_template % {
2914 'Time': segment_time,
2915 'Bandwidth': bandwidth,
2916 'Number': segment_number,
2917 }
2918 representation_ms_info['fragments'].append({
2919 media_location_key: segment_url,
2920 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2921 })
2922
2923 for num, s in enumerate(representation_ms_info['s']):
2924 segment_time = s.get('t') or segment_time
2925 segment_d = s['d']
2926 add_segment_url()
2927 segment_number += 1
2928 for r in range(s.get('r', 0)):
2929 segment_time += segment_d
f0948348 2930 add_segment_url()
b4c1d6e8 2931 segment_number += 1
be2fc5b2 2932 segment_time += segment_d
2933 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2934 # No media template
2935 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2936 # or any YouTube dashsegments video
2937 fragments = []
2938 segment_index = 0
2939 timescale = representation_ms_info['timescale']
2940 for s in representation_ms_info['s']:
2941 duration = float_or_none(s['d'], timescale)
2942 for r in range(s.get('r', 0) + 1):
2943 segment_uri = representation_ms_info['segment_urls'][segment_index]
2944 fragments.append({
2945 location_key(segment_uri): segment_uri,
2946 'duration': duration,
2947 })
2948 segment_index += 1
2949 representation_ms_info['fragments'] = fragments
2950 elif 'segment_urls' in representation_ms_info:
2951 # Segment URLs with no SegmentTimeline
2952 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2953 # https://github.com/ytdl-org/youtube-dl/pull/14844
2954 fragments = []
2955 segment_duration = float_or_none(
2956 representation_ms_info['segment_duration'],
2957 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2958 for segment_url in representation_ms_info['segment_urls']:
2959 fragment = {
2960 location_key(segment_url): segment_url,
2961 }
2962 if segment_duration:
2963 fragment['duration'] = segment_duration
2964 fragments.append(fragment)
2965 representation_ms_info['fragments'] = fragments
2966 # If there is a fragments key available then we correctly recognized fragmented media.
2967 # Otherwise we will assume unfragmented media with direct access. Technically, such
2968 # assumption is not necessarily correct since we may simply have no support for
2969 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2970 if 'fragments' in representation_ms_info:
2971 f.update({
2972 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2973 'url': mpd_url or base_url,
2974 'fragment_base_url': base_url,
2975 'fragments': [],
2976 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2977 })
2978 if 'initialization_url' in representation_ms_info:
2979 initialization_url = representation_ms_info['initialization_url']
2980 if not f.get('url'):
2981 f['url'] = initialization_url
2982 f['fragments'].append({location_key(initialization_url): initialization_url})
2983 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 2984 if not period_duration:
2985 period_duration = try_get(
2986 representation_ms_info,
2987 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 2988 else:
be2fc5b2 2989 # Assuming direct URL to unfragmented media.
2990 f['url'] = base_url
234416e4 2991 if content_type in ('video', 'audio', 'image/jpeg'):
2992 f['manifest_stream_number'] = stream_numbers[f['url']]
2993 stream_numbers[f['url']] += 1
be2fc5b2 2994 formats.append(f)
2995 elif content_type == 'text':
2996 subtitles.setdefault(lang or 'und', []).append(f)
2997
171e59ed 2998 return formats, subtitles
17b598d3 2999
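    # Illustrative note (not part of the original source): a typical call site in an
    # extractor, assuming "mpd_url" points at a DASH manifest:
    #
    #     formats, subtitles = self._extract_mpd_formats_and_subtitles(
    #         mpd_url, video_id, mpd_id='dash', fatal=False)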
fd76a142
F
3000 def _extract_ism_formats(self, *args, **kwargs):
3001 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3002 if subs:
b5ae35ee 3003 self._report_ignoring_subs('ISM')
fd76a142
F
3004 return fmts
3005
3006 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 3007 res = self._download_xml_handle(
b2758123 3008 ism_url, video_id,
37a3bb66 3009 note='Downloading ISM manifest' if note is None else note,
3010 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 3011 fatal=fatal, data=data, headers=headers, query=query)
b2758123 3012 if res is False:
fd76a142 3013 return [], {}
47a5cb77 3014 ism_doc, urlh = res
13b08034 3015 if ism_doc is None:
fd76a142 3016 return [], {}
b2758123 3017
fd76a142 3018 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 3019
fd76a142 3020 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
3021 """
3022 Parse formats from ISM manifest.
3023 References:
3024 1. [MS-SSTR]: Smooth Streaming Protocol,
3025 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3026 """
06869367 3027 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 3028 return [], {}
b2758123 3029
b2758123
RA
3030 duration = int(ism_doc.attrib['Duration'])
3031 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3032
3033 formats = []
fd76a142 3034 subtitles = {}
b2758123
RA
3035 for stream in ism_doc.findall('StreamIndex'):
3036 stream_type = stream.get('Type')
fd76a142 3037 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
3038 continue
3039 url_pattern = stream.attrib['Url']
3040 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3041 stream_name = stream.get('Name')
fd76a142 3042 stream_language = stream.get('Language', 'und')
b2758123 3043 for track in stream.findall('QualityLevel'):
e2efe599 3044 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
b2758123 3045 # TODO: add support for WVC1 and WMAP
66a1b864 3046 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
b2758123
RA
3047 self.report_warning('%s is not a supported codec' % fourcc)
3048 continue
3049 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
3050 # [1] does not mention Width and Height attributes. However,
3051 # they're often present while MaxWidth and MaxHeight are
 3052 # missing, so they should be used as fallbacks
3053 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3054 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
3055 sampling_rate = int_or_none(track.get('SamplingRate'))
3056
3057 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3058 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
3059
3060 fragments = []
3061 fragment_ctx = {
3062 'time': 0,
3063 }
3064 stream_fragments = stream.findall('c')
3065 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3066 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3067 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3068 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3069 if not fragment_ctx['duration']:
3070 try:
3071 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3072 except IndexError:
3073 next_fragment_time = duration
1616f9b4 3074 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
3075 for _ in range(fragment_repeat):
3076 fragments.append({
1616f9b4 3077 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
3078 'duration': fragment_ctx['duration'] / stream_timescale,
3079 })
3080 fragment_ctx['time'] += fragment_ctx['duration']
3081
fd76a142
F
3082 if stream_type == 'text':
3083 subtitles.setdefault(stream_language, []).append({
3084 'ext': 'ismt',
3085 'protocol': 'ism',
3086 'url': ism_url,
3087 'manifest_url': ism_url,
3088 'fragments': fragments,
3089 '_download_params': {
3090 'stream_type': stream_type,
3091 'duration': duration,
3092 'timescale': stream_timescale,
3093 'fourcc': fourcc,
3094 'language': stream_language,
3095 'codec_private_data': track.get('CodecPrivateData'),
3096 }
3097 })
3098 elif stream_type in ('video', 'audio'):
3099 formats.append({
34921b43 3100 'format_id': join_nonempty(ism_id, stream_name, tbr),
fd76a142
F
3101 'url': ism_url,
3102 'manifest_url': ism_url,
3103 'ext': 'ismv' if stream_type == 'video' else 'isma',
3104 'width': width,
3105 'height': height,
3106 'tbr': tbr,
3107 'asr': sampling_rate,
3108 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3109 'acodec': 'none' if stream_type == 'video' else fourcc,
3110 'protocol': 'ism',
3111 'fragments': fragments,
88acdbc2 3112 'has_drm': ism_doc.find('Protection') is not None,
fd76a142
F
3113 '_download_params': {
3114 'stream_type': stream_type,
3115 'duration': duration,
3116 'timescale': stream_timescale,
3117 'width': width or 0,
3118 'height': height or 0,
3119 'fourcc': fourcc,
3120 'language': stream_language,
3121 'codec_private_data': track.get('CodecPrivateData'),
3122 'sampling_rate': sampling_rate,
3123 'channels': int_or_none(track.get('Channels', 2)),
3124 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3125 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3126 },
3127 })
3128 return formats, subtitles
b2758123 3129
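    # Illustrative note (not part of the original source): the corresponding call for
    # Smooth Streaming, assuming "ism_url" points at an ISM manifest:
    #
    #     formats, subtitles = self._extract_ism_formats_and_subtitles(
    #         ism_url, video_id, ism_id='mss', fatal=False)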
079a7cfc 3130 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
6780154e
S
3131 def absolute_url(item_url):
3132 return urljoin(base_url, item_url)
59bbe491 3133
3134 def parse_content_type(content_type):
3135 if not content_type:
3136 return {}
3137 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3138 if ctr:
3139 mimetype, codecs = ctr.groups()
3140 f = parse_codecs(codecs)
3141 f['ext'] = mimetype2ext(mimetype)
3142 return f
3143 return {}
3144
222a2308
L
3145 def _media_formats(src, cur_media_type, type_info=None):
3146 type_info = type_info or {}
520251c0 3147 full_url = absolute_url(src)
82889d4a 3148 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 3149 if ext == 'm3u8':
520251c0
YCH
3150 is_plain_url = False
3151 formats = self._extract_m3u8_formats(
ad120ae1 3152 full_url, video_id, ext='mp4',
eeb0a956 3153 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
f983b875 3154 preference=preference, quality=quality, fatal=False)
87a449c1
S
3155 elif ext == 'mpd':
3156 is_plain_url = False
3157 formats = self._extract_mpd_formats(
b359e977 3158 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
3159 else:
3160 is_plain_url = True
3161 formats = [{
3162 'url': full_url,
3163 'vcodec': 'none' if cur_media_type == 'audio' else None,
222a2308 3164 'ext': ext,
520251c0
YCH
3165 }]
3166 return is_plain_url, formats
3167
59bbe491 3168 entries = []
4328ddf8
S
3169 # amp-video and amp-audio are very similar to their HTML5 counterparts
 3170 # so we will include them right here (see
3171 # https://www.ampproject.org/docs/reference/components/amp-video)
29f7c58a 3172 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3173 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3174 media_tags = [(media_tag, media_tag_name, media_type, '')
3175 for media_tag, media_tag_name, media_type
3176 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2aec7256
S
3177 media_tags.extend(re.findall(
3178 # We only allow video|audio followed by a whitespace or '>'.
3179 # Allowing more characters may end up in significant slow down (see
067aa17e 3180 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2aec7256 3181 # http://www.porntrex.com/maps/videositemap.xml).
29f7c58a 3182 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3183 for media_tag, _, media_type, media_content in media_tags:
59bbe491 3184 media_info = {
3185 'formats': [],
3186 'subtitles': {},
3187 }
3188 media_attributes = extract_attributes(media_tag)
f856816b 3189 src = strip_or_none(media_attributes.get('src'))
59bbe491 3190 if src:
222a2308
L
3191 f = parse_content_type(media_attributes.get('type'))
3192 _, formats = _media_formats(src, media_type, f)
520251c0 3193 media_info['formats'].extend(formats)
6780154e 3194 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 3195 if media_content:
3196 for source_tag in re.findall(r'<source[^>]+>', media_content):
d493f15c
S
3197 s_attr = extract_attributes(source_tag)
 3198 # data-video-src and data-src are non-standard but seen
3199 # several times in the wild
f856816b 3200 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
59bbe491 3201 if not src:
3202 continue
d493f15c 3203 f = parse_content_type(s_attr.get('type'))
868f79db 3204 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 3205 if is_plain_url:
d493f15c
S
 3206 # width, height, res, label and title attributes are
 3207 # all non-standard but seen several times in the wild
3208 labels = [
3209 s_attr.get(lbl)
3210 for lbl in ('label', 'title')
3211 if str_or_none(s_attr.get(lbl))
3212 ]
3213 width = int_or_none(s_attr.get('width'))
3089bc74
S
3214 height = (int_or_none(s_attr.get('height'))
3215 or int_or_none(s_attr.get('res')))
d493f15c
S
3216 if not width or not height:
3217 for lbl in labels:
3218 resolution = parse_resolution(lbl)
3219 if not resolution:
3220 continue
3221 width = width or resolution.get('width')
3222 height = height or resolution.get('height')
3223 for lbl in labels:
3224 tbr = parse_bitrate(lbl)
3225 if tbr:
3226 break
3227 else:
3228 tbr = None
1ed45499 3229 f.update({
d493f15c
S
3230 'width': width,
3231 'height': height,
3232 'tbr': tbr,
3233 'format_id': s_attr.get('label') or s_attr.get('title'),
1ed45499 3234 })
520251c0
YCH
3235 f.update(formats[0])
3236 media_info['formats'].append(f)
3237 else:
3238 media_info['formats'].extend(formats)
59bbe491 3239 for track_tag in re.findall(r'<track[^>]+>', media_content):
3240 track_attributes = extract_attributes(track_tag)
3241 kind = track_attributes.get('kind')
5968d7d2 3242 if not kind or kind in ('subtitles', 'captions'):
f856816b 3243 src = strip_or_none(track_attributes.get('src'))
59bbe491 3244 if not src:
3245 continue
3246 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3247 media_info['subtitles'].setdefault(lang, []).append({
3248 'url': absolute_url(src),
3249 })
5e8e2fa5
S
3250 for f in media_info['formats']:
3251 f.setdefault('http_headers', {})['Referer'] = base_url
5968d7d2 3252 if media_info['formats'] or media_info['subtitles']:
59bbe491 3253 entries.append(media_info)
3254 return entries
3255
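    # Illustrative note (not part of the original source): a hypothetical generic
    # extractor might scrape <video>/<audio> tags from a downloaded page like this:
    #
    #     webpage = self._download_webpage(url, video_id)
    #     entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
    #     if entries:
    #         info = entries[0]
    #         self._sort_formats(info['formats'])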
f6a1d69a
F
3256 def _extract_akamai_formats(self, *args, **kwargs):
3257 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3258 if subs:
b5ae35ee 3259 self._report_ignoring_subs('akamai')
f6a1d69a
F
3260 return fmts
3261
3262 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3263 signed = 'hdnea=' in manifest_url
3264 if not signed:
3265 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3266 manifest_url = re.sub(
3267 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3268 '', manifest_url).strip('?')
3269
c7c43a93 3270 formats = []
f6a1d69a 3271 subtitles = {}
70c5802b 3272
e71a4509 3273 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3274 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3275 hds_host = hosts.get('hds')
3276 if hds_host:
3277 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3278 if 'hdcore=' not in f4m_url:
3279 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3280 f4m_formats = self._extract_f4m_formats(
3281 f4m_url, video_id, f4m_id='hds', fatal=False)
3282 for entry in f4m_formats:
3283 entry.update({'extra_param_to_segment_url': hdcore_sign})
3284 formats.extend(f4m_formats)
70c5802b 3285
c4251b9a
RA
3286 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3287 hls_host = hosts.get('hls')
3288 if hls_host:
3289 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3290 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3291 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3292 m3u8_id='hls', fatal=False)
3293 formats.extend(m3u8_formats)
f6a1d69a 3294 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3295
3296 http_host = hosts.get('http')
29f7c58a 3297 if http_host and m3u8_formats and not signed:
3298 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3299 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3300 qualities_length = len(qualities)
29f7c58a 3301 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3302 i = 0
29f7c58a 3303 for f in m3u8_formats:
3304 if f['vcodec'] != 'none':
70c5802b 3305 for protocol in ('http', 'https'):
3306 http_f = f.copy()
3307 del http_f['manifest_url']
3308 http_url = re.sub(
86e5f3ed 3309 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3310 http_f.update({
3311 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3312 'url': http_url,
3313 'protocol': protocol,
3314 })
29f7c58a 3315 formats.append(http_f)
70c5802b 3316 i += 1
70c5802b 3317
f6a1d69a 3318 return formats, subtitles
c7c43a93 3319
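    # Illustrative note (not part of the original source): the "hosts" mapping lets a
    # caller override the HDS/HLS/HTTP hostnames; the host below is hypothetical:
    #
    #     formats, subtitles = self._extract_akamai_formats_and_subtitles(
    #         manifest_url, video_id, hosts={'http': 'example-vh.akamaihd.net'})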
6ad02195 3320 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 3321 query = compat_urlparse.urlparse(url).query
6ad02195 3322 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3323 mobj = re.search(
3324 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3325 url_base = mobj.group('url')
3326 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3327 formats = []
044eeb14
S
3328
3329 def manifest_url(manifest):
86e5f3ed 3330 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3331 if query:
3332 m_url += '?%s' % query
3333 return m_url
3334
6ad02195
RA
3335 if 'm3u8' not in skip_protocols:
3336 formats.extend(self._extract_m3u8_formats(
044eeb14 3337 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3338 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3339 if 'f4m' not in skip_protocols:
3340 formats.extend(self._extract_f4m_formats(
044eeb14 3341 manifest_url('manifest.f4m'),
6ad02195 3342 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3343 if 'dash' not in skip_protocols:
3344 formats.extend(self._extract_mpd_formats(
044eeb14 3345 manifest_url('manifest.mpd'),
0384932e 3346 video_id, mpd_id='dash', fatal=False))
6ad02195 3347 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3348 if 'smil' not in skip_protocols:
3349 rtmp_formats = self._extract_smil_formats(
044eeb14 3350 manifest_url('jwplayer.smil'),
6ad02195
RA
3351 video_id, fatal=False)
3352 for rtmp_format in rtmp_formats:
3353 rtsp_format = rtmp_format.copy()
3354 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3355 del rtsp_format['play_path']
3356 del rtsp_format['ext']
3357 rtsp_format.update({
3358 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3359 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3360 'protocol': 'rtsp',
3361 })
3362 formats.extend([rtmp_format, rtsp_format])
3363 else:
3364 for protocol in ('rtmp', 'rtsp'):
3365 if protocol not in skip_protocols:
3366 formats.append({
86e5f3ed 3367 'url': f'{protocol}:{url_base}',
6ad02195
RA
3368 'format_id': protocol,
3369 'protocol': protocol,
3370 })
3371 return formats
3372
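    # Illustrative note (not part of the original source): a sketch with a hypothetical
    # Wowza endpoint, skipping the RTMP/RTSP variants:
    #
    #     formats = self._extract_wowza_formats(
    #         'https://example.com/vod/mp4:sample.mp4/playlist.m3u8', video_id,
    #         skip_protocols=['rtmp', 'rtsp'])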
c73e330e 3373 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3374 mobj = re.search(
ac9c69ac 3375 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3376 webpage)
3377 if mobj:
c73e330e
RU
3378 try:
3379 jwplayer_data = self._parse_json(mobj.group('options'),
3380 video_id=video_id,
3381 transform_source=transform_source)
3382 except ExtractorError:
3383 pass
3384 else:
3385 if isinstance(jwplayer_data, dict):
3386 return jwplayer_data
a4a554a7
YCH
3387
3388 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3389 jwplayer_data = self._find_jwplayer_data(
3390 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3391 return self._parse_jwplayer_data(
3392 jwplayer_data, video_id, *args, **kwargs)
3393
3394 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3395 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3396 # JWPlayer backward compatibility: flattened playlists
3397 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3398 if 'playlist' not in jwplayer_data:
3399 jwplayer_data = {'playlist': [jwplayer_data]}
3400
3401 entries = []
3402
3403 # JWPlayer backward compatibility: single playlist item
3404 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3405 if not isinstance(jwplayer_data['playlist'], list):
3406 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3407
3408 for video_data in jwplayer_data['playlist']:
3409 # JWPlayer backward compatibility: flattened sources
3410 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3411 if 'sources' not in video_data:
3412 video_data['sources'] = [video_data]
3413
3414 this_video_id = video_id or video_data['mediaid']
3415
1a2192cb
S
3416 formats = self._parse_jwplayer_formats(
3417 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3418 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
3419
3420 subtitles = {}
3421 tracks = video_data.get('tracks')
3422 if tracks and isinstance(tracks, list):
3423 for track in tracks:
96a2daa1
S
3424 if not isinstance(track, dict):
3425 continue
f4b74272
S
3426 track_kind = track.get('kind')
3427 if not track_kind or not isinstance(track_kind, compat_str):
3428 continue
3429 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
3430 continue
3431 track_url = urljoin(base_url, track.get('file'))
3432 if not track_url:
3433 continue
3434 subtitles.setdefault(track.get('label') or 'en', []).append({
3435 'url': self._proto_relative_url(track_url)
3436 })
3437
50d808f5 3438 entry = {
a4a554a7 3439 'id': this_video_id,
50d808f5 3440 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
f81dd65b 3441 'description': clean_html(video_data.get('description')),
6945b9e7 3442 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
a4a554a7
YCH
3443 'timestamp': int_or_none(video_data.get('pubdate')),
3444 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3445 'subtitles': subtitles,
50d808f5
RA
3446 }
3447 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3448 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3449 entry.update({
3450 '_type': 'url_transparent',
3451 'url': formats[0]['url'],
3452 })
3453 else:
3454 self._sort_formats(formats)
3455 entry['formats'] = formats
3456 entries.append(entry)
a4a554a7
YCH
3457 if len(entries) == 1:
3458 return entries[0]
3459 else:
3460 return self.playlist_result(entries)
3461
ed0cf9b3
S
3462 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3463 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
bf1b87cd 3464 urls = []
ed0cf9b3 3465 formats = []
1a2192cb 3466 for source in jwplayer_sources_data:
0a268c6e
S
3467 if not isinstance(source, dict):
3468 continue
6945b9e7
RA
3469 source_url = urljoin(
3470 base_url, self._proto_relative_url(source.get('file')))
3471 if not source_url or source_url in urls:
bf1b87cd
RA
3472 continue
3473 urls.append(source_url)
ed0cf9b3
S
3474 source_type = source.get('type') or ''
3475 ext = mimetype2ext(source_type) or determine_ext(source_url)
3476 if source_type == 'hls' or ext == 'm3u8':
3477 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
3478 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3479 m3u8_id=m3u8_id, fatal=False))
0d9c48de 3480 elif source_type == 'dash' or ext == 'mpd':
ed0cf9b3
S
3481 formats.extend(self._extract_mpd_formats(
3482 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
3483 elif ext == 'smil':
3484 formats.extend(self._extract_smil_formats(
3485 source_url, video_id, fatal=False))
ed0cf9b3 3486 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
3487 elif source_type.startswith('audio') or ext in (
3488 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
3489 formats.append({
3490 'url': source_url,
3491 'vcodec': 'none',
3492 'ext': ext,
3493 })
3494 else:
3495 height = int_or_none(source.get('height'))
3496 if height is None:
3497 # Often no height is provided but there is a label in
0236cd0d 3498 # format like "1080p", "720p SD", or 1080.
ed0cf9b3 3499 height = int_or_none(self._search_regex(
0236cd0d 3500 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
ed0cf9b3
S
3501 'height', default=None))
3502 a_format = {
3503 'url': source_url,
3504 'width': int_or_none(source.get('width')),
3505 'height': height,
0236cd0d 3506 'tbr': int_or_none(source.get('bitrate')),
ed0cf9b3
S
3507 'ext': ext,
3508 }
3509 if source_url.startswith('rtmp'):
3510 a_format['ext'] = 'flv'
ed0cf9b3
S
3511 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3512 # of jwplayer.flash.swf
3513 rtmp_url_parts = re.split(
3514 r'((?:mp4|mp3|flv):)', source_url, 1)
3515 if len(rtmp_url_parts) == 3:
3516 rtmp_url, prefix, play_path = rtmp_url_parts
3517 a_format.update({
3518 'url': rtmp_url,
3519 'play_path': prefix + play_path,
3520 })
3521 if rtmp_params:
3522 a_format.update(rtmp_params)
3523 formats.append(a_format)
3524 return formats
3525
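    # Illustrative note (not part of the original source): extractors for pages embedding
    # JWPlayer typically go through the wrapper above:
    #
    #     info = self._extract_jwplayer_data(webpage, video_id, require_title=False)
    #
    # which returns either a single info dict or a playlist result, depending on how
    # many playlist items the setup options contain.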
f4b1c7ad 3526 def _live_title(self, name):
39ca3b5c 3527 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3528 return name
f4b1c7ad 3529
b14f3a4c
PH
3530 def _int(self, v, name, fatal=False, **kwargs):
3531 res = int_or_none(v, **kwargs)
b14f3a4c 3532 if res is None:
86e5f3ed 3533 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3534 if fatal:
3535 raise ExtractorError(msg)
3536 else:
6a39ee13 3537 self.report_warning(msg)
b14f3a4c
PH
3538 return res
3539
3540 def _float(self, v, name, fatal=False, **kwargs):
3541 res = float_or_none(v, **kwargs)
3542 if res is None:
86e5f3ed 3543 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3544 if fatal:
3545 raise ExtractorError(msg)
3546 else:
6a39ee13 3547 self.report_warning(msg)
b14f3a4c
PH
3548 return res
3549
40e41780
TF
3550 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3551 path='/', secure=False, discard=False, rest={}, **kwargs):
6c22cee6 3552 cookie = compat_cookiejar_Cookie(
4ed2d7b7 3553 0, name, value, port, port is not None, domain, True,
40e41780
TF
3554 domain.startswith('.'), path, True, secure, expire_time,
3555 discard, None, None, rest)
42939b61
JMF
3556 self._downloader.cookiejar.set_cookie(cookie)
3557
799207e8 3558 def _get_cookies(self, url):
f7ad7160 3559 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
c487cf00 3560 return compat_cookies_SimpleCookie(self._downloader._calc_cookies(url))
799207e8 3561
e3c1266f 3562 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3563 """
3564 Apply first Set-Cookie header instead of the last. Experimental.
3565
 3566 Some sites (e.g. [1-3]) may serve two cookies under the same name
 3567 in the Set-Cookie header and expect the first (old) one to be set
 3568 rather than the second (new) one. However, as per RFC 6265, the newer
 3569 cookie should be set into the cookie store, which is what actually happens.
 3570 We work around this issue by resetting the cookie to
 3571 the first one manually.
3572 1. https://new.vk.com/
3573 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3574 3. https://learning.oreilly.com/
3575 """
e3c1266f
S
3576 for header, cookies in url_handle.headers.items():
3577 if header.lower() != 'set-cookie':
3578 continue
cfb0511d 3579 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3580 cookie_value = re.search(
3581 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3582 if cookie_value:
3583 value, domain = cookie_value.groups()
3584 self._set_cookie(domain, cookie, value)
3585 break
3586
82d02080 3587 @classmethod
3588 def get_testcases(cls, include_onlymatching=False):
3589 t = getattr(cls, '_TEST', None)
05900629 3590 if t:
82d02080 3591 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
05900629
PH
3592 tests = [t]
3593 else:
82d02080 3594 tests = getattr(cls, '_TESTS', [])
05900629
PH
3595 for t in tests:
3596 if not include_onlymatching and t.get('only_matching', False):
3597 continue
82d02080 3598 t['name'] = cls.ie_key()
05900629
PH
3599 yield t
3600
24146491 3601 @classproperty
3602 def age_limit(cls):
3603 """Get age limit from the testcases"""
3604 return max(traverse_obj(
3605 tuple(cls.get_testcases(include_onlymatching=False)),
3606 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3607
82d02080 3608 @classmethod
3609 def is_suitable(cls, age_limit):
24146491 3610 """Test whether the extractor is generally suitable for the given age limit"""
3611 return not age_restricted(cls.age_limit, age_limit)
05900629 3612
82d02080 3613 @classmethod
3614 def description(cls, *, markdown=True, search_examples=None):
8dcce6a8 3615 """Description of the extractor"""
3616 desc = ''
82d02080 3617 if cls._NETRC_MACHINE:
8dcce6a8 3618 if markdown:
82d02080 3619 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
8dcce6a8 3620 else:
82d02080 3621 desc += f' [{cls._NETRC_MACHINE}]'
3622 if cls.IE_DESC is False:
8dcce6a8 3623 desc += ' [HIDDEN]'
82d02080 3624 elif cls.IE_DESC:
3625 desc += f' {cls.IE_DESC}'
3626 if cls.SEARCH_KEY:
3627 desc += f'; "{cls.SEARCH_KEY}:" prefix'
8dcce6a8 3628 if search_examples:
3629 _COUNTS = ('', '5', '10', 'all')
82d02080 3630 desc += f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3631 if not cls.working():
8dcce6a8 3632 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3633
82d02080 3634 name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
8dcce6a8 3635 return f'{name}:{desc}' if desc else name
3636
a504ced0 3637 def extract_subtitles(self, *args, **kwargs):
a06916d9 3638 if (self.get_param('writesubtitles', False)
3639 or self.get_param('listsubtitles')):
9868ea49
JMF
3640 return self._get_subtitles(*args, **kwargs)
3641 return {}
a504ced0
JMF
3642
3643 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3644 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3645
a2160aa4 3646 def extract_comments(self, *args, **kwargs):
3647 if not self.get_param('getcomments'):
3648 return None
3649 generator = self._get_comments(*args, **kwargs)
3650
3651 def extractor():
3652 comments = []
d2b2fca5 3653 interrupted = True
a2160aa4 3654 try:
3655 while True:
3656 comments.append(next(generator))
a2160aa4 3657 except StopIteration:
3658 interrupted = False
d2b2fca5 3659 except KeyboardInterrupt:
3660 self.to_screen('Interrupted by user')
3661 except Exception as e:
3662 if self.get_param('ignoreerrors') is not True:
3663 raise
3664 self._downloader.report_error(e)
a2160aa4 3665 comment_count = len(comments)
3666 self.to_screen(f'Extracted {comment_count} comments')
3667 return {
3668 'comments': comments,
3669 'comment_count': None if interrupted else comment_count
3670 }
3671 return extractor
3672
3673 def _get_comments(self, *args, **kwargs):
3674 raise NotImplementedError('This method must be implemented by subclasses')
3675
912e0b7e
YCH
3676 @staticmethod
3677 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3678 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3679 will be dropped. """
86e5f3ed 3680 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3681 ret = list(subtitle_list1)
a44ca5a4 3682 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3683 return ret
3684
3685 @classmethod
46890374 3686 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3687 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3688 if target is None:
3689 target = {}
3690 for d in dicts:
3691 for lang, subs in d.items():
3692 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3693 return target
912e0b7e 3694
360e1ca5 3695 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3696 if (self.get_param('writeautomaticsub', False)
3697 or self.get_param('listsubtitles')):
9868ea49
JMF
3698 return self._get_automatic_captions(*args, **kwargs)
3699 return {}
360e1ca5
JMF
3700
3701 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3702 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3703
2762dbb1 3704 @functools.cached_property
24146491 3705 def _cookies_passed(self):
3706 """Whether cookies have been passed to YoutubeDL"""
3707 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3708
d77ab8e2 3709 def mark_watched(self, *args, **kwargs):
1813a6cc 3710 if not self.get_param('mark_watched', False):
3711 return
24146491 3712 if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
d77ab8e2
S
3713 self._mark_watched(*args, **kwargs)
3714
3715 def _mark_watched(self, *args, **kwargs):
3716 raise NotImplementedError('This method must be implemented by subclasses')
3717
38cce791
YCH
3718 def geo_verification_headers(self):
3719 headers = {}
a06916d9 3720 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3721 if geo_verification_proxy:
3722 headers['Ytdl-request-proxy'] = geo_verification_proxy
3723 return headers
3724
98763ee3
YCH
3725 def _generic_id(self, url):
3726 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3727
3728 def _generic_title(self, url):
3729 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3730
c224251a 3731 @staticmethod
b0089e89 3732 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3733 all_known = all(map(
3734 lambda x: x is not None,
3735 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3736 return (
3737 'private' if is_private
3738 else 'premium_only' if needs_premium
3739 else 'subscriber_only' if needs_subscription
3740 else 'needs_auth' if needs_auth
3741 else 'unlisted' if is_unlisted
3742 else 'public' if all_known
3743 else None)
3744
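    # Illustrative note (not part of the original source): when all flags are known,
    # the helper above collapses them into a single availability string, e.g.
    #
    #     self._availability(is_private=False, needs_premium=False,
    #                        needs_subscription=False, needs_auth=False,
    #                        is_unlisted=False)  # -> 'public'
    #     self._availability(is_private=False, needs_premium=False,
    #                        needs_subscription=False, needs_auth=False,
    #                        is_unlisted=True)   # -> 'unlisted'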
d43de682 3745 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
4bb6b02f 3746 '''
3747 @returns A list of values for the extractor argument given by "key"
3748 or "default" if no such key is present
3749 @param default The default value to return when the key is not present (default: [])
3750 @param casesense When false, the values are converted to lower case
3751 '''
3752 val = traverse_obj(
d43de682 3753 self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
4bb6b02f 3754 if val is None:
3755 return [] if default is NO_DEFAULT else default
3756 return list(val) if casesense else [x.lower() for x in val]
5d3a0e79 3757
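    # Illustrative note (not part of the original source): with a command line such as
    # `--extractor-args "youtube:player_client=android,web"`, a YouTube extractor could
    # read the values back via
    #
    #     self._configuration_arg('player_client', ie_key='youtube')  # -> ['android', 'web']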
f40ee5e9 3758 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3759 if not playlist_id or not video_id:
3760 return not video_id
3761
3762 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3763 if no_playlist is not None:
3764 return not no_playlist
3765
3766 video_id = '' if video_id is True else f' {video_id}'
3767 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3768 if self.get_param('noplaylist'):
3769 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3770 return False
3771 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3772 return True
3773
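    # Illustrative note (not part of the original source): a sketch of the usual pattern
    # in extractors that can yield either a single video or a whole playlist
    # ("entries", "video_url" and "SomeVideoIE" are hypothetical):
    #
    #     if self._yes_playlist(playlist_id, video_id, smuggled_data):
    #         return self.playlist_result(entries, playlist_id)
    #     return self.url_result(video_url, ie=SomeVideoIE.ie_key(), video_id=video_id)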
8dbe9899 3774
d6983cb4
PH
3775class SearchInfoExtractor(InfoExtractor):
3776 """
3777 Base class for paged search queries extractors.
10952eb2 3778 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
96565c7e 3779 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
d6983cb4
PH
3780 """
3781
96565c7e 3782 _MAX_RESULTS = float('inf')
3783
d6983cb4
PH
3784 @classmethod
3785 def _make_valid_url(cls):
3786 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3787
d6983cb4 3788 def _real_extract(self, query):
2c4aaadd 3789 prefix, query = self._match_valid_url(query).group('prefix', 'query')
d6983cb4
PH
3790 if prefix == '':
3791 return self._get_n_results(query, 1)
3792 elif prefix == 'all':
3793 return self._get_n_results(query, self._MAX_RESULTS)
3794 else:
3795 n = int(prefix)
3796 if n <= 0:
86e5f3ed 3797 raise ExtractorError(f'invalid download number {n} for query "{query}"')
d6983cb4 3798 elif n > self._MAX_RESULTS:
6a39ee13 3799 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
3800 n = self._MAX_RESULTS
3801 return self._get_n_results(query, n)
3802
3803 def _get_n_results(self, query, n):
cc16383f 3804 """Get a specified number of results for a query.
3805 Either this function or _search_results must be overridden by subclasses """
3806 return self.playlist_result(
3807 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3808 query, query)
3809
3810 def _search_results(self, query):
3811 """Returns an iterator of search results"""
611c1dd9 3812 raise NotImplementedError('This method must be implemented by subclasses')
0f818663 3813
82d02080 3814 @classproperty
3815 def SEARCH_KEY(cls):
3816 return cls._SEARCH_KEY
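
# Illustrative note (not part of the original source): a minimal sketch of a search
# extractor built on the base class above. "ExampleSearchIE" and its endpoint are
# hypothetical:
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         IE_NAME = 'example:search'
#         _SEARCH_KEY = 'examplesearch'
#
#         def _search_results(self, query):
#             for page in itertools.count(1):
#                 data = self._download_json(
#                     'https://example.com/api/search', query,
#                     query={'q': query, 'page': page}, fatal=False) or {}
#                 for item in data.get('items') or []:
#                     yield self.url_result(item['url'])
#                 if not data.get('has_more'):
#                     break
#
# With this in place, URLs such as "examplesearch5:some terms" would download the
# first five results.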