# yt_dlp/extractor/common.py
import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    sanitized_Request,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args      Extra arguments for ffmpeg downloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

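                    As a purely illustrative sketch (every value below is made
                    up), a single progressive-HTTP entry in "formats" might
                    look roughly like:

                        {
                            'format_id': '720p',
                            'url': 'https://example.com/video-720.mp4',
                            'ext': 'mp4',
                            'width': 1280,
                            'height': 720,
                            'vcodec': 'avc1.640028',
                            'acodec': 'mp4a.40.2',
                            'tbr': 1500,
                        }
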
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


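    Purely as an illustration (none of these values come from a real site), a
    minimal "playlist" result whose entries are "url" results could look like:

        {
            '_type': 'playlist',
            'id': 'example-playlist',
            'title': 'Example playlist',
            'entries': [
                {'_type': 'url', 'url': 'https://example.com/video/1', 'ie_key': 'Generic'},
                {'_type': 'url', 'url': 'https://example.com/video/2', 'ie_key': 'Generic'},
            ],
        }
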
    Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp and re-define the _real_extract() and
    (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    The _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    The _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by the
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    The _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
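
    As a purely illustrative sketch (the site, URL pattern and regexes below are
    made up), a minimal subclass following these conventions could look like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_search_regex(
                        r'<h1[^>]*>([^<]+)</h1>', webpage, 'title'),
                    'url': self._search_regex(
                        r'data-video-url="([^"]+)"', webpage, 'video URL'),
                }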
497 """
498
499 _ready = False
500 _downloader = None
773f291d 501 _x_forwarded_for_ip = None
4248dad9
S
502 _GEO_BYPASS = True
503 _GEO_COUNTRIES = None
5f95927a 504 _GEO_IP_BLOCKS = None
d6983cb4 505 _WORKING = True
fe7866d0 506 _ENABLED = True
52efa4b3 507 _NETRC_MACHINE = None
231025c4 508 IE_DESC = None
8dcce6a8 509 SEARCH_KEY = None
8f97a15d 510 _VALID_URL = None
511 _EMBED_REGEX = []
d6983cb4 512
8dcce6a8 513 def _login_hint(self, method=NO_DEFAULT, netrc=None):
514 password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
515 return {
516 None: '',
517 'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
518 'password': f'Use {password_hint}',
519 'cookies': (
520 'Use --cookies-from-browser or --cookies for the authentication. '
17ffed18 521 'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
8dcce6a8 522 }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']
9d5d4d64 523
d6983cb4 524 def __init__(self, downloader=None):
49a57e70 525 """Constructor. Receives an optional downloader (a YoutubeDL instance).
526 If a downloader is not passed during initialization,
527 it must be set using "set_downloader()" before "extract()" is called"""
d6983cb4 528 self._ready = False
773f291d 529 self._x_forwarded_for_ip = None
28f436ba 530 self._printed_messages = set()
d6983cb4
PH
531 self.set_downloader(downloader)
532
533 @classmethod
5ad28e7f 534 def _match_valid_url(cls, url):
8f97a15d 535 if cls._VALID_URL is False:
536 return None
79cb2577
PH
537 # This does not use has/getattr intentionally - we want to know whether
538 # we have cached the regexp for *this* class, whereas getattr would also
539 # match the superclass
540 if '_VALID_URL_RE' not in cls.__dict__:
541 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 542 return cls._VALID_URL_RE.match(url)
543
544 @classmethod
545 def suitable(cls, url):
546 """Receives a URL and returns True if suitable for this IE."""
3fb4e21b 547 # This function must import everything it needs (except other extractors),
548 # so that lazy_extractors works correctly
5ad28e7f 549 return cls._match_valid_url(url) is not None
d6983cb4 550
ed9266db
PH
551 @classmethod
552 def _match_id(cls, url):
5ad28e7f 553 return cls._match_valid_url(url).group('id')
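        # For example (illustrative pattern): in a subclass with
        #     _VALID_URL = r'https?://example\.com/v/(?P<id>\w+)'
        # _match_id('https://example.com/v/abc123') returns 'abc123'.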

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize the geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on faking
        the X-Forwarded-For HTTP header. A random country from the provided country
        list is selected and a random IP belonging to this country is generated. This
        IP will be passed as the X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for the initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first argument.
        It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

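        For illustration, an extractor that only learns of the restriction during
        extraction might call (the country codes here are arbitrary examples):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })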
e39b5d4a 614 """
773f291d 615 if not self._x_forwarded_for_ip:
5f95927a
S
616
617 # Geo bypass mechanism is explicitly disabled by user
a06916d9 618 if not self.get_param('geo_bypass', True):
5f95927a
S
619 return
620
621 if not geo_bypass_context:
622 geo_bypass_context = {}
623
624 # Backward compatibility: previously _initialize_geo_bypass
625 # expected a list of countries, some 3rd party code may still use
626 # it this way
627 if isinstance(geo_bypass_context, (list, tuple)):
628 geo_bypass_context = {
629 'countries': geo_bypass_context,
630 }
631
632 # The whole point of geo bypass mechanism is to fake IP
633 # as X-Forwarded-For HTTP header based on some IP block or
634 # country code.
635
636 # Path 1: bypassing based on IP block in CIDR notation
637
638 # Explicit IP block specified by user, use it right away
639 # regardless of whether extractor is geo bypassable or not
a06916d9 640 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
641
642 # Otherwise use random IP block from geo bypass context but only
643 # if extractor is known as geo bypassable
644 if not ip_block:
645 ip_blocks = geo_bypass_context.get('ip_blocks')
646 if self._GEO_BYPASS and ip_blocks:
647 ip_block = random.choice(ip_blocks)
648
649 if ip_block:
650 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 651 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
5f95927a
S
652 return
653
654 # Path 2: bypassing based on country code
655
656 # Explicit country code specified by user, use it right away
657 # regardless of whether extractor is geo bypassable or not
a06916d9 658 country = self.get_param('geo_bypass_country', None)
5f95927a
S
659
660 # Otherwise use random country code from geo bypass context but
661 # only if extractor is known as geo bypassable
662 if not country:
663 countries = geo_bypass_context.get('countries')
664 if self._GEO_BYPASS and countries:
665 country = random.choice(countries)
666
667 if country:
668 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 669 self._downloader.write_debug(
86e5f3ed 670 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
671
672 def extract(self, url):
673 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 674 try:
773f291d
S
675 for _ in range(2):
676 try:
677 self.initialize()
71df9b7f 678 self.to_screen('Extracting URL: %s' % (
679 url if self.get_param('verbose') else truncate_string(url, 100, 20)))
0016b84e 680 ie_result = self._real_extract(url)
07cce701 681 if ie_result is None:
682 return None
0016b84e
S
683 if self._x_forwarded_for_ip:
684 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
b79f9e30 685 subtitles = ie_result.get('subtitles') or {}
686 if 'no-live-chat' in self.get_param('compat_opts'):
687 for lang in ('live_chat', 'comments', 'danmaku'):
688 subtitles.pop(lang, None)
0016b84e 689 return ie_result
773f291d 690 except GeoRestrictedError as e:
4248dad9
S
691 if self.__maybe_fake_ip_and_retry(e.countries):
692 continue
773f291d 693 raise
0db3bae8 694 except UnsupportedError:
695 raise
1151c407 696 except ExtractorError as e:
9bcfe33b 697 e.video_id = e.video_id or self.get_temp_id(url),
698 e.ie = e.ie or self.IE_NAME,
699 e.traceback = e.traceback or sys.exc_info()[2]
700 raise
ac668111 701 except http.client.IncompleteRead as e:
1151c407 702 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
9650885b 703 except (KeyError, StopIteration) as e:
1151c407 704 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 705
4248dad9 706 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 707 if (not self.get_param('geo_bypass_country', None)
3089bc74 708 and self._GEO_BYPASS
a06916d9 709 and self.get_param('geo_bypass', True)
3089bc74
S
710 and not self._x_forwarded_for_ip
711 and countries):
eea0716c
S
712 country_code = random.choice(countries)
713 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
714 if self._x_forwarded_for_ip:
715 self.report_warning(
eea0716c
S
716 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
717 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
718 return True
719 return False
720
d6983cb4 721 def set_downloader(self, downloader):
08d30158 722 """Sets a YoutubeDL instance as the downloader for this IE."""
d6983cb4
PH
723 self._downloader = downloader
724
9809740b 725 @property
726 def cache(self):
727 return self._downloader.cache
728
729 @property
730 def cookiejar(self):
731 return self._downloader.cookiejar
732
52efa4b3 733 def _initialize_pre_login(self):
962ffcf8 734 """ Initialization before login. Redefine in subclasses."""
52efa4b3 735 pass
736
737 def _perform_login(self, username, password):
738 """ Login with username and password. Redefine in subclasses."""
739 pass
740
d6983cb4
PH
741 def _real_initialize(self):
742 """Real initialization process. Redefine in subclasses."""
743 pass
744
745 def _real_extract(self, url):
746 """Real extraction process. Redefine in subclasses."""
08d30158 747 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 748
56c73665
JMF
749 @classmethod
750 def ie_key(cls):
751 """A string for getting the InfoExtractor with get_info_extractor"""
3fb4e21b 752 return cls.__name__[:-2]
56c73665 753
82d02080 754 @classproperty
755 def IE_NAME(cls):
756 return cls.__name__[:-2]
d6983cb4 757
d391b7e2
S
758 @staticmethod
759 def __can_accept_status_code(err, expected_status):
ac668111 760 assert isinstance(err, urllib.error.HTTPError)
d391b7e2
S
761 if expected_status is None:
762 return False
d391b7e2
S
763 elif callable(expected_status):
764 return expected_status(err.code) is True
765 else:
6606817a 766 return err.code in variadic(expected_status)
d391b7e2 767
c043c246 768 def _create_request(self, url_or_request, data=None, headers=None, query=None):
ac668111 769 if isinstance(url_or_request, urllib.request.Request):
09d02ea4 770 return update_Request(url_or_request, data=data, headers=headers, query=query)
771 if query:
772 url_or_request = update_url_query(url_or_request, query)
c043c246 773 return sanitized_Request(url_or_request, data, headers or {})
f95b9dee 774
c043c246 775 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
d391b7e2
S
776 """
777 Return the response handle.
778
779 See _download_webpage docstring for arguments specification.
780 """
1cf376f5 781 if not self._downloader._first_webpage_request:
49a57e70 782 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 783 if sleep_interval > 0:
5ef7d9bd 784 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 785 time.sleep(sleep_interval)
786 else:
787 self._downloader._first_webpage_request = False
788
d6983cb4
PH
789 if note is None:
790 self.report_download_webpage(video_id)
791 elif note is not False:
7cc3570e 792 if video_id is None:
86e5f3ed 793 self.to_screen(str(note))
7cc3570e 794 else:
86e5f3ed 795 self.to_screen(f'{video_id}: {note}')
2132edaa
S
796
797 # Some sites check X-Forwarded-For HTTP header in order to figure out
798 # the origin of the client behind proxy. This allows bypassing geo
799 # restriction by faking this header's value to IP that belongs to some
800 # geo unrestricted country. We will do so once we encounter any
801 # geo restriction error.
802 if self._x_forwarded_for_ip:
c043c246 803 headers = (headers or {}).copy()
804 headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
2132edaa 805
d6983cb4 806 try:
f95b9dee 807 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 808 except network_exceptions as err:
ac668111 809 if isinstance(err, urllib.error.HTTPError):
d391b7e2 810 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
811 # Retain reference to error to prevent file object from
812 # being closed before it can be read. Works around the
813 # effects of <https://bugs.python.org/issue15002>
814 # introduced in Python 3.4.1.
815 err.fp._error = err
d391b7e2
S
816 return err.fp
817
aa94a6d3
PH
818 if errnote is False:
819 return False
d6983cb4 820 if errnote is None:
f1a9d64e 821 errnote = 'Unable to download webpage'
7f8b2714 822
86e5f3ed 823 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 824 if fatal:
497d2fab 825 raise ExtractorError(errmsg, cause=err)
7cc3570e 826 else:
6a39ee13 827 self.report_warning(errmsg)
7cc3570e 828 return False
d6983cb4 829
1890fc63 830 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
831 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2
S
832 """
833 Return a tuple (page content as string, URL handle).
834
617f658b 835 Arguments:
836 url_or_request -- plain text URL as a string or
ac668111 837 a urllib.request.Request object
617f658b 838 video_id -- Video/playlist/item identifier (string)
839
840 Keyword arguments:
841 note -- note printed before downloading (string)
842 errnote -- note printed in case of an error (string)
843 fatal -- flag denoting whether error should be considered fatal,
844 i.e. whether it should cause ExtractionError to be raised,
845 otherwise a warning will be reported and extraction continued
846 encoding -- encoding for a page content decoding, guessed automatically
847 when not explicitly specified
848 data -- POST data (bytes)
849 headers -- HTTP headers (dict)
850 query -- URL query (dict)
851 expected_status -- allows to accept failed HTTP requests (non 2xx
852 status code) by explicitly specifying a set of accepted status
853 codes. Can be any of the following entities:
854 - an integer type specifying an exact failed status code to
855 accept
856 - a list or a tuple of integer types specifying a list of
857 failed status codes to accept
858 - a callable accepting an actual failed status code and
859 returning True if it should be accepted
860 Note that this argument does not affect success status codes (2xx)
861 which are always accepted.
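
        For illustration, a page where a 404 response still carries a useful body
        could be fetched with (the values here are hypothetical):
            webpage, urlh = self._download_webpage_handle(
                url, video_id, expected_status=404)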
d391b7e2 862 """
617f658b 863
b9d3e163 864 # Strip hashes from the URL (#1038)
14f25df2 865 if isinstance(url_or_request, str):
b9d3e163
PH
866 url_or_request = url_or_request.partition('#')[0]
867
d391b7e2 868 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
869 if urlh is False:
870 assert not fatal
871 return False
c9a77969 872 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
873 return (content, urlh)
874
c9a77969
YCH
875 @staticmethod
876 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
877 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
878 if m:
879 encoding = m.group(1)
880 else:
0d75ae2c 881 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
882 webpage_bytes[:1024])
883 if m:
884 encoding = m.group(1).decode('ascii')
b60016e8
PH
885 elif webpage_bytes.startswith(b'\xff\xfe'):
886 encoding = 'utf-16'
f143d86a
PH
887 else:
888 encoding = 'utf-8'
c9a77969
YCH
889
890 return encoding
891
4457823d
S
892 def __check_blocked(self, content):
893 first_block = content[:512]
3089bc74
S
894 if ('<title>Access to this site is blocked</title>' in content
895 and 'Websense' in first_block):
4457823d
S
896 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
897 blocked_iframe = self._html_search_regex(
898 r'<iframe src="([^"]+)"', content,
899 'Websense information URL', default=None)
900 if blocked_iframe:
901 msg += ' Visit %s for more details' % blocked_iframe
902 raise ExtractorError(msg, expected=True)
903 if '<title>The URL you requested has been blocked</title>' in first_block:
904 msg = (
905 'Access to this webpage has been blocked by Indian censorship. '
906 'Use a VPN or proxy server (with --proxy) to route around it.')
907 block_msg = self._html_search_regex(
908 r'</h1><p>(.*?)</p>',
909 content, 'block message', default=None)
910 if block_msg:
911 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
912 raise ExtractorError(msg, expected=True)
3089bc74
S
913 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
914 and 'blocklist.rkn.gov.ru' in content):
4457823d
S
915 raise ExtractorError(
916 'Access to this webpage has been blocked by decision of the Russian government. '
917 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
918 expected=True)
919
f95b9dee 920 def _request_dump_filename(self, url, video_id):
921 basen = f'{video_id}_{url}'
922 trim_length = self.get_param('trim_file_name') or 240
923 if len(basen) > trim_length:
924 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
925 basen = basen[:trim_length - len(h)] + h
926 filename = sanitize_filename(f'{basen}.dump', restricted=True)
927 # Working around MAX_PATH limitation on Windows (see
928 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
929 if compat_os_name == 'nt':
930 absfilepath = os.path.abspath(filename)
931 if len(absfilepath) > 259:
932 filename = fR'\\?\{absfilepath}'
933 return filename
934
935 def __decode_webpage(self, webpage_bytes, encoding, headers):
936 if not encoding:
937 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
938 try:
939 return webpage_bytes.decode(encoding, 'replace')
940 except LookupError:
941 return webpage_bytes.decode('utf-8', 'replace')
942
c9a77969 943 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
c9a77969
YCH
944 webpage_bytes = urlh.read()
945 if prefix is not None:
946 webpage_bytes = prefix + webpage_bytes
a06916d9 947 if self.get_param('dump_intermediate_pages', False):
f610dbb0 948 self.to_screen('Dumping request to ' + urlh.geturl())
d6983cb4
PH
949 dump = base64.b64encode(webpage_bytes).decode('ascii')
950 self._downloader.to_screen(dump)
f95b9dee 951 if self.get_param('write_pages'):
e121e3ce 952 filename = self._request_dump_filename(urlh.geturl(), video_id)
f95b9dee 953 self.to_screen(f'Saving request to {filename}')
d41e6efc
PH
954 with open(filename, 'wb') as outf:
955 outf.write(webpage_bytes)
956
f95b9dee 957 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
4457823d 958 self.__check_blocked(content)
2410c43d 959
23be51d8 960 return content
d6983cb4 961
6edf2808 962 def __print_error(self, errnote, fatal, video_id, err):
963 if fatal:
c6e07cf1 964 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 965 elif errnote:
c6e07cf1 966 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 967
968 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
e2b38da9
PH
969 if transform_source:
970 xml_string = transform_source(xml_string)
e01c3d2e
S
971 try:
972 return compat_etree_fromstring(xml_string.encode('utf-8'))
f9934b96 973 except xml.etree.ElementTree.ParseError as ve:
6edf2808 974 self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
267ed0c5 975
6edf2808 976 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
3d3538e4 977 try:
b7c47b74 978 return json.loads(
979 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
3d3538e4 980 except ValueError as ve:
6edf2808 981 self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
3d3538e4 982
6edf2808 983 def _parse_socket_response_as_json(self, data, *args, **kwargs):
984 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 985
617f658b 986 def __create_download_methods(name, parser, note, errnote, return_value):
987
6edf2808 988 def parse(ie, content, *args, errnote=errnote, **kwargs):
617f658b 989 if parser is None:
990 return content
6edf2808 991 if errnote is False:
992 kwargs['errnote'] = errnote
617f658b 993 # parser is fetched by name so subclasses can override it
994 return getattr(ie, parser)(content, *args, **kwargs)
995
c4910024 996 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
997 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
998 res = self._download_webpage_handle(
999 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1000 data=data, headers=headers, query=query, expected_status=expected_status)
617f658b 1001 if res is False:
1002 return res
1003 content, urlh = res
6edf2808 1004 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
617f658b 1005
f95b9dee 1006 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
c4910024 1007 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
f95b9dee 1008 if self.get_param('load_pages'):
1009 url_or_request = self._create_request(url_or_request, data, headers, query)
1010 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1011 self.to_screen(f'Loading request from {filename}')
1012 try:
1013 with open(filename, 'rb') as dumpf:
1014 webpage_bytes = dumpf.read()
1015 except OSError as e:
1016 self.report_warning(f'Unable to load request from disk: {e}')
1017 else:
1018 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
6edf2808 1019 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
c4910024 1020 kwargs = {
1021 'note': note,
1022 'errnote': errnote,
1023 'transform_source': transform_source,
1024 'fatal': fatal,
1025 'encoding': encoding,
1026 'data': data,
1027 'headers': headers,
1028 'query': query,
1029 'expected_status': expected_status,
1030 }
617f658b 1031 if parser is None:
c4910024 1032 kwargs.pop('transform_source')
617f658b 1033 # The method is fetched by name so subclasses can override _download_..._handle
c4910024 1034 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
617f658b 1035 return res if res is False else res[0]
1036
1037 def impersonate(func, name, return_value):
1038 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1039 func.__doc__ = f'''
1040 @param transform_source Apply this transformation before parsing
1041 @returns {return_value}
1042
1043 See _download_webpage_handle docstring for other arguments specification
1044 '''
1045
1046 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1047 impersonate(download_content, f'_download_{name}', f'{return_value}')
1048 return download_handle, download_content
1049
1050 _download_xml_handle, _download_xml = __create_download_methods(
1051 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1052 _download_json_handle, _download_json = __create_download_methods(
1053 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1054 _download_socket_json_handle, _download_socket_json = __create_download_methods(
1055 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1056 __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1057
617f658b 1058 def _download_webpage(
1059 self, url_or_request, video_id, note=None, errnote=None,
1060 fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
adddc50c 1061 """
617f658b 1062 Return the data of the page as a string.
adddc50c 1063
617f658b 1064 Keyword arguments:
1065 tries -- number of tries
1066 timeout -- sleep interval between tries
1067
1068 See _download_webpage_handle docstring for other arguments specification.
adddc50c 1069 """
617f658b 1070
1071 R''' # NB: These are unused; should they be deprecated?
1072 if tries != 1:
1073 self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1074 if timeout is NO_DEFAULT:
1075 timeout = 5
1076 else:
1077 self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1078 '''
1079
1080 try_count = 0
1081 while True:
1082 try:
1083 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
ac668111 1084 except http.client.IncompleteRead as e:
617f658b 1085 try_count += 1
1086 if try_count >= tries:
1087 raise e
1088 self._sleep(timeout, video_id)
adddc50c 1089
28f436ba 1090 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
a70635b8 1091 idstr = format_field(video_id, None, '%s: ')
28f436ba 1092 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1093 if only_once:
1094 if f'WARNING: {msg}' in self._printed_messages:
1095 return
1096 self._printed_messages.add(f'WARNING: {msg}')
1097 self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1098
a06916d9 1099 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1100 """Print msg to screen, prefixing it with '[ie_name]'"""
86e5f3ed 1101 self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1102
1103 def write_debug(self, msg, *args, **kwargs):
86e5f3ed 1104 self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1105
1106 def get_param(self, name, default=None, *args, **kwargs):
1107 if self._downloader:
1108 return self._downloader.params.get(name, default, *args, **kwargs)
1109 return default
d6983cb4 1110
d5d1df8a 1111 def report_drm(self, video_id, partial=NO_DEFAULT):
1112 if partial is not NO_DEFAULT:
1113 self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
88acdbc2 1114 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1115
d6983cb4
PH
1116 def report_extraction(self, id_or_name):
1117 """Report information extraction."""
f1a9d64e 1118 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1119
1120 def report_download_webpage(self, video_id):
1121 """Report webpage download."""
f1a9d64e 1122 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1123
1124 def report_age_confirmation(self):
1125 """Report attempt to confirm age."""
f1a9d64e 1126 self.to_screen('Confirming age')
d6983cb4 1127
fc79158d
JMF
1128 def report_login(self):
1129 """Report attempt to log in."""
f1a9d64e 1130 self.to_screen('Logging in')
fc79158d 1131
b7da73eb 1132 def raise_login_required(
9d5d4d64 1133 self, msg='This video is only available for registered users',
52efa4b3 1134 metadata_available=False, method=NO_DEFAULT):
f2ebc5c7 1135 if metadata_available and (
1136 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1137 self.report_warning(msg)
7265a219 1138 return
a70635b8 1139 msg += format_field(self._login_hint(method), None, '. %s')
46890374 1140 raise ExtractorError(msg, expected=True)
43e7d3c9 1141
b7da73eb 1142 def raise_geo_restricted(
1143 self, msg='This video is not available from your location due to geo restriction',
1144 countries=None, metadata_available=False):
f2ebc5c7 1145 if metadata_available and (
1146 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1147 self.report_warning(msg)
1148 else:
1149 raise GeoRestrictedError(msg, countries=countries)
1150
1151 def raise_no_formats(self, msg, expected=False, video_id=None):
f2ebc5c7 1152 if expected and (
1153 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1154 self.report_warning(msg, video_id)
68f5867c
L
1155 elif isinstance(msg, ExtractorError):
1156 raise msg
b7da73eb 1157 else:
1158 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1159
5f6a1245 1160 # Methods for following #608
c0d0b01f 1161 @staticmethod
311b6615 1162 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1163 """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
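
        For example (illustrative pattern and input),
            self._search_regex(r'name="(\w+)"', '<a name="foo">', 'name')
        returns 'foo', while additionally passing default=None would make a
        failed match return None instead of raising or warning.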
d6983cb4 1204 """
61d3665d 1205 if string is None:
1206 mobj = None
77f90330 1207 elif isinstance(pattern, (str, re.Pattern)):
d6983cb4
PH
1208 mobj = re.search(pattern, string, flags)
1209 else:
1210 for p in pattern:
1211 mobj = re.search(p, string, flags)
c3415d1b
PH
1212 if mobj:
1213 break
d6983cb4 1214
ec11a9f4 1215 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
d6983cb4
PH
1216
1217 if mobj:
711ede6e
PH
1218 if group is None:
1219 # return the first matching group
1220 return next(g for g in mobj.groups() if g is not None)
198f7ea8 1221 elif isinstance(group, (list, tuple)):
1222 return tuple(mobj.group(g) for g in group)
711ede6e
PH
1223 else:
1224 return mobj.group(group)
c342041f 1225 elif default is not NO_DEFAULT:
d6983cb4
PH
1226 return default
1227 elif fatal:
f1a9d64e 1228 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 1229 else:
6a39ee13 1230 self.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
1231 return None
1232
f0bc6e20 1233 def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
8b7fb8b6 1234 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
b7c47b74 1235 """Searches string for the JSON object specified by start_pattern"""
1236 # NB: end_pattern is only used to reduce the size of the initial match
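        # Illustrative usage (the pattern and variable names are hypothetical): to pull
        # a JSON blob assigned inline such as `window.__DATA__ = {...};`, an extractor
        # could call
        #     data = self._search_json(r'window\.__DATA__\s*=', webpage, 'data', video_id)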
f0bc6e20 1237 if default is NO_DEFAULT:
1238 default, has_default = {}, False
1239 else:
1240 fatal, has_default = False, True
1241
1242 json_string = self._search_regex(
8b7fb8b6 1243 rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
f0bc6e20 1244 string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1245 if not json_string:
1246 return default
1247
1248 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1249 try:
1250 return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1251 except ExtractorError as e:
1252 if fatal:
1253 raise ExtractorError(
1254 f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1255 elif not has_default:
1256 self.report_warning(
1257 f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1258 return default
b7c47b74 1259
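# Illustrative usage (not part of the original source): pulling a JSON object
# assigned to a JavaScript variable. The `playerConfig` variable name is an
# assumption for the example.
#
#     player_config = self._search_json(
#         r'var\s+playerConfig\s*=', webpage, 'player config', video_id,
#         end_pattern=';', default={})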
c342041f 1260 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1261 """
1262 Like _search_regex, but strips HTML tags and unescapes entities.
1263 """
711ede6e 1264 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1265 if res:
1266 return clean_html(res).strip()
1267 else:
1268 return res
1269
2118fdd1
RA
1270 def _get_netrc_login_info(self, netrc_machine=None):
1271 username = None
1272 password = None
1273 netrc_machine = netrc_machine or self._NETRC_MACHINE
1274
a06916d9 1275 if self.get_param('usenetrc', False):
2118fdd1 1276 try:
0001fcb5 1277 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1278 if os.path.isdir(netrc_file):
1279 netrc_file = os.path.join(netrc_file, '.netrc')
1280 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1281 if info is not None:
1282 username = info[0]
1283 password = info[2]
1284 else:
dcce092e
S
1285 raise netrc.NetrcParseError(
1286 'No authenticators for %s' % netrc_machine)
86e5f3ed 1287 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1288 self.report_warning(
dcce092e 1289 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1290
dcce092e 1291 return username, password
2118fdd1 1292
1b6712ab 1293 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1294 """
cf0649f8 1295 Get the login info as (username, password)
32443dd3
S
1296 First look for the manually specified credentials using username_option
1297 and password_option as keys in the params dictionary. If no such credentials
1298 are available, look in the netrc file using the netrc_machine or _NETRC_MACHINE
1299 value.
fc79158d
JMF
1300 If there's no info available, return (None, None)
1301 """
fc79158d
JMF
1302
1303 # Attempt to use provided username and password or .netrc data
a06916d9 1304 username = self.get_param(username_option)
1305 if username is not None:
1306 password = self.get_param(password_option)
2118fdd1 1307 else:
1b6712ab 1308 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1309
2133565c 1310 return username, password
fc79158d 1311
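# Illustrative usage (not part of the original source): reading credentials
# from --username/--password or .netrc in a hypothetical login routine.
#
#     username, password = self._get_login_info()
#     if not username:
#         return  # proceed without authentication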
e64b7569 1312 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1313 """
1314 Get the two-factor authentication info
1315 TODO - asking the user will be required for SMS/phone verification;
1316 currently this just uses the command-line option.
1317 If there's no info available, return None
1318 """
83317f69 1319
a06916d9 1320 tfa = self.get_param('twofactor')
1321 if tfa is not None:
1322 return tfa
83317f69 1323
ac668111 1324 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1325
46720279
JMF
1326 # Helper functions for extracting OpenGraph info
1327 @staticmethod
ab2d5247 1328 def _og_regexes(prop):
448ef1f3 1329 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
fbfde1c3
F
1330 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1331 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1332 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1333 return [
78fb87b2
JMF
1334 template % (property_re, content_re),
1335 template % (content_re, property_re),
ab2d5247 1336 ]
46720279 1337
864f24bd
S
1338 @staticmethod
1339 def _meta_regex(prop):
1340 return r'''(?isx)<meta
8b9848ac 1341 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1342 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1343
3c4e6d83 1344 def _og_search_property(self, prop, html, name=None, **kargs):
6606817a 1345 prop = variadic(prop)
46720279 1346 if name is None:
b070564e
S
1347 name = 'OpenGraph %s' % prop[0]
1348 og_regexes = []
1349 for p in prop:
1350 og_regexes.extend(self._og_regexes(p))
1351 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1352 if escaped is None:
1353 return None
1354 return unescapeHTML(escaped)
46720279
JMF
1355
1356 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1357 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1358
1359 def _og_search_description(self, html, **kargs):
1360 return self._og_search_property('description', html, fatal=False, **kargs)
1361
04f3fd2c 1362 def _og_search_title(self, html, *, fatal=False, **kargs):
1363 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1364
8ffa13e0 1365 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1366 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1367 if secure:
1368 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1369 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1370
78338f71
JMF
1371 def _og_search_url(self, html, **kargs):
1372 return self._og_search_property('url', html, **kargs)
1373
04f3fd2c 1374 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1375 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1376
40c696e5 1377 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
6606817a 1378 name = variadic(name)
59040888 1379 if display_name is None:
88d9f6c0 1380 display_name = name[0]
59040888 1381 return self._html_search_regex(
88d9f6c0 1382 [self._meta_regex(n) for n in name],
711ede6e 1383 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1384
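# Illustrative usage (not part of the original source): combining the OpenGraph
# and <meta> helpers above for common fields; `webpage` is an assumption.
#
#     title = self._og_search_title(webpage) or self._html_extract_title(webpage)
#     description = self._og_search_description(webpage) or self._html_search_meta(
#         ['description', 'twitter:description'], webpage, default=None)
#     thumbnail = self._og_search_thumbnail(webpage)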
1385 def _dc_search_uploader(self, html):
1386 return self._html_search_meta('dc.creator', html, 'uploader')
1387
8f97a15d 1388 @staticmethod
1389 def _rta_search(html):
8dbe9899
PH
1390 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1391 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1392 r' content="RTA-5042-1996-1400-1577-RTA"',
1393 html):
1394 return 18
8f97a15d 1395
1396 # And then there are the jokers who advertise that they use RTA, but actually don't.
1397 AGE_LIMIT_MARKERS = [
1398 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
32a84bcf
SS
1399 r'>[^<]*you acknowledge you are at least (\d+) years old',
1400 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
8f97a15d 1401 ]
32a84bcf
SS
1402
1403 age_limit = 0
1404 for marker in AGE_LIMIT_MARKERS:
1405 mobj = re.search(marker, html)
1406 if mobj:
1407 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1408 return age_limit
8dbe9899 1409
59040888
PH
1410 def _media_rating_search(self, html):
1411 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1412 rating = self._html_search_meta('rating', html)
1413
1414 if not rating:
1415 return None
1416
1417 RATING_TABLE = {
1418 'safe for kids': 0,
1419 'general': 8,
1420 '14 years': 14,
1421 'mature': 17,
1422 'restricted': 19,
1423 }
d800609c 1424 return RATING_TABLE.get(rating.lower())
59040888 1425
69319969 1426 def _family_friendly_search(self, html):
6ca7732d 1427 # See http://schema.org/VideoObject
ac8491fc
S
1428 family_friendly = self._html_search_meta(
1429 'isFamilyFriendly', html, default=None)
69319969
NJ
1430
1431 if not family_friendly:
1432 return None
1433
1434 RATING_TABLE = {
1435 '1': 0,
1436 'true': 0,
1437 '0': 18,
1438 'false': 18,
1439 }
d800609c 1440 return RATING_TABLE.get(family_friendly.lower())
69319969 1441
0c708f11
JMF
1442 def _twitter_search_player(self, html):
1443 return self._html_search_meta('twitter:player', html,
9e1a5b84 1444 'twitter card player')
0c708f11 1445
0c36dc00 1446 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1447 """Yield all json ld objects in the html"""
1448 if default is not NO_DEFAULT:
1449 fatal = False
1450 for mobj in re.finditer(JSON_LD_RE, html):
1451 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1452 for json_ld in variadic(json_ld_item):
1453 if isinstance(json_ld, dict):
1454 yield json_ld
1455
1456 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1457 """Search for a video in any json ld in the html"""
1458 if default is not NO_DEFAULT:
1459 fatal = False
1460 info = self._json_ld(
1461 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1462 video_id, fatal=fatal, expected_type=expected_type)
1463 if info:
1464 return info
4433bb02
S
1465 if default is not NO_DEFAULT:
1466 return default
1467 elif fatal:
1468 raise RegexNotFoundError('Unable to extract JSON-LD')
1469 else:
6a39ee13 1470 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
4433bb02 1471 return {}
4ca2a3cf 1472
95b31e26 1473 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
14f25df2 1474 if isinstance(json_ld, str):
4ca2a3cf
S
1475 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1476 if not json_ld:
1477 return {}
1478 info = {}
bae14048 1479
e7e4a6e0
S
1480 INTERACTION_TYPE_MAP = {
1481 'CommentAction': 'comment',
1482 'AgreeAction': 'like',
1483 'DisagreeAction': 'dislike',
1484 'LikeAction': 'like',
1485 'DislikeAction': 'dislike',
1486 'ListenAction': 'view',
1487 'WatchAction': 'view',
1488 'ViewAction': 'view',
1489 }
1490
f3c0c773 1491 def is_type(e, *expected_types):
1492 type = variadic(traverse_obj(e, '@type'))
1493 return any(x in type for x in expected_types)
1494
29f7c58a 1495 def extract_interaction_type(e):
1496 interaction_type = e.get('interactionType')
1497 if isinstance(interaction_type, dict):
1498 interaction_type = interaction_type.get('@type')
1499 return str_or_none(interaction_type)
1500
e7e4a6e0
S
1501 def extract_interaction_statistic(e):
1502 interaction_statistic = e.get('interactionStatistic')
29f7c58a 1503 if isinstance(interaction_statistic, dict):
1504 interaction_statistic = [interaction_statistic]
e7e4a6e0
S
1505 if not isinstance(interaction_statistic, list):
1506 return
1507 for is_e in interaction_statistic:
f3c0c773 1508 if not is_type(is_e, 'InteractionCounter'):
e7e4a6e0 1509 continue
29f7c58a 1510 interaction_type = extract_interaction_type(is_e)
1511 if not interaction_type:
e7e4a6e0 1512 continue
ce5b9040
S
1513 # For the interaction count some sites provide a string instead of
1514 # an integer (as per spec) with non-digit characters (e.g. ","),
1515 # so extract the count with the more relaxed str_to_int
1516 interaction_count = str_to_int(is_e.get('userInteractionCount'))
e7e4a6e0
S
1517 if interaction_count is None:
1518 continue
1519 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1520 if not count_kind:
1521 continue
1522 count_key = '%s_count' % count_kind
1523 if info.get(count_key) is not None:
1524 continue
1525 info[count_key] = interaction_count
1526
f5225737 1527 def extract_chapter_information(e):
1528 chapters = [{
1529 'title': part.get('name'),
1530 'start_time': part.get('startOffset'),
1531 'end_time': part.get('endOffset'),
85553414 1532 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
f5225737 1533 for idx, (last_c, current_c, next_c) in enumerate(zip(
1534 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1535 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1536 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1537 if None in current_c.values():
1538 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1539 return
1540 if chapters:
1541 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1542 info['chapters'] = chapters
1543
bae14048 1544 def extract_video_object(e):
f7ad7160 1545 author = e.get('author')
bae14048 1546 info.update({
0c36dc00 1547 'url': url_or_none(e.get('contentUrl')),
0f60ba6e 1548 'ext': mimetype2ext(e.get('encodingFormat')),
bae14048
S
1549 'title': unescapeHTML(e.get('name')),
1550 'description': unescapeHTML(e.get('description')),
eb2333bc 1551 'thumbnails': [{'url': unescapeHTML(url)}
21633673 1552 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1553 if url_or_none(url)],
bae14048
S
1554 'duration': parse_duration(e.get('duration')),
1555 'timestamp': unified_timestamp(e.get('uploadDate')),
f7ad7160 1556 # author can be an instance of 'Organization' or 'Person' types.
1557 # Both types can have a 'name' property (inherited from the 'Thing' type). [1]
1558 # However, some websites use the 'Text' type instead.
1559 # 1. https://schema.org/VideoObject
14f25df2 1560 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
0f60ba6e 1561 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
56ba69e4 1562 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
bae14048
S
1563 'tbr': int_or_none(e.get('bitrate')),
1564 'width': int_or_none(e.get('width')),
1565 'height': int_or_none(e.get('height')),
33a81c2c 1566 'view_count': int_or_none(e.get('interactionCount')),
0f60ba6e 1567 'tags': try_call(lambda: e.get('keywords').split(',')),
bae14048 1568 })
0f60ba6e 1569 if is_type(e, 'AudioObject'):
1570 info.update({
1571 'vcodec': 'none',
1572 'abr': int_or_none(e.get('bitrate')),
1573 })
e7e4a6e0 1574 extract_interaction_statistic(e)
f5225737 1575 extract_chapter_information(e)
bae14048 1576
d5c32548 1577 def traverse_json_ld(json_ld, at_top_level=True):
1d55ebab
SS
1578 for e in variadic(json_ld):
1579 if not isinstance(e, dict):
1580 continue
d5c32548
ZM
1581 if at_top_level and '@context' not in e:
1582 continue
1583 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1d55ebab 1584 traverse_json_ld(e['@graph'], at_top_level=False)
c13a301a 1585 continue
f3c0c773 1586 if expected_type is not None and not is_type(e, expected_type):
4433bb02 1587 continue
8f122fa0 1588 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1589 if rating is not None:
1590 info['average_rating'] = rating
f3c0c773 1591 if is_type(e, 'TVEpisode', 'Episode'):
440863ad 1592 episode_name = unescapeHTML(e.get('name'))
46933a15 1593 info.update({
440863ad 1594 'episode': episode_name,
46933a15
S
1595 'episode_number': int_or_none(e.get('episodeNumber')),
1596 'description': unescapeHTML(e.get('description')),
1597 })
440863ad
S
1598 if not info.get('title') and episode_name:
1599 info['title'] = episode_name
46933a15 1600 part_of_season = e.get('partOfSeason')
f3c0c773 1601 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
458fd30f
S
1602 info.update({
1603 'season': unescapeHTML(part_of_season.get('name')),
1604 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1605 })
d16b3c66 1606 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
f3c0c773 1607 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1608 info['series'] = unescapeHTML(part_of_series.get('name'))
f3c0c773 1609 elif is_type(e, 'Movie'):
391256dc
S
1610 info.update({
1611 'title': unescapeHTML(e.get('name')),
1612 'description': unescapeHTML(e.get('description')),
1613 'duration': parse_duration(e.get('duration')),
1614 'timestamp': unified_timestamp(e.get('dateCreated')),
1615 })
f3c0c773 1616 elif is_type(e, 'Article', 'NewsArticle'):
46933a15
S
1617 info.update({
1618 'timestamp': parse_iso8601(e.get('datePublished')),
1619 'title': unescapeHTML(e.get('headline')),
d5c32548 1620 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
46933a15 1621 })
f3c0c773 1622 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
2edb38e8 1623 extract_video_object(e['video'][0])
f3c0c773 1624 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
e50c3500 1625 extract_video_object(e['subjectOf'][0])
0f60ba6e 1626 elif is_type(e, 'VideoObject', 'AudioObject'):
bae14048 1627 extract_video_object(e)
4433bb02
S
1628 if expected_type is None:
1629 continue
1630 else:
1631 break
c69701c6 1632 video = e.get('video')
f3c0c773 1633 if is_type(video, 'VideoObject'):
c69701c6 1634 extract_video_object(video)
4433bb02
S
1635 if expected_type is None:
1636 continue
1637 else:
1638 break
d5c32548 1639
1d55ebab 1640 traverse_json_ld(json_ld)
90137ca4 1641 return filter_dict(info)
4ca2a3cf 1642
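# Illustrative usage (not part of the original source): merging JSON-LD
# metadata into an extractor's info dict; `webpage`, `video_id` and `formats`
# are assumptions for the example.
#
#     info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
#     return {
#         **info,
#         'id': video_id,
#         'formats': formats,
#     }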
135dfa2c 1643 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1644 return self._parse_json(
1645 self._search_regex(
1646 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1647 webpage, 'next.js data', fatal=fatal, **kw),
1648 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1649
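# Illustrative usage (not part of the original source): digging a field out of
# Next.js page props with traverse_obj; the key path is an assumption.
#
#     nextjs = self._search_nextjs_data(webpage, video_id)
#     stream_url = traverse_obj(nextjs, ('props', 'pageProps', 'video', 'streamUrl'))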
8072ef2b 1650 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1651 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
66f4c04e 1652 rectx = re.escape(context_name)
8072ef2b 1653 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
66f4c04e 1654 js, arg_keys, arg_vals = self._search_regex(
8072ef2b 1655 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
f7fc8d39 1656 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1657 default=NO_DEFAULT if fatal else (None, None, None))
1658 if js is None:
1659 return {}
66f4c04e
THD
1660
1661 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1662
1663 for key, val in args.items():
1664 if val in ('undefined', 'void 0'):
1665 args[key] = 'null'
1666
8072ef2b 1667 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1668 return traverse_obj(ret, traverse) or {}
66f4c04e 1669
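# Illustrative usage (not part of the original source); the key path is an
# assumption for the example.
#
#     nuxt = self._search_nuxt_data(webpage, video_id, fatal=False)
#     title = traverse_obj(nuxt, ('video', 'title'))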
27713812 1670 @staticmethod
f8da79f8 1671 def _hidden_inputs(html):
586f1cc5 1672 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1673 hidden_inputs = {}
c8498368
S
1674 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1675 attrs = extract_attributes(input)
1676 if not input:
201ea3ee 1677 continue
c8498368 1678 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1679 continue
c8498368
S
1680 name = attrs.get('name') or attrs.get('id')
1681 value = attrs.get('value')
1682 if name and value is not None:
1683 hidden_inputs[name] = value
201ea3ee 1684 return hidden_inputs
27713812 1685
cf61d96d
S
1686 def _form_hidden_inputs(self, form_id, html):
1687 form = self._search_regex(
73eb13df 1688 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1689 html, '%s form' % form_id, group='form')
1690 return self._hidden_inputs(form)
1691
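# Illustrative usage (not part of the original source): re-submitting a login
# form with its hidden fields preserved. The form id, LOGIN_URL and the use of
# urlencode_postdata (from yt_dlp.utils) are assumptions for the example.
#
#     form = self._form_hidden_inputs('login-form', webpage)
#     form.update({'username': username, 'password': password})
#     self._download_webpage(
#         LOGIN_URL, None, 'Logging in', data=urlencode_postdata(form))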
d0d74b71 1692 @classproperty(cache=True)
1693 def FormatSort(cls):
1694 class FormatSort(FormatSorter):
1695 def __init__(ie, *args, **kwargs):
1696 super().__init__(ie._downloader, *args, **kwargs)
eb8a4433 1697
d0d74b71 1698 deprecation_warning(
1699 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1700 'Use yt_dlp.utils.FormatSorter instead')
1701 return FormatSort
eb8a4433 1702
1703 def _sort_formats(self, formats, field_preference=[]):
9f14daf2 1704 if not field_preference:
1705 self._downloader.deprecation_warning(
1706 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1707 return
1708 self._downloader.deprecation_warning(
1709 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1710 'Return _format_sort_fields in the info_dict instead')
1711 if formats:
784320c9 1712 formats[0]['__sort_fields'] = field_preference
59040888 1713
96a53167
S
1714 def _check_formats(self, formats, video_id):
1715 if formats:
1716 formats[:] = filter(
1717 lambda f: self._is_valid_url(
1718 f['url'], video_id,
1719 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1720 formats)
1721
f5bdb444
S
1722 @staticmethod
1723 def _remove_duplicate_formats(formats):
1724 format_urls = set()
1725 unique_formats = []
1726 for f in formats:
1727 if f['url'] not in format_urls:
1728 format_urls.add(f['url'])
1729 unique_formats.append(f)
1730 formats[:] = unique_formats
1731
45024183 1732 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1733 url = self._proto_relative_url(url, scheme='http:')
1734 # For now assume non HTTP(S) URLs always valid
1735 if not (url.startswith('http://') or url.startswith('https://')):
1736 return True
96a53167 1737 try:
45024183 1738 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1739 return True
8bdd16b4 1740 except ExtractorError as e:
25e911a9 1741 self.to_screen(
8bdd16b4 1742 '%s: %s URL is invalid, skipping: %s'
1743 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1744 return False
96a53167 1745
20991253 1746 def http_scheme(self):
1ede5b24 1747 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1748 return (
1749 'http:'
a06916d9 1750 if self.get_param('prefer_insecure', False)
20991253
PH
1751 else 'https:')
1752
57c7411f 1753 def _proto_relative_url(self, url, scheme=None):
8f97a15d 1754 scheme = scheme or self.http_scheme()
1755 assert scheme.endswith(':')
1756 return sanitize_url(url, scheme=scheme[:-1])
57c7411f 1757
4094b6e3
PH
1758 def _sleep(self, timeout, video_id, msg_template=None):
1759 if msg_template is None:
f1a9d64e 1760 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1761 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1762 self.to_screen(msg)
1763 time.sleep(timeout)
1764
f983b875 1765 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1766 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1767 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
0b5546c7 1768 if self.get_param('ignore_no_formats_error'):
1769 fatal = False
1770
a076c1f9 1771 res = self._download_xml_handle(
f036a632 1772 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1773 'Unable to download f4m manifest',
1774 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1775 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1776 transform_source=transform_source,
7360c06f 1777 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 1778 if res is False:
8d29e47f 1779 return []
31bb8d3f 1780
a076c1f9
E
1781 manifest, urlh = res
1782 manifest_url = urlh.geturl()
1783
0fdbb332 1784 return self._parse_f4m_formats(
f983b875 1785 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1786 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1787
f983b875 1788 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
0fdbb332 1789 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1790 fatal=True, m3u8_id=None):
f9934b96 1791 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
d9eb580a
S
1792 return []
1793
7a5c1cfe 1794 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
fb72ec58 1795 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1796 if akamai_pv is not None and ';' in akamai_pv.text:
1797 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1798 if playerVerificationChallenge.strip() != '':
1799 return []
1800
31bb8d3f 1801 formats = []
7a47d07c 1802 manifest_version = '1.0'
b2527359 1803 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1804 if not media_nodes:
7a47d07c 1805 manifest_version = '2.0'
34e48bed 1806 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762 1807 # Remove unsupported DRM protected media renditions from the final formats
067aa17e 1808 # (see https://github.com/ytdl-org/youtube-dl/issues/8573).
b22ca762
S
1809 media_nodes = remove_encrypted_media(media_nodes)
1810 if not media_nodes:
1811 return formats
48107c19
S
1812
1813 manifest_base_url = get_base_url(manifest)
0a5685b2 1814
a6571f10 1815 bootstrap_info = xpath_element(
0a5685b2
YCH
1816 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1817 'bootstrap info', default=None)
1818
edd6074c
RA
1819 vcodec = None
1820 mime_type = xpath_text(
1821 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1822 'base URL', default=None)
1823 if mime_type and mime_type.startswith('audio/'):
1824 vcodec = 'none'
1825
b2527359 1826 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1827 tbr = int_or_none(media_el.attrib.get('bitrate'))
1828 width = int_or_none(media_el.attrib.get('width'))
1829 height = int_or_none(media_el.attrib.get('height'))
34921b43 1830 format_id = join_nonempty(f4m_id, tbr or i)
448bb5f3
YCH
1831 # If <bootstrapInfo> is present, the specified f4m is a
1832 # stream-level manifest, and only set-level manifests may refer to
1833 # external resources. See section 11.4 and section 4 of F4M spec
1834 if bootstrap_info is None:
1835 media_url = None
1836 # @href is introduced in 2.0, see section 11.6 of F4M spec
1837 if manifest_version == '2.0':
1838 media_url = media_el.attrib.get('href')
1839 if media_url is None:
1840 media_url = media_el.attrib.get('url')
31c746e5
S
1841 if not media_url:
1842 continue
cc357c4d
S
1843 manifest_url = (
1844 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 1845 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
1846 # If media_url is itself an f4m manifest, do the recursive extraction,
1847 # since bitrates in the parent manifest (this one) and in the media_url
1848 # manifest may differ, making it impossible to resolve the format by the
1849 # requested bitrate in the f4m downloader
240b6045
YCH
1850 ext = determine_ext(manifest_url)
1851 if ext == 'f4m':
77b8b4e6 1852 f4m_formats = self._extract_f4m_formats(
f983b875 1853 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
77b8b4e6
S
1854 transform_source=transform_source, fatal=fatal)
1855 # Sometimes a stream-level manifest contains a single media entry that
1856 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1857 # At the same time the parent's media entry in the set-level manifest may
1858 # contain it. We will copy it from the parent in such cases.
1859 if len(f4m_formats) == 1:
1860 f = f4m_formats[0]
1861 f.update({
1862 'tbr': f.get('tbr') or tbr,
1863 'width': f.get('width') or width,
1864 'height': f.get('height') or height,
1865 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 1866 'vcodec': vcodec,
77b8b4e6
S
1867 })
1868 formats.extend(f4m_formats)
70f0f5a8 1869 continue
240b6045
YCH
1870 elif ext == 'm3u8':
1871 formats.extend(self._extract_m3u8_formats(
1872 manifest_url, video_id, 'mp4', preference=preference,
f983b875 1873 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
240b6045 1874 continue
31bb8d3f 1875 formats.append({
77b8b4e6 1876 'format_id': format_id,
31bb8d3f 1877 'url': manifest_url,
30d0b549 1878 'manifest_url': manifest_url,
a6571f10 1879 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 1880 'protocol': 'f4m',
b2527359 1881 'tbr': tbr,
77b8b4e6
S
1882 'width': width,
1883 'height': height,
edd6074c 1884 'vcodec': vcodec,
60ca389c 1885 'preference': preference,
f983b875 1886 'quality': quality,
31bb8d3f 1887 })
31bb8d3f
JMF
1888 return formats
1889
f983b875 1890 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1891 return {
34921b43 1892 'format_id': join_nonempty(m3u8_id, 'meta'),
704df56d
PH
1893 'url': m3u8_url,
1894 'ext': ext,
1895 'protocol': 'm3u8',
37768f92 1896 'preference': preference - 100 if preference else -100,
f983b875 1897 'quality': quality,
704df56d
PH
1898 'resolution': 'multiple',
1899 'format_note': 'Quality selection URL',
16da9bbc
YCH
1900 }
1901
b5ae35ee 1902 def _report_ignoring_subs(self, name):
1903 self.report_warning(bug_reports_message(
1904 f'Ignoring subtitle tracks found in the {name} manifest; '
1905 'if any subtitle tracks are missing,'
1906 ), only_once=True)
1907
a0c3b2d5
F
1908 def _extract_m3u8_formats(self, *args, **kwargs):
1909 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1910 if subs:
b5ae35ee 1911 self._report_ignoring_subs('HLS')
a0c3b2d5
F
1912 return fmts
1913
1914 def _extract_m3u8_formats_and_subtitles(
177877c5 1915 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1916 preference=None, quality=None, m3u8_id=None, note=None,
1917 errnote=None, fatal=True, live=False, data=None, headers={},
1918 query={}):
1919
0b5546c7 1920 if self.get_param('ignore_no_formats_error'):
1921 fatal = False
1922
71df9b7f 1923 if not m3u8_url:
1924 if errnote is not False:
1925 errnote = errnote or 'Failed to obtain m3u8 URL'
1926 if fatal:
1927 raise ExtractorError(errnote, video_id=video_id)
1928 self.report_warning(f'{errnote}{bug_reports_message()}')
1929 return [], {}
1930
dbd82a1d 1931 res = self._download_webpage_handle(
81515ad9 1932 m3u8_url, video_id,
37a3bb66 1933 note='Downloading m3u8 information' if note is None else note,
1934 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1935 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1936
dbd82a1d 1937 if res is False:
a0c3b2d5 1938 return [], {}
cb252080 1939
dbd82a1d 1940 m3u8_doc, urlh = res
37113045 1941 m3u8_url = urlh.geturl()
9cdffeeb 1942
a0c3b2d5 1943 return self._parse_m3u8_formats_and_subtitles(
cb252080 1944 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1945 preference=preference, quality=quality, m3u8_id=m3u8_id,
1946 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1947 headers=headers, query=query, video_id=video_id)
cb252080 1948
a0c3b2d5 1949 def _parse_m3u8_formats_and_subtitles(
42676437 1950 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1951 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1952 errnote=None, fatal=True, data=None, headers={}, query={},
1953 video_id=None):
60755938 1954 formats, subtitles = [], {}
a0c3b2d5 1955
6b993ca7 1956 has_drm = re.search('|'.join([
1957 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
1958 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
1959 ]), m3u8_doc)
a0c3b2d5 1960
60755938 1961 def format_url(url):
14f25df2 1962 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
60755938 1963
1964 if self.get_param('hls_split_discontinuity', False):
1965 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1966 if not m3u8_doc:
1967 if not manifest_url:
1968 return []
1969 m3u8_doc = self._download_webpage(
1970 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1971 note=False, errnote='Failed to download m3u8 playlist information')
1972 if m3u8_doc is False:
1973 return []
1974 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 1975
60755938 1976 else:
1977 def _extract_m3u8_playlist_indices(*args, **kwargs):
1978 return [None]
310c2ed2 1979
cb252080
S
1980 # References:
1981 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
1982 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1983 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
1984
1985 # We should try extracting formats only from master playlists [1, 4.3.4],
1986 # i.e. playlists that describe available qualities. On the other hand
1987 # media playlists [1, 4.3.3] should be returned as is since they contain
1988 # just the media without quality renditions.
9cdffeeb 1989 # Fortunately, a master playlist can be easily distinguished from a media
cb252080 1990 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
a0566bbf 1991 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
1992 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
1993 # media playlist and MUST NOT appear in a master playlist, thus we can
1994 # reliably detect a media playlist with this criterion.
1995
9cdffeeb 1996 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 1997 formats = [{
34921b43 1998 'format_id': join_nonempty(m3u8_id, idx),
60755938 1999 'format_index': idx,
42676437 2000 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2001 'ext': ext,
2002 'protocol': entry_protocol,
2003 'preference': preference,
2004 'quality': quality,
88acdbc2 2005 'has_drm': has_drm,
60755938 2006 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2007
a0c3b2d5 2008 return formats, subtitles
cb252080
S
2009
2010 groups = {}
2011 last_stream_inf = {}
2012
2013 def extract_media(x_media_line):
2014 media = parse_m3u8_attributes(x_media_line)
2015 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2016 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2017 if not (media_type and group_id and name):
2018 return
2019 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2020 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2021 if media_type == 'SUBTITLES':
3907333c 2022 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2023 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2024 # However, lack of URI has been spotted in the wild.
2025 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2026 if not media.get('URI'):
2027 return
a0c3b2d5
F
2028 url = format_url(media['URI'])
2029 sub_info = {
2030 'url': url,
2031 'ext': determine_ext(url),
2032 }
4a2f19ab
F
2033 if sub_info['ext'] == 'm3u8':
2034 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2035 # files may contain is WebVTT:
2036 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2037 sub_info['ext'] = 'vtt'
2038 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2039 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2040 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2041 if media_type not in ('VIDEO', 'AUDIO'):
2042 return
2043 media_url = media.get('URI')
2044 if media_url:
310c2ed2 2045 manifest_url = format_url(media_url)
60755938 2046 formats.extend({
34921b43 2047 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2048 'format_note': name,
2049 'format_index': idx,
2050 'url': manifest_url,
2051 'manifest_url': m3u8_url,
2052 'language': media.get('LANGUAGE'),
2053 'ext': ext,
2054 'protocol': entry_protocol,
2055 'preference': preference,
2056 'quality': quality,
2057 'vcodec': 'none' if media_type == 'AUDIO' else None,
2058 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2059
2060 def build_stream_name():
2061 # Although the specification does not mention a NAME attribute for the
3019cb0c
S
2062 # EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2063 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2064 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2065 stream_name = last_stream_inf.get('NAME')
2066 if stream_name:
2067 return stream_name
2068 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2069 # from corresponding rendition group
2070 stream_group_id = last_stream_inf.get('VIDEO')
2071 if not stream_group_id:
2072 return
2073 stream_group = groups.get(stream_group_id)
2074 if not stream_group:
2075 return stream_group_id
2076 rendition = stream_group[0]
2077 return rendition.get('NAME') or stream_group_id
2078
379306ef 2079 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2080 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2081 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2082 for line in m3u8_doc.splitlines():
2083 if line.startswith('#EXT-X-MEDIA:'):
2084 extract_media(line)
2085
704df56d
PH
2086 for line in m3u8_doc.splitlines():
2087 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2088 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2089 elif line.startswith('#') or not line.strip():
2090 continue
2091 else:
9c99bef7 2092 tbr = float_or_none(
3089bc74
S
2093 last_stream_inf.get('AVERAGE-BANDWIDTH')
2094 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2095 manifest_url = format_url(line.strip())
5ef62fc4 2096
60755938 2097 for idx in _extract_m3u8_playlist_indices(manifest_url):
2098 format_id = [m3u8_id, None, idx]
310c2ed2 2099 # Bandwidth of live streams may differ over time thus making
2100 # format_id unpredictable. So it's better to keep provided
2101 # format_id intact.
2102 if not live:
60755938 2103 stream_name = build_stream_name()
34921b43 2104 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2105 f = {
34921b43 2106 'format_id': join_nonempty(*format_id),
60755938 2107 'format_index': idx,
310c2ed2 2108 'url': manifest_url,
2109 'manifest_url': m3u8_url,
2110 'tbr': tbr,
2111 'ext': ext,
2112 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2113 'protocol': entry_protocol,
2114 'preference': preference,
2115 'quality': quality,
2116 }
2117 resolution = last_stream_inf.get('RESOLUTION')
2118 if resolution:
2119 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2120 if mobj:
2121 f['width'] = int(mobj.group('width'))
2122 f['height'] = int(mobj.group('height'))
2123 # Unified Streaming Platform
2124 mobj = re.search(
2125 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2126 if mobj:
2127 abr, vbr = mobj.groups()
2128 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2129 f.update({
2130 'vbr': vbr,
2131 'abr': abr,
2132 })
2133 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2134 f.update(codecs)
2135 audio_group_id = last_stream_inf.get('AUDIO')
2136 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2137 # references a rendition group MUST have a CODECS attribute.
62b58c09 2138 # However, this is not always respected. E.g. [2]
310c2ed2 2139 # contains EXT-X-STREAM-INF tag which references AUDIO
2140 # rendition group but does not have CODECS and despite
2141 # referencing an audio group it represents a complete
2142 # (with audio and video) format. So, for such cases we will
2143 # ignore references to rendition groups and treat them
2144 # as complete formats.
2145 if audio_group_id and codecs and f.get('vcodec') != 'none':
2146 audio_group = groups.get(audio_group_id)
2147 if audio_group and audio_group[0].get('URI'):
2148 # TODO: update acodec for audio only formats with
2149 # the same GROUP-ID
2150 f['acodec'] = 'none'
fc21af50 2151 if not f.get('ext'):
2152 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2153 formats.append(f)
2154
2155 # for DailyMotion
2156 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2157 if progressive_uri:
2158 http_f = f.copy()
2159 del http_f['manifest_url']
2160 http_f.update({
2161 'format_id': f['format_id'].replace('hls-', 'http-'),
2162 'protocol': 'http',
2163 'url': progressive_uri,
2164 })
2165 formats.append(http_f)
5ef62fc4 2166
cb252080 2167 last_stream_inf = {}
a0c3b2d5 2168 return formats, subtitles
704df56d 2169
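# Illustrative usage (not part of the original source): a typical extractor call
# into the HLS helpers above; `m3u8_url`, `video_id` and `title` are assumptions.
#
#     formats, subtitles = self._extract_m3u8_formats_and_subtitles(
#         m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
#     return {'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles}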
3cf4b91d
C
2170 def _extract_m3u8_vod_duration(
2171 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2172
2173 m3u8_vod = self._download_webpage(
2174 m3u8_vod_url, video_id,
2175 note='Downloading m3u8 VOD manifest' if note is None else note,
2176 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2177 fatal=False, data=data, headers=headers, query=query)
2178
2179 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2180
2181 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2182 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2183 return None
2184
2185 return int(sum(
2186 float(line[len('#EXTINF:'):].split(',')[0])
2187 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2188
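# Illustrative usage (not part of the original source): estimating the duration
# of a VOD stream when the page itself does not expose one.
#
#     duration = self._extract_m3u8_vod_duration(
#         m3u8_url, video_id, note='Downloading m3u8 for duration')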
a107193e
S
2189 @staticmethod
2190 def _xpath_ns(path, namespace=None):
2191 if not namespace:
2192 return path
2193 out = []
2194 for c in path.split('/'):
2195 if not c or c == '.':
2196 out.append(c)
2197 else:
2198 out.append('{%s}%s' % (namespace, c))
2199 return '/'.join(out)
2200
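# Worked example (not part of the original source), assuming the SMIL 3.0
# namespace http://www.w3.org/ns/SMIL:
#
#     self._xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
#     # -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'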
da1c94ee 2201 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
0b5546c7 2202 if self.get_param('ignore_no_formats_error'):
2203 fatal = False
2204
a076c1f9
E
2205 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2206 if res is False:
995029a1 2207 assert not fatal
774a46c5 2208 return [], {}
e89a2aab 2209
a076c1f9
E
2210 smil, urlh = res
2211 smil_url = urlh.geturl()
2212
17712eeb 2213 namespace = self._parse_smil_namespace(smil)
a107193e 2214
da1c94ee 2215 fmts = self._parse_smil_formats(
a107193e 2216 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2217 subs = self._parse_smil_subtitles(
2218 smil, namespace=namespace)
2219
2220 return fmts, subs
2221
2222 def _extract_smil_formats(self, *args, **kwargs):
2223 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2224 if subs:
b5ae35ee 2225 self._report_ignoring_subs('SMIL')
da1c94ee 2226 return fmts
a107193e
S
2227
2228 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
a076c1f9
E
2229 res = self._download_smil(smil_url, video_id, fatal=fatal)
2230 if res is False:
a107193e 2231 return {}
a076c1f9
E
2232
2233 smil, urlh = res
2234 smil_url = urlh.geturl()
2235
a107193e
S
2236 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2237
09f572fb 2238 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a076c1f9 2239 return self._download_xml_handle(
a107193e 2240 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2241 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2242
2243 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2244 namespace = self._parse_smil_namespace(smil)
a107193e
S
2245
2246 formats = self._parse_smil_formats(
2247 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2248 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2249
2250 video_id = os.path.splitext(url_basename(smil_url))[0]
2251 title = None
2252 description = None
647eab45 2253 upload_date = None
a107193e
S
2254 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2255 name = meta.attrib.get('name')
2256 content = meta.attrib.get('content')
2257 if not name or not content:
2258 continue
2259 if not title and name == 'title':
2260 title = content
2261 elif not description and name in ('description', 'abstract'):
2262 description = content
647eab45
S
2263 elif not upload_date and name == 'date':
2264 upload_date = unified_strdate(content)
a107193e 2265
1e5bcdec
S
2266 thumbnails = [{
2267 'id': image.get('type'),
2268 'url': image.get('src'),
2269 'width': int_or_none(image.get('width')),
2270 'height': int_or_none(image.get('height')),
2271 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2272
a107193e
S
2273 return {
2274 'id': video_id,
2275 'title': title or video_id,
2276 'description': description,
647eab45 2277 'upload_date': upload_date,
1e5bcdec 2278 'thumbnails': thumbnails,
a107193e
S
2279 'formats': formats,
2280 'subtitles': subtitles,
2281 }
2282
17712eeb
S
2283 def _parse_smil_namespace(self, smil):
2284 return self._search_regex(
2285 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2286
f877c6ae 2287 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
2288 base = smil_url
2289 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2290 b = meta.get('base') or meta.get('httpBase')
2291 if b:
2292 base = b
2293 break
e89a2aab
S
2294
2295 formats = []
2296 rtmp_count = 0
a107193e 2297 http_count = 0
7f32e5dc 2298 m3u8_count = 0
9359f3d4 2299 imgs_count = 0
a107193e 2300
9359f3d4 2301 srcs = set()
ad96b4c8
YCH
2302 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2303 for medium in media:
2304 src = medium.get('src')
81e1c4e2 2305 if not src or src in srcs:
a107193e 2306 continue
9359f3d4 2307 srcs.add(src)
a107193e 2308
ad96b4c8
YCH
2309 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2310 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2311 width = int_or_none(medium.get('width'))
2312 height = int_or_none(medium.get('height'))
2313 proto = medium.get('proto')
2314 ext = medium.get('ext')
a107193e 2315 src_ext = determine_ext(src)
ad96b4c8 2316 streamer = medium.get('streamer') or base
a107193e
S
2317
2318 if proto == 'rtmp' or streamer.startswith('rtmp'):
2319 rtmp_count += 1
2320 formats.append({
2321 'url': streamer,
2322 'play_path': src,
2323 'ext': 'flv',
2324 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2325 'tbr': bitrate,
2326 'filesize': filesize,
2327 'width': width,
2328 'height': height,
2329 })
f877c6ae
YCH
2330 if transform_rtmp_url:
2331 streamer, src = transform_rtmp_url(streamer, src)
2332 formats[-1].update({
2333 'url': streamer,
2334 'play_path': src,
2335 })
a107193e
S
2336 continue
2337
14f25df2 2338 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
c349456e 2339 src_url = src_url.strip()
a107193e
S
2340
2341 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 2342 m3u8_formats = self._extract_m3u8_formats(
2343 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2344 if len(m3u8_formats) == 1:
2345 m3u8_count += 1
2346 m3u8_formats[0].update({
2347 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2348 'tbr': bitrate,
2349 'width': width,
2350 'height': height,
2351 })
2352 formats.extend(m3u8_formats)
bd21ead2 2353 elif src_ext == 'f4m':
a107193e
S
2354 f4m_url = src_url
2355 if not f4m_params:
2356 f4m_params = {
2357 'hdcore': '3.2.0',
2358 'plugin': 'flowplayer-3.2.0.1',
2359 }
2360 f4m_url += '&' if '?' in f4m_url else '?'
14f25df2 2361 f4m_url += urllib.parse.urlencode(f4m_params)
7e5edcfd 2362 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
bd21ead2
RA
2363 elif src_ext == 'mpd':
2364 formats.extend(self._extract_mpd_formats(
2365 src_url, video_id, mpd_id='dash', fatal=False))
2366 elif re.search(r'\.ism/[Mm]anifest', src_url):
2367 formats.extend(self._extract_ism_formats(
2368 src_url, video_id, ism_id='mss', fatal=False))
2369 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
2370 http_count += 1
2371 formats.append({
2372 'url': src_url,
2373 'ext': ext or src_ext or 'flv',
2374 'format_id': 'http-%d' % (bitrate or http_count),
2375 'tbr': bitrate,
2376 'filesize': filesize,
2377 'width': width,
2378 'height': height,
2379 })
63757032 2380
9359f3d4
F
2381 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2382 src = medium.get('src')
2383 if not src or src in srcs:
2384 continue
2385 srcs.add(src)
2386
2387 imgs_count += 1
2388 formats.append({
2389 'format_id': 'imagestream-%d' % (imgs_count),
2390 'url': src,
2391 'ext': mimetype2ext(medium.get('type')),
2392 'acodec': 'none',
2393 'vcodec': 'none',
2394 'width': int_or_none(medium.get('width')),
2395 'height': int_or_none(medium.get('height')),
2396 'format_note': 'SMIL storyboards',
2397 })
2398
e89a2aab
S
2399 return formats
2400
ce00af87 2401 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2402 urls = []
a107193e
S
2403 subtitles = {}
2404 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2405 src = textstream.get('src')
d413095f 2406 if not src or src in urls:
a107193e 2407 continue
d413095f 2408 urls.append(src)
df634be2 2409 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2410 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2411 subtitles.setdefault(lang, []).append({
2412 'url': src,
2413 'ext': ext,
2414 })
2415 return subtitles
63757032 2416
47a5cb77 2417 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
a076c1f9 2418 res = self._download_xml_handle(
47a5cb77 2419 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5 2420 'Unable to download xspf manifest', fatal=fatal)
a076c1f9 2421 if res is False:
942acef5 2422 return []
a076c1f9
E
2423
2424 xspf, urlh = res
2425 xspf_url = urlh.geturl()
2426
47a5cb77
S
2427 return self._parse_xspf(
2428 xspf, playlist_id, xspf_url=xspf_url,
2429 xspf_base_url=base_url(xspf_url))
8d6765cf 2430
47a5cb77 2431 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2432 NS_MAP = {
2433 'xspf': 'http://xspf.org/ns/0/',
2434 's1': 'http://static.streamone.nl/player/ns/0',
2435 }
2436
2437 entries = []
47a5cb77 2438 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2439 title = xpath_text(
98044462 2440 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2441 description = xpath_text(
2442 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2443 thumbnail = xpath_text(
2444 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2445 duration = float_or_none(
2446 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2447
47a5cb77
S
2448 formats = []
2449 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2450 format_url = urljoin(xspf_base_url, location.text)
2451 if not format_url:
2452 continue
2453 formats.append({
2454 'url': format_url,
2455 'manifest_url': xspf_url,
2456 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2457 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2458 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2459 })
8d6765cf
S
2460
2461 entries.append({
2462 'id': playlist_id,
2463 'title': title,
2464 'description': description,
2465 'thumbnail': thumbnail,
2466 'duration': duration,
2467 'formats': formats,
2468 })
2469 return entries
2470
171e59ed
F
2471 def _extract_mpd_formats(self, *args, **kwargs):
2472 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2473 if subs:
b5ae35ee 2474 self._report_ignoring_subs('DASH')
171e59ed
F
2475 return fmts
2476
2477 def _extract_mpd_formats_and_subtitles(
2478 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2479 fatal=True, data=None, headers={}, query={}):
0b5546c7 2480
2481 if self.get_param('ignore_no_formats_error'):
2482 fatal = False
2483
47a5cb77 2484 res = self._download_xml_handle(
1bac3455 2485 mpd_url, video_id,
37a3bb66 2486 note='Downloading MPD manifest' if note is None else note,
2487 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2488 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2489 if res is False:
171e59ed 2490 return [], {}
47a5cb77 2491 mpd_doc, urlh = res
c25720ef 2492 if mpd_doc is None:
171e59ed 2493 return [], {}
779da8e3
E
2494
2495 # We could have been redirected to a new url when we retrieved our mpd file.
2496 mpd_url = urlh.geturl()
2497 mpd_base_url = base_url(mpd_url)
1bac3455 2498
171e59ed 2499 return self._parse_mpd_formats_and_subtitles(
545cc85d 2500 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2501
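# Illustrative usage (not part of the original source): the DASH counterpart of
# the HLS helper; `mpd_url` and `video_id` are assumptions for the example.
#
#     formats, subtitles = self._extract_mpd_formats_and_subtitles(
#         mpd_url, video_id, mpd_id='dash', fatal=False)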
171e59ed
F
2502 def _parse_mpd_formats(self, *args, **kwargs):
2503 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2504 if subs:
b5ae35ee 2505 self._report_ignoring_subs('DASH')
171e59ed
F
2506 return fmts
2507
2508 def _parse_mpd_formats_and_subtitles(
2509 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2510 """
2511 Parse formats from MPD manifest.
2512 References:
2513 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2514 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2515 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2516 """
a06916d9 2517 if not self.get_param('dynamic_mpd', True):
78895bd3 2518 if mpd_doc.get('type') == 'dynamic':
171e59ed 2519 return [], {}
2d2fa82d 2520
91cb6b50 2521 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2522
2523 def _add_ns(path):
2524 return self._xpath_ns(path, namespace)
2525
675d0016 2526 def is_drm_protected(element):
2527 return element.find(_add_ns('ContentProtection')) is not None
2528
1bac3455 2529 def extract_multisegment_info(element, ms_parent_info):
2530 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2531
2532 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2533 # common attributes and elements. We will only extract what is
2534 # relevant for us.
2535 def extract_common(source):
2536 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2537 if segment_timeline is not None:
2538 s_e = segment_timeline.findall(_add_ns('S'))
2539 if s_e:
2540 ms_info['total_number'] = 0
2541 ms_info['s'] = []
2542 for s in s_e:
2543 r = int(s.get('r', 0))
2544 ms_info['total_number'] += 1 + r
2545 ms_info['s'].append({
2546 't': int(s.get('t', 0)),
2547 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2548 'd': int(s.attrib['d']),
2549 'r': r,
2550 })
2551 start_number = source.get('startNumber')
2552 if start_number:
2553 ms_info['start_number'] = int(start_number)
2554 timescale = source.get('timescale')
2555 if timescale:
2556 ms_info['timescale'] = int(timescale)
2557 segment_duration = source.get('duration')
2558 if segment_duration:
48504785 2559 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2560
2561 def extract_Initialization(source):
2562 initialization = source.find(_add_ns('Initialization'))
2563 if initialization is not None:
2564 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2565
f14be228 2566 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2567 if segment_list is not None:
b4c1d6e8
S
2568 extract_common(segment_list)
2569 extract_Initialization(segment_list)
f14be228 2570 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2571 if segment_urls_e:
2572 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2573 else:
f14be228 2574 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2575 if segment_template is not None:
b4c1d6e8 2576 extract_common(segment_template)
e228616c
S
2577 media = segment_template.get('media')
2578 if media:
2579 ms_info['media'] = media
1bac3455 2580 initialization = segment_template.get('initialization')
2581 if initialization:
e228616c 2582 ms_info['initialization'] = initialization
1bac3455 2583 else:
b4c1d6e8 2584 extract_Initialization(segment_template)
1bac3455 2585 return ms_info
b323e170 2586
1bac3455 2587 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
6251555f 2588 formats, subtitles = [], {}
234416e4 2589 stream_numbers = collections.defaultdict(int)
f14be228 2590 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2591 period_duration = parse_duration(period.get('duration')) or mpd_duration
2592 period_ms_info = extract_multisegment_info(period, {
2593 'start_number': 1,
2594 'timescale': 1,
2595 })
f14be228 2596 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2597 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2598 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2599 representation_attrib = adaptation_set.attrib.copy()
2600 representation_attrib.update(representation.attrib)
f0948348 2601 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2602 mime_type = representation_attrib['mimeType']
171e59ed
F
2603 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2604
21633673 2605 codec_str = representation_attrib.get('codecs', '')
2606 # Some kind of binary subtitle found in some youtube livestreams
2607 if mime_type == 'application/x-rawcc':
2608 codecs = {'scodec': codec_str}
2609 else:
2610 codecs = parse_codecs(codec_str)
be2fc5b2 2611 if content_type not in ('video', 'audio', 'text'):
2612 if mime_type == 'image/jpeg':
a8731fcc 2613 content_type = mime_type
21633673 2614 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2615 content_type = 'video'
21633673 2616 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2617 content_type = 'audio'
3fe75fdc 2618 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2619 content_type = 'text'
6993f78d 2620 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2621 content_type = 'text'
cdb19aa4 2622 else:
be2fc5b2 2623 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2624 continue
2625
2626 base_url = ''
2627 for element in (representation, adaptation_set, period, mpd_doc):
2628 base_url_e = element.find(_add_ns('BaseURL'))
47046464 2629 if try_call(lambda: base_url_e.text) is not None:
be2fc5b2 2630 base_url = base_url_e.text + base_url
2631 if re.match(r'^https?://', base_url):
2632 break
f9cc0161 2633 if mpd_base_url and base_url.startswith('/'):
14f25df2 2634 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
f9cc0161
D
2635 elif mpd_base_url and not re.match(r'^https?://', base_url):
2636 if not mpd_base_url.endswith('/'):
be2fc5b2 2637 mpd_base_url += '/'
2638 base_url = mpd_base_url + base_url
2639 representation_id = representation_attrib.get('id')
2640 lang = representation_attrib.get('lang')
2641 url_el = representation.find(_add_ns('BaseURL'))
2642 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2643 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2644 if representation_id is not None:
2645 format_id = representation_id
2646 else:
2647 format_id = content_type
2648 if mpd_id:
2649 format_id = mpd_id + '-' + format_id
2650 if content_type in ('video', 'audio'):
2651 f = {
2652 'format_id': format_id,
2653 'manifest_url': mpd_url,
2654 'ext': mimetype2ext(mime_type),
2655 'width': int_or_none(representation_attrib.get('width')),
2656 'height': int_or_none(representation_attrib.get('height')),
2657 'tbr': float_or_none(bandwidth, 1000),
2658 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2659 'fps': int_or_none(representation_attrib.get('frameRate')),
2660 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2661 'format_note': 'DASH %s' % content_type,
2662 'filesize': filesize,
2663 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2664 **codecs
be2fc5b2 2665 }
be2fc5b2 2666 elif content_type == 'text':
2667 f = {
2668 'ext': mimetype2ext(mime_type),
2669 'manifest_url': mpd_url,
2670 'filesize': filesize,
2671 }
2672 elif content_type == 'image/jpeg':
2673 # See test case in VikiIE
2674 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2675 f = {
2676 'format_id': format_id,
2677 'ext': 'mhtml',
2678 'manifest_url': mpd_url,
2679 'format_note': 'DASH storyboards (jpeg)',
2680 'acodec': 'none',
2681 'vcodec': 'none',
2682 }
88acdbc2 2683 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2684 f['has_drm'] = True
be2fc5b2 2685 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2686
2687 def prepare_template(template_name, identifiers):
2688 tmpl = representation_ms_info[template_name]
0cb0fdbb 2689 if representation_id is not None:
2690 tmpl = tmpl.replace('$RepresentationID$', representation_id)
be2fc5b2 2691 # First off, % characters outside $...$ templates
 2692 # must be escaped by doubling for proper processing
 2693 # by the % operator string formatting used further below (see
2694 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2695 t = ''
2696 in_template = False
2697 for c in tmpl:
2698 t += c
2699 if c == '$':
2700 in_template = not in_template
2701 elif c == '%' and not in_template:
eca1f0d1 2702 t += c
be2fc5b2 2703 # Next, $...$ templates are translated to their
2704 # %(...) counterparts to be used with % operator
be2fc5b2 2705 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2706 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2707 t = t.replace('$$', '$')  # assign the result; a bare replace() is a no-op
2708 return t
2709
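# Illustrative sketch (not part of the parser; the template below is hypothetical):
# how a media template is rewritten by the two re.sub() calls in prepare_template()
# above and then expanded with the % operator ('re' is imported at the top of this
# module).
_example_tmpl = 'seg_video_1_$Number%05d$.m4s'
_example_tmpl = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', _example_tmpl)
_example_tmpl = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', _example_tmpl)
assert _example_tmpl % {'Number': 7} == 'seg_video_1_00007.m4s'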
2710 # @initialization is a regular template like @media one
2711 # so it should be handled just the same way (see
2712 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2713 if 'initialization' in representation_ms_info:
2714 initialization_template = prepare_template(
2715 'initialization',
2716 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2717 # $Time$ shall not be included for @initialization thus
2718 # only $Bandwidth$ remains
2719 ('Bandwidth', ))
2720 representation_ms_info['initialization_url'] = initialization_template % {
2721 'Bandwidth': bandwidth,
2722 }
2723
2724 def location_key(location):
2725 return 'url' if re.match(r'^https?://', location) else 'path'
2726
2727 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2728
2729 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2730 media_location_key = location_key(media_template)
2731
2732 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2733 # can't be used at the same time
2734 if '%(Number' in media_template and 's' not in representation_ms_info:
2735 segment_duration = None
2736 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2737 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2738 representation_ms_info['total_number'] = int(math.ceil(
2739 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2740 representation_ms_info['fragments'] = [{
2741 media_location_key: media_template % {
2742 'Number': segment_number,
2743 'Bandwidth': bandwidth,
2744 },
2745 'duration': segment_duration,
2746 } for segment_number in range(
2747 representation_ms_info['start_number'],
2748 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2749 else:
2750 # $Number*$ or $Time$ in media template with S list available
2751 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2752 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2753 representation_ms_info['fragments'] = []
2754 segment_time = 0
2755 segment_d = None
2756 segment_number = representation_ms_info['start_number']
2757
2758 def add_segment_url():
2759 segment_url = media_template % {
2760 'Time': segment_time,
2761 'Bandwidth': bandwidth,
2762 'Number': segment_number,
2763 }
2764 representation_ms_info['fragments'].append({
2765 media_location_key: segment_url,
2766 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2767 })
2768
2769 for num, s in enumerate(representation_ms_info['s']):
2770 segment_time = s.get('t') or segment_time
2771 segment_d = s['d']
2772 add_segment_url()
2773 segment_number += 1
2774 for r in range(s.get('r', 0)):
2775 segment_time += segment_d
f0948348 2776 add_segment_url()
b4c1d6e8 2777 segment_number += 1
be2fc5b2 2778 segment_time += segment_d
2779 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
62b58c09
L
2780 # No media template,
2781 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
be2fc5b2 2782 # or any YouTube dashsegments video
2783 fragments = []
2784 segment_index = 0
2785 timescale = representation_ms_info['timescale']
2786 for s in representation_ms_info['s']:
2787 duration = float_or_none(s['d'], timescale)
2788 for r in range(s.get('r', 0) + 1):
2789 segment_uri = representation_ms_info['segment_urls'][segment_index]
2790 fragments.append({
2791 location_key(segment_uri): segment_uri,
2792 'duration': duration,
2793 })
2794 segment_index += 1
2795 representation_ms_info['fragments'] = fragments
2796 elif 'segment_urls' in representation_ms_info:
2797 # Segment URLs with no SegmentTimeline
62b58c09 2798 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
be2fc5b2 2799 # https://github.com/ytdl-org/youtube-dl/pull/14844
2800 fragments = []
2801 segment_duration = float_or_none(
2802 representation_ms_info['segment_duration'],
2803 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2804 for segment_url in representation_ms_info['segment_urls']:
2805 fragment = {
2806 location_key(segment_url): segment_url,
2807 }
2808 if segment_duration:
2809 fragment['duration'] = segment_duration
2810 fragments.append(fragment)
2811 representation_ms_info['fragments'] = fragments
2812 # If a 'fragments' key is present, then we correctly recognized fragmented media.
2813 # Otherwise we will assume unfragmented media with direct access. Technically, this
2814 # assumption is not necessarily correct, since we may simply have no support for
2815 # some forms of fragmented media renditions yet; for now we use this fallback.
2816 if 'fragments' in representation_ms_info:
2817 f.update({
2818 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2819 'url': mpd_url or base_url,
2820 'fragment_base_url': base_url,
2821 'fragments': [],
2822 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2823 })
2824 if 'initialization_url' in representation_ms_info:
2825 initialization_url = representation_ms_info['initialization_url']
2826 if not f.get('url'):
2827 f['url'] = initialization_url
2828 f['fragments'].append({location_key(initialization_url): initialization_url})
2829 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 2830 if not period_duration:
2831 period_duration = try_get(
2832 representation_ms_info,
2833 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 2834 else:
be2fc5b2 2835 # Assuming direct URL to unfragmented media.
2836 f['url'] = base_url
234416e4 2837 if content_type in ('video', 'audio', 'image/jpeg'):
2838 f['manifest_stream_number'] = stream_numbers[f['url']]
2839 stream_numbers[f['url']] += 1
be2fc5b2 2840 formats.append(f)
2841 elif content_type == 'text':
2842 subtitles.setdefault(lang or 'und', []).append(f)
2843
171e59ed 2844 return formats, subtitles
17b598d3 2845
fd76a142
F
2846 def _extract_ism_formats(self, *args, **kwargs):
2847 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2848 if subs:
b5ae35ee 2849 self._report_ignoring_subs('ISM')
fd76a142
F
2850 return fmts
2851
2852 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
0b5546c7 2853 if self.get_param('ignore_no_formats_error'):
2854 fatal = False
2855
47a5cb77 2856 res = self._download_xml_handle(
b2758123 2857 ism_url, video_id,
37a3bb66 2858 note='Downloading ISM manifest' if note is None else note,
2859 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2860 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2861 if res is False:
fd76a142 2862 return [], {}
47a5cb77 2863 ism_doc, urlh = res
13b08034 2864 if ism_doc is None:
fd76a142 2865 return [], {}
b2758123 2866
fd76a142 2867 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 2868
fd76a142 2869 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2870 """
2871 Parse formats from ISM manifest.
2872 References:
2873 1. [MS-SSTR]: Smooth Streaming Protocol,
2874 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2875 """
06869367 2876 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 2877 return [], {}
b2758123 2878
b2758123
RA
2879 duration = int(ism_doc.attrib['Duration'])
2880 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2881
2882 formats = []
fd76a142 2883 subtitles = {}
b2758123
RA
2884 for stream in ism_doc.findall('StreamIndex'):
2885 stream_type = stream.get('Type')
fd76a142 2886 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
2887 continue
2888 url_pattern = stream.attrib['Url']
2889 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2890 stream_name = stream.get('Name')
fd76a142 2891 stream_language = stream.get('Language', 'und')
b2758123 2892 for track in stream.findall('QualityLevel'):
81b6102d 2893 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2894 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
b2758123 2895 # TODO: add support for WVC1 and WMAP
81b6102d 2896 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
b2758123
RA
2897 self.report_warning('%s is not a supported codec' % fourcc)
2898 continue
2899 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2900 # [1] does not mention Width and Height attributes. However,
2901 # they're often present while MaxWidth and MaxHeight are
2902 # missing, so should be used as fallbacks
2903 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2904 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2905 sampling_rate = int_or_none(track.get('SamplingRate'))
2906
2907 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
14f25df2 2908 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
b2758123
RA
2909
2910 fragments = []
2911 fragment_ctx = {
2912 'time': 0,
2913 }
2914 stream_fragments = stream.findall('c')
2915 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2916 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2917 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2918 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2919 if not fragment_ctx['duration']:
2920 try:
2921 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2922 except IndexError:
2923 next_fragment_time = duration
1616f9b4 2924 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2925 for _ in range(fragment_repeat):
2926 fragments.append({
14f25df2 2927 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2928 'duration': fragment_ctx['duration'] / stream_timescale,
2929 })
2930 fragment_ctx['time'] += fragment_ctx['duration']
2931
fd76a142
F
2932 if stream_type == 'text':
2933 subtitles.setdefault(stream_language, []).append({
2934 'ext': 'ismt',
2935 'protocol': 'ism',
2936 'url': ism_url,
2937 'manifest_url': ism_url,
2938 'fragments': fragments,
2939 '_download_params': {
2940 'stream_type': stream_type,
2941 'duration': duration,
2942 'timescale': stream_timescale,
2943 'fourcc': fourcc,
2944 'language': stream_language,
2945 'codec_private_data': track.get('CodecPrivateData'),
2946 }
2947 })
2948 elif stream_type in ('video', 'audio'):
2949 formats.append({
34921b43 2950 'format_id': join_nonempty(ism_id, stream_name, tbr),
fd76a142
F
2951 'url': ism_url,
2952 'manifest_url': ism_url,
2953 'ext': 'ismv' if stream_type == 'video' else 'isma',
2954 'width': width,
2955 'height': height,
2956 'tbr': tbr,
2957 'asr': sampling_rate,
2958 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2959 'acodec': 'none' if stream_type == 'video' else fourcc,
2960 'protocol': 'ism',
2961 'fragments': fragments,
88acdbc2 2962 'has_drm': ism_doc.find('Protection') is not None,
fd76a142
F
2963 '_download_params': {
2964 'stream_type': stream_type,
2965 'duration': duration,
2966 'timescale': stream_timescale,
2967 'width': width or 0,
2968 'height': height or 0,
2969 'fourcc': fourcc,
2970 'language': stream_language,
2971 'codec_private_data': track.get('CodecPrivateData'),
2972 'sampling_rate': sampling_rate,
2973 'channels': int_or_none(track.get('Channels', 2)),
2974 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2975 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2976 },
2977 })
2978 return formats, subtitles
b2758123 2979
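# Illustrative sketch (not part of the parser; the pattern below is hypothetical,
# not taken from a real manifest): the Url attribute of a Smooth Streaming
# StreamIndex contains {bitrate} and {start time} placeholders that the code
# above fills per QualityLevel and per fragment.
_example_pattern = 'QualityLevels({bitrate})/Fragments(video={start time})'
_example_track_url = re.sub(r'{[Bb]itrate}', '128000', _example_pattern)
_example_fragment_url = re.sub(r'{start[ _]time}', '0', _example_track_url)
assert _example_fragment_url == 'QualityLevels(128000)/Fragments(video=0)'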
079a7cfc 2980 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
6780154e
S
2981 def absolute_url(item_url):
2982 return urljoin(base_url, item_url)
59bbe491 2983
2984 def parse_content_type(content_type):
2985 if not content_type:
2986 return {}
2987 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2988 if ctr:
2989 mimetype, codecs = ctr.groups()
2990 f = parse_codecs(codecs)
2991 f['ext'] = mimetype2ext(mimetype)
2992 return f
2993 return {}
2994
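# Illustrative sketch (not part of the extractor; the type attribute below is made
# up): how the regex in parse_content_type() above splits a typical HTML5 'type'
# attribute into a MIME type and a codecs string before parse_codecs() is applied.
_example_type = 'video/mp4; codecs="avc1.64001f, mp4a.40.2"'
_example_m = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', _example_type)
assert _example_m.group('mimetype') == 'video/mp4'
assert _example_m.group('codecs') == 'avc1.64001f, mp4a.40.2'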
222a2308
L
2995 def _media_formats(src, cur_media_type, type_info=None):
2996 type_info = type_info or {}
520251c0 2997 full_url = absolute_url(src)
82889d4a 2998 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 2999 if ext == 'm3u8':
520251c0
YCH
3000 is_plain_url = False
3001 formats = self._extract_m3u8_formats(
ad120ae1 3002 full_url, video_id, ext='mp4',
eeb0a956 3003 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
f983b875 3004 preference=preference, quality=quality, fatal=False)
87a449c1
S
3005 elif ext == 'mpd':
3006 is_plain_url = False
3007 formats = self._extract_mpd_formats(
b359e977 3008 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
3009 else:
3010 is_plain_url = True
3011 formats = [{
3012 'url': full_url,
3013 'vcodec': 'none' if cur_media_type == 'audio' else None,
222a2308 3014 'ext': ext,
520251c0
YCH
3015 }]
3016 return is_plain_url, formats
3017
59bbe491 3018 entries = []
4328ddf8 3019 # amp-video and amp-audio are very similar to their HTML5 counterparts
962ffcf8 3020 # so we will include them right here (see
4328ddf8 3021 # https://www.ampproject.org/docs/reference/components/amp-video)
29f7c58a 3022 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3023 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3024 media_tags = [(media_tag, media_tag_name, media_type, '')
3025 for media_tag, media_tag_name, media_type
3026 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2aec7256
S
3027 media_tags.extend(re.findall(
3028 # We only allow video|audio followed by a whitespace or '>'.
3029 # Allowing more characters may result in a significant slowdown (see
62b58c09
L
3030 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3031 # e.g. http://www.porntrex.com/maps/videositemap.xml).
29f7c58a 3032 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3033 for media_tag, _, media_type, media_content in media_tags:
59bbe491 3034 media_info = {
3035 'formats': [],
3036 'subtitles': {},
3037 }
3038 media_attributes = extract_attributes(media_tag)
bfbecd11 3039 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
59bbe491 3040 if src:
222a2308
L
3041 f = parse_content_type(media_attributes.get('type'))
3042 _, formats = _media_formats(src, media_type, f)
520251c0 3043 media_info['formats'].extend(formats)
6780154e 3044 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 3045 if media_content:
3046 for source_tag in re.findall(r'<source[^>]+>', media_content):
d493f15c
S
3047 s_attr = extract_attributes(source_tag)
3048 # data-video-src and data-src are non-standard but seen
3049 # several times in the wild
bfbecd11 3050 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
59bbe491 3051 if not src:
3052 continue
d493f15c 3053 f = parse_content_type(s_attr.get('type'))
868f79db 3054 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 3055 if is_plain_url:
d493f15c
S
3056 # width, height, res, label and title attributes are
3057 # all non-standard but seen several times in the wild
3058 labels = [
3059 s_attr.get(lbl)
3060 for lbl in ('label', 'title')
3061 if str_or_none(s_attr.get(lbl))
3062 ]
3063 width = int_or_none(s_attr.get('width'))
3089bc74
S
3064 height = (int_or_none(s_attr.get('height'))
3065 or int_or_none(s_attr.get('res')))
d493f15c
S
3066 if not width or not height:
3067 for lbl in labels:
3068 resolution = parse_resolution(lbl)
3069 if not resolution:
3070 continue
3071 width = width or resolution.get('width')
3072 height = height or resolution.get('height')
3073 for lbl in labels:
3074 tbr = parse_bitrate(lbl)
3075 if tbr:
3076 break
3077 else:
3078 tbr = None
1ed45499 3079 f.update({
d493f15c
S
3080 'width': width,
3081 'height': height,
3082 'tbr': tbr,
3083 'format_id': s_attr.get('label') or s_attr.get('title'),
1ed45499 3084 })
520251c0
YCH
3085 f.update(formats[0])
3086 media_info['formats'].append(f)
3087 else:
3088 media_info['formats'].extend(formats)
59bbe491 3089 for track_tag in re.findall(r'<track[^>]+>', media_content):
3090 track_attributes = extract_attributes(track_tag)
3091 kind = track_attributes.get('kind')
5968d7d2 3092 if not kind or kind in ('subtitles', 'captions'):
f856816b 3093 src = strip_or_none(track_attributes.get('src'))
59bbe491 3094 if not src:
3095 continue
3096 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3097 media_info['subtitles'].setdefault(lang, []).append({
3098 'url': absolute_url(src),
3099 })
5e8e2fa5
S
3100 for f in media_info['formats']:
3101 f.setdefault('http_headers', {})['Referer'] = base_url
5968d7d2 3102 if media_info['formats'] or media_info['subtitles']:
59bbe491 3103 entries.append(media_info)
3104 return entries
3105
f6a1d69a
F
3106 def _extract_akamai_formats(self, *args, **kwargs):
3107 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3108 if subs:
b5ae35ee 3109 self._report_ignoring_subs('akamai')
f6a1d69a
F
3110 return fmts
3111
3112 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3113 signed = 'hdnea=' in manifest_url
3114 if not signed:
3115 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3116 manifest_url = re.sub(
3117 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3118 '', manifest_url).strip('?')
3119
c7c43a93 3120 formats = []
f6a1d69a 3121 subtitles = {}
70c5802b 3122
e71a4509 3123 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3124 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3125 hds_host = hosts.get('hds')
3126 if hds_host:
3127 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3128 if 'hdcore=' not in f4m_url:
3129 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3130 f4m_formats = self._extract_f4m_formats(
3131 f4m_url, video_id, f4m_id='hds', fatal=False)
3132 for entry in f4m_formats:
3133 entry.update({'extra_param_to_segment_url': hdcore_sign})
3134 formats.extend(f4m_formats)
70c5802b 3135
c4251b9a
RA
3136 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3137 hls_host = hosts.get('hls')
3138 if hls_host:
3139 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3140 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3141 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3142 m3u8_id='hls', fatal=False)
3143 formats.extend(m3u8_formats)
f6a1d69a 3144 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3145
3146 http_host = hosts.get('http')
29f7c58a 3147 if http_host and m3u8_formats and not signed:
3148 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3149 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3150 qualities_length = len(qualities)
29f7c58a 3151 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3152 i = 0
29f7c58a 3153 for f in m3u8_formats:
3154 if f['vcodec'] != 'none':
70c5802b 3155 for protocol in ('http', 'https'):
3156 http_f = f.copy()
3157 del http_f['manifest_url']
3158 http_url = re.sub(
86e5f3ed 3159 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3160 http_f.update({
3161 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3162 'url': http_url,
3163 'protocol': protocol,
3164 })
29f7c58a 3165 formats.append(http_f)
70c5802b 3166 i += 1
70c5802b 3167
f6a1d69a 3168 return formats, subtitles
c7c43a93 3169
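# Illustrative sketch (not part of the extractor; host and path below are
# hypothetical): how the HDS manifest URL is derived from an Akamai HLS manifest
# URL by the re.sub()/replace() pair above.
_example_m3u8 = 'https://example-vh.akamaihd.net/i/some/stream/master.m3u8'
_example_f4m = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', _example_m3u8).replace('/master.m3u8', '/manifest.f4m')
assert _example_f4m == 'https://example-vh.akamaihd.net/z/some/stream/manifest.f4m'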
6ad02195 3170 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3171 query = urllib.parse.urlparse(url).query
6ad02195 3172 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3173 mobj = re.search(
3174 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3175 url_base = mobj.group('url')
3176 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3177 formats = []
044eeb14
S
3178
3179 def manifest_url(manifest):
86e5f3ed 3180 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3181 if query:
3182 m_url += '?%s' % query
3183 return m_url
3184
6ad02195
RA
3185 if 'm3u8' not in skip_protocols:
3186 formats.extend(self._extract_m3u8_formats(
044eeb14 3187 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3188 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3189 if 'f4m' not in skip_protocols:
3190 formats.extend(self._extract_f4m_formats(
044eeb14 3191 manifest_url('manifest.f4m'),
6ad02195 3192 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3193 if 'dash' not in skip_protocols:
3194 formats.extend(self._extract_mpd_formats(
044eeb14 3195 manifest_url('manifest.mpd'),
0384932e 3196 video_id, mpd_id='dash', fatal=False))
6ad02195 3197 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3198 if 'smil' not in skip_protocols:
3199 rtmp_formats = self._extract_smil_formats(
044eeb14 3200 manifest_url('jwplayer.smil'),
6ad02195
RA
3201 video_id, fatal=False)
3202 for rtmp_format in rtmp_formats:
3203 rtsp_format = rtmp_format.copy()
3204 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3205 del rtsp_format['play_path']
3206 del rtsp_format['ext']
3207 rtsp_format.update({
3208 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3209 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3210 'protocol': 'rtsp',
3211 })
3212 formats.extend([rtmp_format, rtsp_format])
3213 else:
3214 for protocol in ('rtmp', 'rtsp'):
3215 if protocol not in skip_protocols:
3216 formats.append({
86e5f3ed 3217 'url': f'{protocol}:{url_base}',
6ad02195
RA
3218 'format_id': protocol,
3219 'protocol': protocol,
3220 })
3221 return formats
3222
c73e330e 3223 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3224 mobj = re.search(
32a84bcf 3225 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
a4a554a7
YCH
3226 webpage)
3227 if mobj:
c73e330e
RU
3228 try:
3229 jwplayer_data = self._parse_json(mobj.group('options'),
3230 video_id=video_id,
3231 transform_source=transform_source)
3232 except ExtractorError:
3233 pass
3234 else:
3235 if isinstance(jwplayer_data, dict):
3236 return jwplayer_data
a4a554a7
YCH
3237
3238 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3239 jwplayer_data = self._find_jwplayer_data(
3240 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3241 return self._parse_jwplayer_data(
3242 jwplayer_data, video_id, *args, **kwargs)
3243
3244 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3245 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
a4a554a7 3246 entries = []
32a84bcf
SS
3247 if not isinstance(jwplayer_data, dict):
3248 return entries
a4a554a7 3249
32a84bcf
SS
3250 playlist_items = jwplayer_data.get('playlist')
3251 # JWPlayer backward compatibility: single playlist item/flattened playlists
a4a554a7 3252 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
32a84bcf
SS
3253 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3254 if not isinstance(playlist_items, list):
3255 playlist_items = (playlist_items or jwplayer_data, )
a4a554a7 3256
32a84bcf
SS
3257 for video_data in playlist_items:
3258 if not isinstance(video_data, dict):
3259 continue
a4a554a7
YCH
3260 # JWPlayer backward compatibility: flattened sources
3261 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3262 if 'sources' not in video_data:
3263 video_data['sources'] = [video_data]
3264
3265 this_video_id = video_id or video_data['mediaid']
3266
1a2192cb
S
3267 formats = self._parse_jwplayer_formats(
3268 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3269 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
3270
3271 subtitles = {}
3272 tracks = video_data.get('tracks')
3273 if tracks and isinstance(tracks, list):
3274 for track in tracks:
96a2daa1
S
3275 if not isinstance(track, dict):
3276 continue
f4b74272 3277 track_kind = track.get('kind')
14f25df2 3278 if not track_kind or not isinstance(track_kind, str):
f4b74272
S
3279 continue
3280 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
3281 continue
3282 track_url = urljoin(base_url, track.get('file'))
3283 if not track_url:
3284 continue
3285 subtitles.setdefault(track.get('label') or 'en', []).append({
3286 'url': self._proto_relative_url(track_url)
3287 })
3288
50d808f5 3289 entry = {
a4a554a7 3290 'id': this_video_id,
50d808f5 3291 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
f81dd65b 3292 'description': clean_html(video_data.get('description')),
6945b9e7 3293 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
a4a554a7
YCH
3294 'timestamp': int_or_none(video_data.get('pubdate')),
3295 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3296 'subtitles': subtitles,
32a84bcf
SS
3297 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3298 'genre': clean_html(video_data.get('genre')),
3299 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3300 'season_number': int_or_none(video_data.get('season')),
3301 'episode_number': int_or_none(video_data.get('episode')),
3302 'release_year': int_or_none(video_data.get('releasedate')),
3303 'age_limit': int_or_none(video_data.get('age_restriction')),
50d808f5
RA
3304 }
3305 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3306 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3307 entry.update({
3308 '_type': 'url_transparent',
3309 'url': formats[0]['url'],
3310 })
3311 else:
50d808f5
RA
3312 entry['formats'] = formats
3313 entries.append(entry)
a4a554a7
YCH
3314 if len(entries) == 1:
3315 return entries[0]
3316 else:
3317 return self.playlist_result(entries)
3318
ed0cf9b3
S
3319 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3320 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
32a84bcf 3321 urls = set()
ed0cf9b3 3322 formats = []
1a2192cb 3323 for source in jwplayer_sources_data:
0a268c6e
S
3324 if not isinstance(source, dict):
3325 continue
6945b9e7
RA
3326 source_url = urljoin(
3327 base_url, self._proto_relative_url(source.get('file')))
3328 if not source_url or source_url in urls:
bf1b87cd 3329 continue
32a84bcf 3330 urls.add(source_url)
ed0cf9b3
S
3331 source_type = source.get('type') or ''
3332 ext = mimetype2ext(source_type) or determine_ext(source_url)
32a84bcf 3333 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
ed0cf9b3 3334 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
3335 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3336 m3u8_id=m3u8_id, fatal=False))
32a84bcf 3337 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
ed0cf9b3
S
3338 formats.extend(self._extract_mpd_formats(
3339 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
3340 elif ext == 'smil':
3341 formats.extend(self._extract_smil_formats(
3342 source_url, video_id, fatal=False))
ed0cf9b3 3343 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
3344 elif source_type.startswith('audio') or ext in (
3345 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
3346 formats.append({
3347 'url': source_url,
3348 'vcodec': 'none',
3349 'ext': ext,
3350 })
3351 else:
32a84bcf 3352 format_id = str_or_none(source.get('label'))
ed0cf9b3 3353 height = int_or_none(source.get('height'))
32a84bcf 3354 if height is None and format_id:
ed0cf9b3 3355 # Often no height is provided, but there is a label in a
0236cd0d 3356 # format like "1080p", "720p SD", or 1080.
32a84bcf 3357 height = parse_resolution(format_id).get('height')
ed0cf9b3
S
3358 a_format = {
3359 'url': source_url,
3360 'width': int_or_none(source.get('width')),
3361 'height': height,
d3a3d7f0 3362 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3363 'filesize': int_or_none(source.get('filesize')),
ed0cf9b3 3364 'ext': ext,
32a84bcf 3365 'format_id': format_id
ed0cf9b3
S
3366 }
3367 if source_url.startswith('rtmp'):
3368 a_format['ext'] = 'flv'
ed0cf9b3
S
3369 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3370 # of jwplayer.flash.swf
3371 rtmp_url_parts = re.split(
3372 r'((?:mp4|mp3|flv):)', source_url, 1)
3373 if len(rtmp_url_parts) == 3:
3374 rtmp_url, prefix, play_path = rtmp_url_parts
3375 a_format.update({
3376 'url': rtmp_url,
3377 'play_path': prefix + play_path,
3378 })
3379 if rtmp_params:
3380 a_format.update(rtmp_params)
3381 formats.append(a_format)
3382 return formats
3383
f4b1c7ad 3384 def _live_title(self, name):
39ca3b5c 3385 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3386 return name
f4b1c7ad 3387
b14f3a4c
PH
3388 def _int(self, v, name, fatal=False, **kwargs):
3389 res = int_or_none(v, **kwargs)
b14f3a4c 3390 if res is None:
86e5f3ed 3391 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3392 if fatal:
3393 raise ExtractorError(msg)
3394 else:
6a39ee13 3395 self.report_warning(msg)
b14f3a4c
PH
3396 return res
3397
3398 def _float(self, v, name, fatal=False, **kwargs):
3399 res = float_or_none(v, **kwargs)
3400 if res is None:
86e5f3ed 3401 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3402 if fatal:
3403 raise ExtractorError(msg)
3404 else:
6a39ee13 3405 self.report_warning(msg)
b14f3a4c
PH
3406 return res
3407
40e41780
TF
3408 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3409 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3410 cookie = http.cookiejar.Cookie(
4ed2d7b7 3411 0, name, value, port, port is not None, domain, True,
40e41780
TF
3412 domain.startswith('.'), path, True, secure, expire_time,
3413 discard, None, None, rest)
9809740b 3414 self.cookiejar.set_cookie(cookie)
42939b61 3415
799207e8 3416 def _get_cookies(self, url):
ac668111 3417 """ Return a http.cookies.SimpleCookie with the cookies for the url """
8817a80d 3418 return LenientSimpleCookie(self._downloader._calc_cookies(url))
799207e8 3419
e3c1266f 3420 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3421 """
3422 Apply the first Set-Cookie header instead of the last. Experimental.
3423
3424 Some sites (e.g. [1-3]) may serve two cookies under the same name
3425 in the Set-Cookie header and expect the first (old) one to be set
3426 rather than the second (new). However, per RFC 6265, the newer
3427 cookie should be the one stored, which is what actually happens.
3428 We work around this issue by resetting the cookie to
3429 the first one manually.
3430 1. https://new.vk.com/
3431 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3432 3. https://learning.oreilly.com/
3433 """
e3c1266f
S
3434 for header, cookies in url_handle.headers.items():
3435 if header.lower() != 'set-cookie':
3436 continue
cfb0511d 3437 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3438 cookie_value = re.search(
3439 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3440 if cookie_value:
3441 value, domain = cookie_value.groups()
3442 self._set_cookie(domain, cookie, value)
3443 break
3444
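# Illustrative sketch (not part of the extractor; the header value below is made
# up): the regex used by _apply_first_set_cookie_header() above picks the *first*
# value of a cookie that appears twice in one folded Set-Cookie header.
_example_header = 'sid=old; Domain=example.com, sid=new; Domain=example.com'
_example_m = re.search(r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % 'sid', _example_header)
assert _example_m.groups() == ('old', 'example.com')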
82d02080 3445 @classmethod
3446 def get_testcases(cls, include_onlymatching=False):
6368e2e6 3447 # Do not look in super classes
3448 t = vars(cls).get('_TEST')
05900629 3449 if t:
82d02080 3450 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
05900629
PH
3451 tests = [t]
3452 else:
6368e2e6 3453 tests = vars(cls).get('_TESTS', [])
05900629
PH
3454 for t in tests:
3455 if not include_onlymatching and t.get('only_matching', False):
3456 continue
82d02080 3457 t['name'] = cls.ie_key()
05900629 3458 yield t
e756f45b
M
3459 if getattr(cls, '__wrapped__', None):
3460 yield from cls.__wrapped__.get_testcases(include_onlymatching)
05900629 3461
f2e8dbcc 3462 @classmethod
3463 def get_webpage_testcases(cls):
6368e2e6 3464 tests = vars(cls).get('_WEBPAGE_TESTS', [])
f2e8dbcc 3465 for t in tests:
3466 t['name'] = cls.ie_key()
e756f45b
M
3467 yield t
3468 if getattr(cls, '__wrapped__', None):
3469 yield from cls.__wrapped__.get_webpage_testcases()
f2e8dbcc 3470
6368e2e6 3471 @classproperty(cache=True)
24146491 3472 def age_limit(cls):
3473 """Get age limit from the testcases"""
3474 return max(traverse_obj(
f2e8dbcc 3475 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
24146491 3476 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3477
171a31db 3478 @classproperty(cache=True)
3479 def _RETURN_TYPE(cls):
3480 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3481 tests = tuple(cls.get_testcases(include_onlymatching=False))
3482 if not tests:
3483 return None
3484 elif not any(k.startswith('playlist') for test in tests for k in test):
3485 return 'video'
3486 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3487 return 'playlist'
3488 return 'any'
3489
3490 @classmethod
3491 def is_single_video(cls, url):
3492 """Returns whether the URL is of a single video, None if unknown"""
3493 assert cls.suitable(url), 'The URL must be suitable for the extractor'
3494 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3495
82d02080 3496 @classmethod
3497 def is_suitable(cls, age_limit):
24146491 3498 """Test whether the extractor is generally suitable for the given age limit"""
3499 return not age_restricted(cls.age_limit, age_limit)
05900629 3500
82d02080 3501 @classmethod
3502 def description(cls, *, markdown=True, search_examples=None):
8dcce6a8 3503 """Description of the extractor"""
3504 desc = ''
82d02080 3505 if cls._NETRC_MACHINE:
8dcce6a8 3506 if markdown:
82d02080 3507 desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
8dcce6a8 3508 else:
82d02080 3509 desc += f' [{cls._NETRC_MACHINE}]'
3510 if cls.IE_DESC is False:
8dcce6a8 3511 desc += ' [HIDDEN]'
82d02080 3512 elif cls.IE_DESC:
3513 desc += f' {cls.IE_DESC}'
3514 if cls.SEARCH_KEY:
3515 desc += f'; "{cls.SEARCH_KEY}:" prefix'
8dcce6a8 3516 if search_examples:
3517 _COUNTS = ('', '5', '10', 'all')
62b58c09 3518 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
82d02080 3519 if not cls.working():
8dcce6a8 3520 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3521
46d09f87 3522 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3523 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
8dcce6a8 3524 return f'{name}:{desc}' if desc else name
3525
a504ced0 3526 def extract_subtitles(self, *args, **kwargs):
a06916d9 3527 if (self.get_param('writesubtitles', False)
3528 or self.get_param('listsubtitles')):
9868ea49
JMF
3529 return self._get_subtitles(*args, **kwargs)
3530 return {}
a504ced0
JMF
3531
3532 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3533 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3534
0cf643b2
M
3535 class CommentsDisabled(Exception):
3536 """Raise in _get_comments if comments are disabled for the video"""
3537
a2160aa4 3538 def extract_comments(self, *args, **kwargs):
3539 if not self.get_param('getcomments'):
3540 return None
3541 generator = self._get_comments(*args, **kwargs)
3542
3543 def extractor():
3544 comments = []
d2b2fca5 3545 interrupted = True
a2160aa4 3546 try:
3547 while True:
3548 comments.append(next(generator))
a2160aa4 3549 except StopIteration:
3550 interrupted = False
d2b2fca5 3551 except KeyboardInterrupt:
3552 self.to_screen('Interrupted by user')
0cf643b2
M
3553 except self.CommentsDisabled:
3554 return {'comments': None, 'comment_count': None}
d2b2fca5 3555 except Exception as e:
3556 if self.get_param('ignoreerrors') is not True:
3557 raise
3558 self._downloader.report_error(e)
a2160aa4 3559 comment_count = len(comments)
3560 self.to_screen(f'Extracted {comment_count} comments')
3561 return {
3562 'comments': comments,
3563 'comment_count': None if interrupted else comment_count
3564 }
3565 return extractor
3566
3567 def _get_comments(self, *args, **kwargs):
3568 raise NotImplementedError('This method must be implemented by subclasses')
3569
912e0b7e
YCH
3570 @staticmethod
3571 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3572 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3573 will be dropped. """
86e5f3ed 3574 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3575 ret = list(subtitle_list1)
a44ca5a4 3576 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3577 return ret
3578
3579 @classmethod
46890374 3580 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3581 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3582 if target is None:
3583 target = {}
3584 for d in dicts:
3585 for lang, subs in d.items():
3586 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3587 return target
912e0b7e 3588
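# Illustrative usage note (not part of the class; file names below are made up):
# merging is done per language and drops items whose URL/data already appear, e.g.
# InfoExtractor._merge_subtitles(
#     {'en': [{'url': 'a.vtt'}]},
#     {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}]})
# -> {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}]}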
360e1ca5 3589 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3590 if (self.get_param('writeautomaticsub', False)
3591 or self.get_param('listsubtitles')):
9868ea49
JMF
3592 return self._get_automatic_captions(*args, **kwargs)
3593 return {}
360e1ca5
JMF
3594
3595 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3596 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3597
2762dbb1 3598 @functools.cached_property
24146491 3599 def _cookies_passed(self):
3600 """Whether cookies have been passed to YoutubeDL"""
3601 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3602
d77ab8e2 3603 def mark_watched(self, *args, **kwargs):
1813a6cc 3604 if not self.get_param('mark_watched', False):
3605 return
24146491 3606 if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
d77ab8e2
S
3607 self._mark_watched(*args, **kwargs)
3608
3609 def _mark_watched(self, *args, **kwargs):
3610 raise NotImplementedError('This method must be implemented by subclasses')
3611
38cce791
YCH
3612 def geo_verification_headers(self):
3613 headers = {}
a06916d9 3614 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3615 if geo_verification_proxy:
3616 headers['Ytdl-request-proxy'] = geo_verification_proxy
3617 return headers
3618
8f97a15d 3619 @staticmethod
3620 def _generic_id(url):
14f25df2 3621 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3622
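# Illustrative usage note (not part of the class; the URL below is made up):
# _generic_id() derives an id from the last path component of a URL, e.g.
# InfoExtractor._generic_id('https://example.com/videos/My%20Clip.mp4')
# -> 'My Clip'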
62b8dac4 3623 def _generic_title(self, url='', webpage='', *, default=None):
3624 return (self._og_search_title(webpage, default=None)
3625 or self._html_extract_title(webpage, default=None)
3626 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3627 or default)
98763ee3 3628
c224251a 3629 @staticmethod
b0089e89 3630 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3631 all_known = all(map(
3632 lambda x: x is not None,
3633 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3634 return (
3635 'private' if is_private
3636 else 'premium_only' if needs_premium
3637 else 'subscriber_only' if needs_subscription
3638 else 'needs_auth' if needs_auth
3639 else 'unlisted' if is_unlisted
3640 else 'public' if all_known
3641 else None)
3642
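# Illustrative usage note (not part of the class): the cascade above returns the
# first matching status and only falls through to 'public' when every flag was
# explicitly provided, e.g.
# InfoExtractor._availability(is_unlisted=True) -> 'unlisted'
# InfoExtractor._availability(False, False, False, False, False) -> 'public'
# InfoExtractor._availability() -> None  (nothing is known)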
d43de682 3643 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
4bb6b02f 3644 '''
3645 @returns A list of values for the extractor argument given by "key"
3646 or "default" if no such key is present
3647 @param default The default value to return when the key is not present (default: [])
3648 @param casesense When false, the values are converted to lower case
3649 '''
5225df50 3650 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3651 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
4bb6b02f 3652 if val is None:
3653 return [] if default is NO_DEFAULT else default
3654 return list(val) if casesense else [x.lower() for x in val]
5d3a0e79 3655
f40ee5e9 3656 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3657 if not playlist_id or not video_id:
3658 return not video_id
3659
3660 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3661 if no_playlist is not None:
3662 return not no_playlist
3663
3664 video_id = '' if video_id is True else f' {video_id}'
3665 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3666 if self.get_param('noplaylist'):
3667 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3668 return False
3669 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3670 return True
3671
be5c1ae8 3672 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
8ca48a1a 3673 RetryManager.report_retry(
3674 err, _count or int(fatal), _retries,
3675 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3676 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
be5c1ae8 3677
3678 def RetryManager(self, **kwargs):
3679 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3680
ade1fa70 3681 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3682 display_id = traverse_obj(info_dict, 'display_id', 'id')
3683 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3684 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3685 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3686
8f97a15d 3687 @classmethod
3688 def extract_from_webpage(cls, ydl, url, webpage):
3689 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3690 else ydl.get_info_extractor(cls.ie_key()))
f2e8dbcc 3691 for info in ie._extract_from_webpage(url, webpage) or []:
3692 # url = None since we do not want to set (webpage/original)_url
3693 ydl.add_default_extra_info(info, ie, None)
3694 yield info
8f97a15d 3695
3696 @classmethod
3697 def _extract_from_webpage(cls, url, webpage):
3698 for embed_url in orderedSet(
3699 cls._extract_embed_urls(url, webpage) or [], lazy=True):
d2c8aadf 3700 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
8f97a15d 3701
3702 @classmethod
3703 def _extract_embed_urls(cls, url, webpage):
3704 """@returns all the embed urls on the webpage"""
3705 if '_EMBED_URL_RE' not in cls.__dict__:
3706 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3707 for idx, regex in enumerate(cls._EMBED_REGEX):
3708 assert regex.count('(?P<url>') == 1, \
3709 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3710 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3711
3712 for regex in cls._EMBED_URL_RE:
3713 for mobj in regex.finditer(webpage):
3714 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3715 if cls._VALID_URL is False or cls.suitable(embed_url):
3716 yield embed_url
3717
3718 class StopExtraction(Exception):
3719 pass
3720
bfd973ec 3721 @classmethod
3722 def _extract_url(cls, webpage): # TODO: Remove
3723 """Only for compatibility with some older extractors"""
3724 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3725
2314b4d8 3726 @classmethod
3727 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3728 if plugin_name:
3729 mro = inspect.getmro(cls)
3730 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
e756f45b
M
3731 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3732 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
2314b4d8 3733 while getattr(super_class, '__wrapped__', None):
3734 super_class = super_class.__wrapped__
3735 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
e756f45b 3736 _PLUGIN_OVERRIDES[super_class].append(cls)
2314b4d8 3737
3738 return super().__init_subclass__(**kwargs)
3739
8dbe9899 3740
d6983cb4
PH
3741class SearchInfoExtractor(InfoExtractor):
3742 """
3743 Base class for paged search queries extractors.
10952eb2 3744 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
96565c7e 3745 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
d6983cb4
PH
3746 """
3747
96565c7e 3748 _MAX_RESULTS = float('inf')
171a31db 3749 _RETURN_TYPE = 'playlist'
96565c7e 3750
8f97a15d 3751 @classproperty
3752 def _VALID_URL(cls):
d6983cb4
PH
3753 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3754
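# Illustrative note (not part of the class): with a subclass that sets
# _SEARCH_KEY = 'ytsearch', the pattern above accepts e.g.
#   'ytsearch:foo'    -> prefix ''    (first result only)
#   'ytsearch5:foo'   -> prefix '5'   (first 5 results)
#   'ytsearchall:foo' -> prefix 'all' (up to _MAX_RESULTS)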
d6983cb4 3755 def _real_extract(self, query):
2c4aaadd 3756 prefix, query = self._match_valid_url(query).group('prefix', 'query')
d6983cb4
PH
3757 if prefix == '':
3758 return self._get_n_results(query, 1)
3759 elif prefix == 'all':
3760 return self._get_n_results(query, self._MAX_RESULTS)
3761 else:
3762 n = int(prefix)
3763 if n <= 0:
86e5f3ed 3764 raise ExtractorError(f'invalid download number {n} for query "{query}"')
d6983cb4 3765 elif n > self._MAX_RESULTS:
6a39ee13 3766 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
3767 n = self._MAX_RESULTS
3768 return self._get_n_results(query, n)
3769
3770 def _get_n_results(self, query, n):
cc16383f 3771 """Get a specified number of results for a query.
3772 Either this function or _search_results must be overridden by subclasses """
3773 return self.playlist_result(
3774 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3775 query, query)
3776
3777 def _search_results(self, query):
3778 """Returns an iterator of search results"""
611c1dd9 3779 raise NotImplementedError('This method must be implemented by subclasses')
0f818663 3780
82d02080 3781 @classproperty
3782 def SEARCH_KEY(cls):
3783 return cls._SEARCH_KEY
fe7866d0 3784
3785
3786class UnsupportedURLIE(InfoExtractor):
3787 _VALID_URL = '.*'
3788 _ENABLED = False
3789 IE_DESC = False
3790
3791 def _real_extract(self, url):
3792 raise UnsupportedError(url)
e756f45b
M
3793
3794
3795_PLUGIN_OVERRIDES = collections.defaultdict(list)