]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[extractor] Fix DRM detection in m3u8
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
2314b4d8 8import inspect
cc16383f 9import itertools
3d3538e4 10import json
f8271158 11import math
4094b6e3 12import netrc
d6983cb4 13import os
773f291d 14import random
6929b41a 15import re
d6983cb4 16import sys
4094b6e3 17import time
8f97a15d 18import types
14f25df2 19import urllib.parse
ac668111 20import urllib.request
f8271158 21import xml.etree.ElementTree
d6983cb4 22
6929b41a 23from ..compat import functools # isort: split
14f25df2 24from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
8817a80d 25from ..cookies import LenientSimpleCookie
f8271158 26from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 27from ..utils import (
8f97a15d 28 IDENTITY,
f8271158 29 JSON_LD_RE,
30 NO_DEFAULT,
31 ExtractorError,
d0d74b71 32 FormatSorter,
f8271158 33 GeoRestrictedError,
34 GeoUtils,
cb73b846 35 HEADRequest,
b7c47b74 36 LenientJSONDecoder,
f8271158 37 RegexNotFoundError,
be5c1ae8 38 RetryManager,
f8271158 39 UnsupportedError,
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
82d02080 43 classproperty,
d6983cb4 44 clean_html,
d0d74b71 45 deprecation_warning,
70f0f5a8 46 determine_ext,
d493f15c 47 dict_get,
42676437 48 encode_data_uri,
9b9c5355 49 error_to_compat_str,
46b18f23 50 extract_attributes,
90137ca4 51 filter_dict,
97f4aecf 52 fix_xml_ampersands,
b14f3a4c 53 float_or_none,
b868936c 54 format_field,
31bb8d3f 55 int_or_none,
34921b43 56 join_nonempty,
a4a554a7 57 js_to_json,
46b18f23 58 mimetype2ext,
3158150c 59 network_exceptions,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
46b18f23
JH
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
46b18f23 67 sanitize_filename,
8f97a15d 68 sanitize_url,
b868936c 69 sanitized_Request,
ade1fa70 70 smuggle_url,
d493f15c 71 str_or_none,
ce5b9040 72 str_to_int,
f856816b 73 strip_or_none,
5d3a0e79 74 traverse_obj,
71df9b7f 75 truncate_string,
47046464 76 try_call,
ffa89477 77 try_get,
f38de77f 78 unescapeHTML,
647eab45 79 unified_strdate,
6b3a3098 80 unified_timestamp,
46b18f23 81 update_Request,
09d02ea4 82 update_url_query,
a107193e 83 url_basename,
bebef109 84 url_or_none,
7e68567e 85 urlhandle_detect_ext,
b868936c 86 urljoin,
6606817a 87 variadic,
a6571f10 88 xpath_element,
8d6765cf
S
89 xpath_text,
90 xpath_with_ns,
d6983cb4 91)
c342041f 92
d6983cb4 93
86e5f3ed 94class InfoExtractor:
d6983cb4
PH
95 """Information Extractor class.
96
97 Information extractors are the classes that, given a URL, extract
98 information about the video (or videos) the URL refers to. This
99 information includes the real video URL, the video title, author and
100 others. The information is stored in a dictionary which is then
5d380852 101 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
102 information possibly downloading the video to the file system, among
103 other possible outcomes.
104
cf0649f8 105 The type field determines the type of the result.
fed5d032
PH
106 By far the most common value (and the default if _type is missing) is
107 "video", which indicates a single video.
108
109 For a video, the dictionaries must include the following fields:
d6983cb4
PH
110
111 id: Video identifier.
d4736fdb 112 title: Video title, unescaped. Set to an empty string if video has
113 no title as opposed to "None" which signifies that the
114 extractor failed to obtain a title
d67b0b15 115
f49d89ee 116 Additionally, it must contain either a formats entry or a url one:
d67b0b15 117
f49d89ee
PH
118 formats: A list of dictionaries for each format available, ordered
119 from worst to best quality.
120
121 Potential fields:
c790e93a
S
122 * url The mandatory URL representing the media:
123 for plain file media - HTTP URL of this file,
124 for RTMP - RTMP URL,
125 for HLS - URL of the M3U8 media playlist,
126 for HDS - URL of the F4M manifest,
79d2077e
S
127 for DASH
128 - HTTP URL to plain file media (in case of
129 unfragmented media)
130 - URL of the MPD manifest or base URL
131 representing the media if MPD manifest
8ed7a233 132 is parsed from a string (in case of
79d2077e 133 fragmented media)
c790e93a 134 for MSS - URL of the ISM manifest.
86f4d14f
S
135 * manifest_url
136 The URL of the manifest file in case of
c790e93a
S
137 fragmented media:
138 for HLS - URL of the M3U8 master playlist,
139 for HDS - URL of the F4M manifest,
140 for DASH - URL of the MPD manifest,
141 for MSS - URL of the ISM manifest.
a44ca5a4 142 * manifest_stream_number (For internal use only)
143 The index of the stream in the manifest file
10952eb2 144 * ext Will be calculated from URL if missing
d67b0b15
PH
145 * format A human-readable description of the format
146 ("mp4 container with h264/opus").
147 Calculated from the format_id, width, height,
148 and format_note fields if missing.
149 * format_id A short description of the format
5d4f3985
PH
150 ("mp4_h264_opus" or "19").
151 Technically optional, but strongly recommended.
d67b0b15
PH
152 * format_note Additional info about the format
153 ("3D" or "DASH video")
154 * width Width of the video, if known
155 * height Height of the video, if known
105bfd90 156 * aspect_ratio Aspect ratio of the video, if known
157 Automatically calculated from width and height
f49d89ee 158 * resolution Textual description of width and height
105bfd90 159 Automatically calculated from width and height
176f1866 160 * dynamic_range The dynamic range of the video. One of:
161 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 162 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
163 * abr Average audio bitrate in KBit/s
164 * acodec Name of the audio codec in use
dd27fd17 165 * asr Audio sampling rate in Hertz
b8ed0f15 166 * audio_channels Number of audio channels
d67b0b15 167 * vbr Average video bitrate in KBit/s
fbb21cf5 168 * fps Frame rate
d67b0b15 169 * vcodec Name of the video codec in use
1394ce65 170 * container Name of the container format
d67b0b15 171 * filesize The number of bytes, if known in advance
9732d77e 172 * filesize_approx An estimate for the number of bytes
d67b0b15 173 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 174 * protocol The protocol that will be used for the actual
adbc4ec4
THD
175 download, lower-case. One of "http", "https" or
176 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
177 * fragment_base_url
178 Base URL for fragments. Each fragment's path
179 value (if present) will be relative to
180 this URL.
181 * fragments A list of fragments of a fragmented media.
182 Each fragment entry must contain either an url
183 or a path. If an url is present it should be
184 considered by a client. Otherwise both path and
185 fragment_base_url must be present. Here is
186 the list of all potential fields:
187 * "url" - fragment's URL
188 * "path" - fragment's path relative to
189 fragment_base_url
a0d5077c
S
190 * "duration" (optional, int or float)
191 * "filesize" (optional, int)
adbc4ec4
THD
192 * is_from_start Is a live format that can be downloaded
193 from the start. Boolean
f49d89ee 194 * preference Order number of this format. If this field is
08d13955 195 present and not None, the formats get sorted
38d63d84 196 by this field, regardless of all other values.
f49d89ee
PH
197 -1 for default (order by other properties),
198 -2 or smaller for less than default.
e65566a9
PH
199 < -1000 to hide the format (if there is
200 another one which is strictly better)
32f90364
PH
201 * language Language code, e.g. "de" or "en-US".
202 * language_preference Is this in the language mentioned in
203 the URL?
aff2f4f4
PH
204 10 if it's what the URL is about,
205 -1 for default (don't know),
206 -10 otherwise, other values reserved for now.
5d73273f
PH
207 * quality Order number of the video quality of this
208 format, irrespective of the file format.
209 -1 for default (order by other properties),
210 -2 or smaller for less than default.
c64ed2a3
PH
211 * source_preference Order number for this video source
212 (quality takes higher priority)
213 -1 for default (order by other properties),
214 -2 or smaller for less than default.
d769be6c
PH
215 * http_headers A dictionary of additional HTTP headers
216 to add to the request.
6271f1ca 217 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
218 video's pixels are not square.
219 width : height ratio as float.
220 * no_resume The server does not support resuming the
221 (HTTP or RTMP) download. Boolean.
88acdbc2 222 * has_drm The format has DRM and cannot be downloaded. Boolean
7e68567e 223 * extra_param_to_segment_url A query string to append to each
224 fragment's URL, or to update each existing query string
225 with. Only applied by the native HLS/DASH downloaders.
226 * hls_aes A dictionary of HLS AES-128 decryption information
227 used by the native HLS downloader to override the
228 values in the media playlist when an '#EXT-X-KEY' tag
229 is present in the playlist:
230 * uri The URI from which the key will be downloaded
231 * key The key (as hex) used to decrypt fragments.
232 If `key` is given, any key URI will be ignored
233 * iv The IV (as hex) used to decrypt fragments
0a5a191a 234 * downloader_options A dictionary of downloader options
235 (For internal use only)
236 * http_chunk_size Chunk size for HTTP downloads
237 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 238 RTMP formats can also have the additional fields: page_url,
239 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
240 rtmp_protocol, rtmp_real_time
3dee7826 241
c0ba0f48 242 url: Final video URL.
d6983cb4 243 ext: Video filename extension.
d67b0b15
PH
244 format: The video format, defaults to ext (used for --get-format)
245 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 246
d6983cb4
PH
247 The following fields are optional:
248
08d30158 249 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 250 alt_title: A secondary title of the video.
0afef30b
PH
251 display_id An alternative identifier for the video, not necessarily
252 unique, but available before title. Typically, id is
253 something like "4234987", title "Dancing naked mole rats",
254 and display_id "dancing-naked-mole-rats"
d5519808 255 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 256 * "id" (optional, string) - Thumbnail format ID
d5519808 257 * "url"
cfb56d1a 258 * "preference" (optional, int) - quality of the image
d5519808
PH
259 * "width" (optional, int)
260 * "height" (optional, int)
5e1c39ac 261 * "resolution" (optional, string "{width}x{height}",
d5519808 262 deprecated)
2de624fd 263 * "filesize" (optional, int)
297e9952 264 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 265 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 266 description: Full video description.
d6983cb4 267 uploader: Full name of the video uploader.
2bc0c46f 268 license: License name the video is licensed under.
8a92e51c 269 creator: The creator of the video.
10db0d2f 270 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 271 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 272 If not explicitly set, calculated from timestamp
273 release_timestamp: UNIX timestamp of the moment the video was released.
274 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 275 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 276 If not explicitly set, calculated from release_timestamp
277 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 278 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 279 If not explicitly set, calculated from modified_timestamp
d6983cb4 280 uploader_id: Nickname or id of the video uploader.
7bcd2830 281 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 282 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 283 Note that channel fields may or may not repeat uploader
6f1f59f3
S
284 fields. This depends on a particular extractor.
285 channel_id: Id of the channel.
286 channel_url: Full URL to a channel webpage.
6c73052c 287 channel_follower_count: Number of followers of the channel.
da9ec3b9 288 location: Physical location where the video was filmed.
a504ced0 289 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
290 {tag: subformats}. "tag" is usually a language code, and
291 "subformats" is a list sorted from lower to higher
292 preference, each element is a dictionary with the "ext"
293 entry and one of:
a504ced0 294 * "data": The subtitles file contents
10952eb2 295 * "url": A URL pointing to the subtitles file
2412044c 296 It can optionally also have:
297 * "name": Name or description of the subtitles
08d30158 298 * "http_headers": A dictionary of additional HTTP headers
297e9952 299 to add to the request.
4bba3716 300 "ext" will be calculated from URL if missing
e167860c 301 automatic_captions: Like 'subtitles'; contains automatically generated
302 captions instead of normal subtitles
62d231c0 303 duration: Length of the video in seconds, as an integer or float.
f3d29461 304 view_count: How many users have watched the video on the platform.
867c66ff 305 concurrent_view_count: How many users are currently watching the video on the platform.
19e3dfc9
PH
306 like_count: Number of positive ratings of the video
307 dislike_count: Number of negative ratings of the video
02835c6b 308 repost_count: Number of reposts of the video
2d30521a 309 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 310 comment_count: Number of comments on the video
dd622d7c
PH
311 comments: A list of comments, each with one or more of the following
312 properties (all but one of text or html optional):
313 * "author" - human-readable name of the comment author
314 * "author_id" - user ID of the comment author
a1c5d2ca 315 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
316 * "id" - Comment ID
317 * "html" - Comment as HTML
318 * "text" - Plain text of the comment
319 * "timestamp" - UNIX timestamp of comment
320 * "parent" - ID of the comment this one is replying to.
321 Set to "root" to indicate that this is a
322 comment to the original video.
a1c5d2ca
M
323 * "like_count" - Number of positive ratings of the comment
324 * "dislike_count" - Number of negative ratings of the comment
325 * "is_favorited" - Whether the comment is marked as
326 favorite by the video uploader
327 * "author_is_uploader" - Whether the comment is made by
328 the video uploader
8dbe9899 329 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 330 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
331 should allow to get the same result again. (It will be set
332 by YoutubeDL if it's missing)
ad3bc6ac
PH
333 categories: A list of categories that the video falls in, for example
334 ["Sports", "Berlin"]
864f24bd 335 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 336 cast: A list of the video cast
7267bd53
PH
337 is_live: True, False, or None (=unknown). Whether this video is a
338 live stream that goes on instead of a fixed-length video.
f76ede8e 339 was_live: True, False, or None (=unknown). Whether this video was
340 originally a live stream.
0647d925 341 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
e325a21a 342 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 343 If absent, automatically set from is_live, was_live
7c80519c 344 start_time: Time in seconds where the reproduction should start, as
10952eb2 345 specified in the URL.
297a564b 346 end_time: Time in seconds where the reproduction should end, as
10952eb2 347 specified in the URL.
55949fed 348 chapters: A list of dictionaries, with the following entries:
349 * "start_time" - The start time of the chapter in seconds
350 * "end_time" - The end time of the chapter in seconds
351 * "title" (optional, string)
6cfda058 352 playable_in_embed: Whether this video is allowed to play in embedded
353 players on other sites. Can be True (=always allowed),
354 False (=never allowed), None (=unknown), or a string
62b58c09 355 specifying the criteria for embedability; e.g. 'whitelist'
c224251a
M
356 availability: Under what condition the video is available. One of
357 'private', 'premium_only', 'subscriber_only', 'needs_auth',
358 'unlisted' or 'public'. Use 'InfoExtractor._availability'
359 to set it
1e8fe57e 360 _old_archive_ids: A list of old archive ids needed for backward compatibility
784320c9 361 _format_sort_fields: A list of fields to use for sorting formats
277d6ff5 362 __post_extractor: A function to be called just before the metadata is
363 written to either disk, logger or console. The function
364 must return a dict which will be added to the info_dict.
365 This is useful for additional information that is
366 time-consuming to extract. Note that the fields thus
367 extracted will not be available to output template and
368 match_filter. So, only "comments" and "comment_count" are
369 currently allowed to be extracted via this method.
d6983cb4 370
7109903e
S
371 The following fields should only be used when the video belongs to some logical
372 chapter or section:
373
374 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
375 chapter_number: Number of the chapter the video belongs to, as an integer.
376 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
377
378 The following fields should only be used when the video is an episode of some
8d76bdf1 379 series, programme or podcast:
7109903e
S
380
381 series: Title of the series or programme the video episode belongs to.
9ac24e23 382 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 383 season: Title of the season the video episode belongs to.
27bfd4e5
S
384 season_number: Number of the season the video episode belongs to, as an integer.
385 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
386 episode: Title of the video episode. Unlike mandatory video title field,
387 this field should denote the exact title of the video episode
388 without any kind of decoration.
27bfd4e5
S
389 episode_number: Number of the video episode within a season, as an integer.
390 episode_id: Id of the video episode, as a unicode string.
7109903e 391
7a93ab5f
S
392 The following fields should only be used when the media is a track or a part of
393 a music album:
394
395 track: Title of the track.
396 track_number: Number of the track within an album or a disc, as an integer.
397 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
398 as a unicode string.
399 artist: Artist(s) of the track.
400 genre: Genre(s) of the track.
401 album: Title of the album the track belongs to.
402 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
403 album_artist: List of all artists appeared on the album (e.g.
404 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
405 and compilations).
406 disc_number: Number of the disc or other physical medium the track belongs to,
407 as an integer.
408 release_year: Year (YYYY) when the album was released.
8bcd4048 409 composer: Composer of the piece
7a93ab5f 410
3975b4d2 411 The following fields should only be set for clips that should be cut from the original video:
412
413 section_start: Start time of the section in seconds
414 section_end: End time of the section in seconds
415
45e8a04e 416 The following fields should only be set for storyboards:
417 rows: Number of rows in each storyboard fragment, as an integer
418 columns: Number of columns in each storyboard fragment, as an integer
419
deefc05b 420 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 421
d838b1bd
PH
422 Unless mentioned otherwise, None is equivalent to absence of information.
423
fed5d032
PH
424
425 _type "playlist" indicates multiple videos.
b82f815f
PH
426 There must be a key "entries", which is a list, an iterable, or a PagedList
427 object, each element of which is a valid dictionary by this specification.
fed5d032 428
962ffcf8 429 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 430 attributes with the same semantics as videos (see above).
fed5d032 431
f0d785d3 432 It can also have the following optional fields:
433
434 playlist_count: The total number of videos in a playlist. If not given,
435 YoutubeDL tries to calculate it from "entries"
436
fed5d032
PH
437
438 _type "multi_video" indicates that there are multiple videos that
439 form a single show, for example multiple acts of an opera or TV episode.
440 It must have an entries key like a playlist and contain all the keys
441 required for a video at the same time.
442
443
444 _type "url" indicates that the video must be extracted from another
445 location, possibly by a different extractor. Its only required key is:
446 "url" - the next URL to extract.
f58766ce
PH
447 The key "ie_key" can be set to the class name (minus the trailing "IE",
448 e.g. "Youtube") if the extractor class is known in advance.
449 Additionally, the dictionary may have any properties of the resolved entity
450 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
451 known ahead of time.
452
453
454 _type "url_transparent" entities have the same specification as "url", but
455 indicate that the given additional information is more precise than the one
456 associated with the resolved URL.
457 This is useful when a site employs a video service that hosts the video and
458 its technical metadata, but that video service does not embed a useful
459 title, description etc.
460
461
8f97a15d 462 Subclasses of this should also be added to the list of extractors and
463 should define a _VALID_URL regexp and re-define the _real_extract() and
464 (optionally) _real_initialize() methods.
d6983cb4 465
e6f21b3d 466 Subclasses may also override suitable() if necessary, but ensure the function
467 signature is preserved and that this function imports everything it needs
52efa4b3 468 (except other extractors), so that lazy_extractors works correctly.
469
8f97a15d 470 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
471 the HTML of Generic webpages. It may also override _extract_embed_urls
472 or _extract_from_webpage as necessary. While these are normally classmethods,
473 _extract_from_webpage is allowed to be an instance method.
474
475 _extract_from_webpage may raise self.StopExtraction() to stop further
476 processing of the webpage and obtain exclusive rights to it. This is useful
62b58c09
L
477 when the extractor cannot reliably be matched using just the URL,
478 e.g. invidious/peertube instances
8f97a15d 479
480 Embed-only extractors can be defined by setting _VALID_URL = False.
481
52efa4b3 482 To support username + password (or netrc) login, the extractor must define a
483 _NETRC_MACHINE and re-define _perform_login(username, password) and
484 (optionally) _initialize_pre_login() methods. The _perform_login method will
485 be called between _initialize_pre_login and _real_initialize if credentials
486 are passed by the user. In cases where it is necessary to have the login
487 process as part of the extraction rather than initialization, _perform_login
488 can be left undefined.
e6f21b3d 489
4248dad9 490 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
491 geo restriction bypass mechanisms for a particular extractor.
492 Though it won't disable explicit geo restriction bypass based on
504f20dd 493 country code provided with geo_bypass_country.
4248dad9
S
494
495 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
496 countries for this extractor. One of these countries will be used by
497 geo restriction bypass mechanism right away in order to bypass
504f20dd 498 geo restriction, of course, if the mechanism is not disabled.
773f291d 499
5f95927a
S
500 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
501 IP blocks in CIDR notation for this extractor. One of these IP blocks
502 will be used by geo restriction bypass mechanism similarly
504f20dd 503 to _GEO_COUNTRIES.
3ccdde8c 504
fe7866d0 505 The _ENABLED attribute should be set to False for IEs that
506 are disabled by default and must be explicitly enabled.
507
e6f21b3d 508 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
509 in order to warn the users and skip the tests.
510 """
511
512 _ready = False
513 _downloader = None
773f291d 514 _x_forwarded_for_ip = None
4248dad9
S
515 _GEO_BYPASS = True
516 _GEO_COUNTRIES = None
5f95927a 517 _GEO_IP_BLOCKS = None
d6983cb4 518 _WORKING = True
fe7866d0 519 _ENABLED = True
52efa4b3 520 _NETRC_MACHINE = None
231025c4 521 IE_DESC = None
8dcce6a8 522 SEARCH_KEY = None
8f97a15d 523 _VALID_URL = None
524 _EMBED_REGEX = []
d6983cb4 525
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a message telling the user which options provide authentication.

    @param method   One of None, 'any', 'password' or 'cookies'. When left at
                    NO_DEFAULT, 'any' is used if the extractor supports login
                    and 'cookies' otherwise. None yields an empty string.
    @param netrc    netrc machine name to mention in the message;
                    falls back to self._NETRC_MACHINE
    Any other value of method raises KeyError.
    """
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'

    machine = netrc or self._NETRC_MACHINE
    credentials_hint = f'--username and --password, or --netrc ({machine}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {credentials_hint}',
        'password': f'Use {credentials_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }
    return hints[method]
9d5d4d64 536
def __init__(self, downloader=None):
    """Create the extractor, optionally bound to a downloader (a YoutubeDL instance).

    When no downloader is given here, one must be supplied through
    "set_downloader()" before "extract()" is called.
    """
    # Per-instance state; these shadow the class-level defaults
    self._printed_messages = set()
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)
545
546 @classmethod
5ad28e7f 547 def _match_valid_url(cls, url):
8f97a15d 548 if cls._VALID_URL is False:
549 return None
79cb2577
PH
550 # This does not use has/getattr intentionally - we want to know whether
551 # we have cached the regexp for *this* class, whereas getattr would also
552 # match the superclass
553 if '_VALID_URL_RE' not in cls.__dict__:
554 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 555 return cls._VALID_URL_RE.match(url)
556
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True/False)."""
    # NB: everything this function needs (other extractors excepted) has to be
    # imported here, otherwise lazy_extractors will not work correctly
    matched = cls._match_valid_url(url)
    return matched is not None
d6983cb4 563
ed9266db
PH
@classmethod
def _match_id(cls, url):
    """Return the "id" group of the _VALID_URL match for url.

    No check is made that the URL actually matched: a non-matching URL
    fails with AttributeError (the match is None), and a pattern without
    an "id" group fails with IndexError.
    """
    mobj = cls._match_valid_url(url)
    return mobj.group('id')
ed9266db 567
@classmethod
def get_temp_id(cls, url):
    """Best-effort variant of _match_id(): returns None instead of raising
    when the URL does not match or the pattern has no "id" group."""
    try:
        temp_id = cls._match_id(url)
    except (AttributeError, IndexError):
        temp_id = None
    return temp_id
574
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
579
@classmethod
def supports_login(cls):
    """Return True when the class defines a truthy _NETRC_MACHINE, else False."""
    return True if cls._NETRC_MACHINE else False
583
d6983cb4
PH
def initialize(self):
    """Prepare the instance for extraction (geo bypass, authentication, etc).

    The printed-messages set is reset and the geo bypass setup is invoked on
    every call. The login and _real_initialize() steps run only until the
    instance has been marked ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)

    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # A username was supplied, but this extractor has no login support.
        # The warning is skipped when IE_DESC or _NETRC_MACHINE is set to False
        cookies_hint = self._login_hint("cookies")
        self.report_warning(f'Login with password is not supported for this website. {cookies_hint}')
    self._real_initialize()
    self._ready = True
601
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Set up the geo restriction bypass mechanism.

    The bypass works by faking the X-Forwarded-For HTTP header: an IP block
    or a country is selected, a random IP belonging to it is generated and
    stored in self._x_forwarded_for_ip so that it can be passed as
    X-Forwarded-For HTTP header in subsequent HTTP requests.

    initialize() calls this with _GEO_COUNTRIES and _GEO_IP_BLOCKS.
    Extractor code may also call it manually when the geo bypass
    information is not available beforehand (e.g. obtained during
    extraction). In that case pass the information as the geo bypass
    context, a dict which may contain the following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    Nothing is done when a fake IP has already been chosen or when the
    "geo_bypass" param is falsy.
    """
    # A fake IP was already picked earlier
    if self._x_forwarded_for_ip:
        return

    # Geo bypass mechanism is explicitly disabled by user
    if not self.get_param('geo_bypass', True):
        return

    if not geo_bypass_context:
        geo_bypass_context = {}
    elif isinstance(geo_bypass_context, (list, tuple)):
        # Backward compatibility: previously _initialize_geo_bypass
        # expected a list of countries, some 3rd party code may still use
        # it this way
        geo_bypass_context = {'countries': geo_bypass_context}

    def _choose(explicit, context_key):
        # A value given explicitly by the user is used right away,
        # regardless of whether the extractor is geo bypassable or not.
        # Otherwise a random entry of the geo bypass context is used, but
        # only if the extractor is known as geo bypassable
        if explicit:
            return explicit
        candidates = geo_bypass_context.get(context_key)
        if self._GEO_BYPASS and candidates:
            return random.choice(candidates)
        return explicit

    # The whole point of geo bypass mechanism is to fake IP
    # as X-Forwarded-For HTTP header based on some IP block or
    # country code.

    # Path 1: bypassing based on IP block in CIDR notation
    ip_block = _choose(self.get_param('geo_bypass_ip_block', None), 'ip_blocks')
    if ip_block:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
        self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
        return

    # Path 2: bypassing based on country code
    country = _choose(self.get_param('geo_bypass_country', None), 'countries')
    if country:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
        # NOTE(review): path 1 logs through self.write_debug while this path
        # goes through self._downloader directly - confirm whether the
        # difference is intentional
        self._downloader.write_debug(
            f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
684
685 def extract(self, url):
686 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 687 try:
773f291d
S
688 for _ in range(2):
689 try:
690 self.initialize()
71df9b7f 691 self.to_screen('Extracting URL: %s' % (
692 url if self.get_param('verbose') else truncate_string(url, 100, 20)))
0016b84e 693 ie_result = self._real_extract(url)
07cce701 694 if ie_result is None:
695 return None
0016b84e
S
696 if self._x_forwarded_for_ip:
697 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
b79f9e30 698 subtitles = ie_result.get('subtitles') or {}
699 if 'no-live-chat' in self.get_param('compat_opts'):
700 for lang in ('live_chat', 'comments', 'danmaku'):
701 subtitles.pop(lang, None)
0016b84e 702 return ie_result
773f291d 703 except GeoRestrictedError as e:
4248dad9
S
704 if self.__maybe_fake_ip_and_retry(e.countries):
705 continue
773f291d 706 raise
0db3bae8 707 except UnsupportedError:
708 raise
1151c407 709 except ExtractorError as e:
9bcfe33b 710 e.video_id = e.video_id or self.get_temp_id(url),
711 e.ie = e.ie or self.IE_NAME,
712 e.traceback = e.traceback or sys.exc_info()[2]
713 raise
ac668111 714 except http.client.IncompleteRead as e:
1151c407 715 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
9650885b 716 except (KeyError, StopIteration) as e:
1151c407 717 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 718
4248dad9 719 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 720 if (not self.get_param('geo_bypass_country', None)
3089bc74 721 and self._GEO_BYPASS
a06916d9 722 and self.get_param('geo_bypass', True)
3089bc74
S
723 and not self._x_forwarded_for_ip
724 and countries):
eea0716c
S
725 country_code = random.choice(countries)
726 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
727 if self._x_forwarded_for_ip:
728 self.report_warning(
eea0716c
S
729 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
730 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
731 return True
732 return False
733
d6983cb4 734 def set_downloader(self, downloader):
08d30158 735 """Sets a YoutubeDL instance as the downloader for this IE."""
d6983cb4
PH
736 self._downloader = downloader
737
9809740b 738 @property
739 def cache(self):
740 return self._downloader.cache
741
742 @property
743 def cookiejar(self):
744 return self._downloader.cookiejar
745
52efa4b3 746 def _initialize_pre_login(self):
962ffcf8 747 """ Initialization before login. Redefine in subclasses."""
52efa4b3 748 pass
749
750 def _perform_login(self, username, password):
751 """ Login with username and password. Redefine in subclasses."""
752 pass
753
d6983cb4
PH
754 def _real_initialize(self):
755 """Real initialization process. Redefine in subclasses."""
756 pass
757
758 def _real_extract(self, url):
759 """Real extraction process. Redefine in subclasses."""
08d30158 760 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 761
56c73665
JMF
762 @classmethod
763 def ie_key(cls):
764 """A string for getting the InfoExtractor with get_info_extractor"""
3fb4e21b 765 return cls.__name__[:-2]
56c73665 766
82d02080 767 @classproperty
768 def IE_NAME(cls):
769 return cls.__name__[:-2]
d6983cb4 770
d391b7e2
S
771 @staticmethod
772 def __can_accept_status_code(err, expected_status):
ac668111 773 assert isinstance(err, urllib.error.HTTPError)
d391b7e2
S
774 if expected_status is None:
775 return False
d391b7e2
S
776 elif callable(expected_status):
777 return expected_status(err.code) is True
778 else:
6606817a 779 return err.code in variadic(expected_status)
d391b7e2 780
c043c246 781 def _create_request(self, url_or_request, data=None, headers=None, query=None):
ac668111 782 if isinstance(url_or_request, urllib.request.Request):
09d02ea4 783 return update_Request(url_or_request, data=data, headers=headers, query=query)
784 if query:
785 url_or_request = update_url_query(url_or_request, query)
c043c246 786 return sanitized_Request(url_or_request, data, headers or {})
f95b9dee 787
c043c246 788 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
d391b7e2
S
789 """
790 Return the response handle.
791
792 See _download_webpage docstring for arguments specification.
793 """
1cf376f5 794 if not self._downloader._first_webpage_request:
49a57e70 795 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 796 if sleep_interval > 0:
5ef7d9bd 797 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 798 time.sleep(sleep_interval)
799 else:
800 self._downloader._first_webpage_request = False
801
d6983cb4
PH
802 if note is None:
803 self.report_download_webpage(video_id)
804 elif note is not False:
7cc3570e 805 if video_id is None:
86e5f3ed 806 self.to_screen(str(note))
7cc3570e 807 else:
86e5f3ed 808 self.to_screen(f'{video_id}: {note}')
2132edaa
S
809
810 # Some sites check X-Forwarded-For HTTP header in order to figure out
811 # the origin of the client behind proxy. This allows bypassing geo
812 # restriction by faking this header's value to IP that belongs to some
813 # geo unrestricted country. We will do so once we encounter any
814 # geo restriction error.
815 if self._x_forwarded_for_ip:
c043c246 816 headers = (headers or {}).copy()
817 headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
2132edaa 818
d6983cb4 819 try:
f95b9dee 820 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 821 except network_exceptions as err:
ac668111 822 if isinstance(err, urllib.error.HTTPError):
d391b7e2 823 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
824 # Retain reference to error to prevent file object from
825 # being closed before it can be read. Works around the
826 # effects of <https://bugs.python.org/issue15002>
827 # introduced in Python 3.4.1.
828 err.fp._error = err
d391b7e2
S
829 return err.fp
830
aa94a6d3
PH
831 if errnote is False:
832 return False
d6983cb4 833 if errnote is None:
f1a9d64e 834 errnote = 'Unable to download webpage'
7f8b2714 835
86e5f3ed 836 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 837 if fatal:
497d2fab 838 raise ExtractorError(errmsg, cause=err)
7cc3570e 839 else:
6a39ee13 840 self.report_warning(errmsg)
7cc3570e 841 return False
d6983cb4 842
1890fc63 843 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
844 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2
S
845 """
846 Return a tuple (page content as string, URL handle).
847
617f658b 848 Arguments:
849 url_or_request -- plain text URL as a string or
ac668111 850 a urllib.request.Request object
617f658b 851 video_id -- Video/playlist/item identifier (string)
852
853 Keyword arguments:
854 note -- note printed before downloading (string)
855 errnote -- note printed in case of an error (string)
856 fatal -- flag denoting whether error should be considered fatal,
857 i.e. whether it should cause ExtractionError to be raised,
858 otherwise a warning will be reported and extraction continued
859 encoding -- encoding for a page content decoding, guessed automatically
860 when not explicitly specified
861 data -- POST data (bytes)
862 headers -- HTTP headers (dict)
863 query -- URL query (dict)
864 expected_status -- allows to accept failed HTTP requests (non 2xx
865 status code) by explicitly specifying a set of accepted status
866 codes. Can be any of the following entities:
867 - an integer type specifying an exact failed status code to
868 accept
869 - a list or a tuple of integer types specifying a list of
870 failed status codes to accept
871 - a callable accepting an actual failed status code and
872 returning True if it should be accepted
873 Note that this argument does not affect success status codes (2xx)
874 which are always accepted.
d391b7e2 875 """
617f658b 876
b9d3e163 877 # Strip hashes from the URL (#1038)
14f25df2 878 if isinstance(url_or_request, str):
b9d3e163
PH
879 url_or_request = url_or_request.partition('#')[0]
880
d391b7e2 881 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
882 if urlh is False:
883 assert not fatal
884 return False
c9a77969 885 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
886 return (content, urlh)
887
c9a77969
YCH
888 @staticmethod
889 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
890 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
891 if m:
892 encoding = m.group(1)
893 else:
0d75ae2c 894 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
895 webpage_bytes[:1024])
896 if m:
897 encoding = m.group(1).decode('ascii')
b60016e8
PH
898 elif webpage_bytes.startswith(b'\xff\xfe'):
899 encoding = 'utf-16'
f143d86a
PH
900 else:
901 encoding = 'utf-8'
c9a77969
YCH
902
903 return encoding
904
4457823d
S
905 def __check_blocked(self, content):
906 first_block = content[:512]
3089bc74
S
907 if ('<title>Access to this site is blocked</title>' in content
908 and 'Websense' in first_block):
4457823d
S
909 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
910 blocked_iframe = self._html_search_regex(
911 r'<iframe src="([^"]+)"', content,
912 'Websense information URL', default=None)
913 if blocked_iframe:
914 msg += ' Visit %s for more details' % blocked_iframe
915 raise ExtractorError(msg, expected=True)
916 if '<title>The URL you requested has been blocked</title>' in first_block:
917 msg = (
918 'Access to this webpage has been blocked by Indian censorship. '
919 'Use a VPN or proxy server (with --proxy) to route around it.')
920 block_msg = self._html_search_regex(
921 r'</h1><p>(.*?)</p>',
922 content, 'block message', default=None)
923 if block_msg:
924 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
925 raise ExtractorError(msg, expected=True)
3089bc74
S
926 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
927 and 'blocklist.rkn.gov.ru' in content):
4457823d
S
928 raise ExtractorError(
929 'Access to this webpage has been blocked by decision of the Russian government. '
930 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
931 expected=True)
932
f95b9dee 933 def _request_dump_filename(self, url, video_id):
934 basen = f'{video_id}_{url}'
935 trim_length = self.get_param('trim_file_name') or 240
936 if len(basen) > trim_length:
937 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
938 basen = basen[:trim_length - len(h)] + h
939 filename = sanitize_filename(f'{basen}.dump', restricted=True)
940 # Working around MAX_PATH limitation on Windows (see
941 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
942 if compat_os_name == 'nt':
943 absfilepath = os.path.abspath(filename)
944 if len(absfilepath) > 259:
945 filename = fR'\\?\{absfilepath}'
946 return filename
947
948 def __decode_webpage(self, webpage_bytes, encoding, headers):
949 if not encoding:
950 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
951 try:
952 return webpage_bytes.decode(encoding, 'replace')
953 except LookupError:
954 return webpage_bytes.decode('utf-8', 'replace')
955
c9a77969 956 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
c9a77969
YCH
957 webpage_bytes = urlh.read()
958 if prefix is not None:
959 webpage_bytes = prefix + webpage_bytes
a06916d9 960 if self.get_param('dump_intermediate_pages', False):
f610dbb0 961 self.to_screen('Dumping request to ' + urlh.geturl())
d6983cb4
PH
962 dump = base64.b64encode(webpage_bytes).decode('ascii')
963 self._downloader.to_screen(dump)
f95b9dee 964 if self.get_param('write_pages'):
e121e3ce 965 filename = self._request_dump_filename(urlh.geturl(), video_id)
f95b9dee 966 self.to_screen(f'Saving request to {filename}')
d41e6efc
PH
967 with open(filename, 'wb') as outf:
968 outf.write(webpage_bytes)
969
f95b9dee 970 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
4457823d 971 self.__check_blocked(content)
2410c43d 972
23be51d8 973 return content
d6983cb4 974
6edf2808 975 def __print_error(self, errnote, fatal, video_id, err):
976 if fatal:
c6e07cf1 977 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 978 elif errnote:
c6e07cf1 979 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 980
981 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
e2b38da9
PH
982 if transform_source:
983 xml_string = transform_source(xml_string)
e01c3d2e
S
984 try:
985 return compat_etree_fromstring(xml_string.encode('utf-8'))
f9934b96 986 except xml.etree.ElementTree.ParseError as ve:
6edf2808 987 self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
267ed0c5 988
6edf2808 989 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
3d3538e4 990 try:
b7c47b74 991 return json.loads(
992 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
3d3538e4 993 except ValueError as ve:
6edf2808 994 self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
3d3538e4 995
6edf2808 996 def _parse_socket_response_as_json(self, data, *args, **kwargs):
997 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 998
617f658b 999 def __create_download_methods(name, parser, note, errnote, return_value):
1000
6edf2808 1001 def parse(ie, content, *args, errnote=errnote, **kwargs):
617f658b 1002 if parser is None:
1003 return content
6edf2808 1004 if errnote is False:
1005 kwargs['errnote'] = errnote
617f658b 1006 # parser is fetched by name so subclasses can override it
1007 return getattr(ie, parser)(content, *args, **kwargs)
1008
c4910024 1009 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1010 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1011 res = self._download_webpage_handle(
1012 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1013 data=data, headers=headers, query=query, expected_status=expected_status)
617f658b 1014 if res is False:
1015 return res
1016 content, urlh = res
6edf2808 1017 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
617f658b 1018
f95b9dee 1019 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
c4910024 1020 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
f95b9dee 1021 if self.get_param('load_pages'):
1022 url_or_request = self._create_request(url_or_request, data, headers, query)
1023 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1024 self.to_screen(f'Loading request from {filename}')
1025 try:
1026 with open(filename, 'rb') as dumpf:
1027 webpage_bytes = dumpf.read()
1028 except OSError as e:
1029 self.report_warning(f'Unable to load request from disk: {e}')
1030 else:
1031 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
6edf2808 1032 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
c4910024 1033 kwargs = {
1034 'note': note,
1035 'errnote': errnote,
1036 'transform_source': transform_source,
1037 'fatal': fatal,
1038 'encoding': encoding,
1039 'data': data,
1040 'headers': headers,
1041 'query': query,
1042 'expected_status': expected_status,
1043 }
617f658b 1044 if parser is None:
c4910024 1045 kwargs.pop('transform_source')
617f658b 1046 # The method is fetched by name so subclasses can override _download_..._handle
c4910024 1047 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
617f658b 1048 return res if res is False else res[0]
1049
1050 def impersonate(func, name, return_value):
1051 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1052 func.__doc__ = f'''
1053 @param transform_source Apply this transformation before parsing
1054 @returns {return_value}
1055
1056 See _download_webpage_handle docstring for other arguments specification
1057 '''
1058
1059 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1060 impersonate(download_content, f'_download_{name}', f'{return_value}')
1061 return download_handle, download_content
1062
1063 _download_xml_handle, _download_xml = __create_download_methods(
1064 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1065 _download_json_handle, _download_json = __create_download_methods(
1066 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1067 _download_socket_json_handle, _download_socket_json = __create_download_methods(
1068 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1069 __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1070
617f658b 1071 def _download_webpage(
1072 self, url_or_request, video_id, note=None, errnote=None,
1073 fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
adddc50c 1074 """
617f658b 1075 Return the data of the page as a string.
adddc50c 1076
617f658b 1077 Keyword arguments:
1078 tries -- number of tries
1079 timeout -- sleep interval between tries
1080
1081 See _download_webpage_handle docstring for other arguments specification.
adddc50c 1082 """
617f658b 1083
1084 R''' # NB: These are unused; should they be deprecated?
1085 if tries != 1:
1086 self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1087 if timeout is NO_DEFAULT:
1088 timeout = 5
1089 else:
1090 self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1091 '''
1092
1093 try_count = 0
1094 while True:
1095 try:
1096 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
ac668111 1097 except http.client.IncompleteRead as e:
617f658b 1098 try_count += 1
1099 if try_count >= tries:
1100 raise e
1101 self._sleep(timeout, video_id)
adddc50c 1102
28f436ba 1103 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
a70635b8 1104 idstr = format_field(video_id, None, '%s: ')
28f436ba 1105 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1106 if only_once:
1107 if f'WARNING: {msg}' in self._printed_messages:
1108 return
1109 self._printed_messages.add(f'WARNING: {msg}')
1110 self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1111
a06916d9 1112 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1113 """Print msg to screen, prefixing it with '[ie_name]'"""
86e5f3ed 1114 self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1115
1116 def write_debug(self, msg, *args, **kwargs):
86e5f3ed 1117 self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1118
1119 def get_param(self, name, default=None, *args, **kwargs):
1120 if self._downloader:
1121 return self._downloader.params.get(name, default, *args, **kwargs)
1122 return default
d6983cb4 1123
d5d1df8a 1124 def report_drm(self, video_id, partial=NO_DEFAULT):
1125 if partial is not NO_DEFAULT:
1126 self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
88acdbc2 1127 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1128
d6983cb4
PH
1129 def report_extraction(self, id_or_name):
1130 """Report information extraction."""
f1a9d64e 1131 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1132
1133 def report_download_webpage(self, video_id):
1134 """Report webpage download."""
f1a9d64e 1135 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1136
1137 def report_age_confirmation(self):
1138 """Report attempt to confirm age."""
f1a9d64e 1139 self.to_screen('Confirming age')
d6983cb4 1140
fc79158d
JMF
1141 def report_login(self):
1142 """Report attempt to log in."""
f1a9d64e 1143 self.to_screen('Logging in')
fc79158d 1144
b7da73eb 1145 def raise_login_required(
9d5d4d64 1146 self, msg='This video is only available for registered users',
52efa4b3 1147 metadata_available=False, method=NO_DEFAULT):
f2ebc5c7 1148 if metadata_available and (
1149 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1150 self.report_warning(msg)
7265a219 1151 return
a70635b8 1152 msg += format_field(self._login_hint(method), None, '. %s')
46890374 1153 raise ExtractorError(msg, expected=True)
43e7d3c9 1154
b7da73eb 1155 def raise_geo_restricted(
1156 self, msg='This video is not available from your location due to geo restriction',
1157 countries=None, metadata_available=False):
f2ebc5c7 1158 if metadata_available and (
1159 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1160 self.report_warning(msg)
1161 else:
1162 raise GeoRestrictedError(msg, countries=countries)
1163
1164 def raise_no_formats(self, msg, expected=False, video_id=None):
f2ebc5c7 1165 if expected and (
1166 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1167 self.report_warning(msg, video_id)
68f5867c
L
1168 elif isinstance(msg, ExtractorError):
1169 raise msg
b7da73eb 1170 else:
1171 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1172
5f6a1245 1173 # Methods for following #608
c0d0b01f 1174 @staticmethod
311b6615 1175 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1176 """Returns a URL that points to a page that should be processed"""
311b6615 1177 if ie is not None:
1178 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
7012b23c 1179 if video_id is not None:
311b6615 1180 kwargs['id'] = video_id
830d53bf 1181 if video_title is not None:
311b6615 1182 kwargs['title'] = video_title
1183 return {
1184 **kwargs,
1185 '_type': 'url_transparent' if url_transparent else 'url',
1186 'url': url,
1187 }
1188
8f97a15d 1189 @classmethod
1190 def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1191 getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1192 return cls.playlist_result(
1193 (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1194 playlist_id, playlist_title, **kwargs)
46b18f23 1195
c0d0b01f 1196 @staticmethod
311b6615 1197 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
d6983cb4 1198 """Returns a playlist"""
d6983cb4 1199 if playlist_id:
311b6615 1200 kwargs['id'] = playlist_id
d6983cb4 1201 if playlist_title:
311b6615 1202 kwargs['title'] = playlist_title
ecc97af3 1203 if playlist_description is not None:
311b6615 1204 kwargs['description'] = playlist_description
1205 return {
1206 **kwargs,
1207 '_type': 'multi_video' if multi_video else 'playlist',
1208 'entries': entries,
1209 }
d6983cb4 1210
c342041f 1211 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1212 """
1213 Perform a regex search on the given string, using a single or a list of
1214 patterns returning the first matching group.
1215 In case of failure return a default value or raise a WARNING or a
55b3e45b 1216 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4 1217 """
61d3665d 1218 if string is None:
1219 mobj = None
77f90330 1220 elif isinstance(pattern, (str, re.Pattern)):
d6983cb4
PH
1221 mobj = re.search(pattern, string, flags)
1222 else:
1223 for p in pattern:
1224 mobj = re.search(p, string, flags)
c3415d1b
PH
1225 if mobj:
1226 break
d6983cb4 1227
ec11a9f4 1228 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
d6983cb4
PH
1229
1230 if mobj:
711ede6e
PH
1231 if group is None:
1232 # return the first matching group
1233 return next(g for g in mobj.groups() if g is not None)
198f7ea8 1234 elif isinstance(group, (list, tuple)):
1235 return tuple(mobj.group(g) for g in group)
711ede6e
PH
1236 else:
1237 return mobj.group(group)
c342041f 1238 elif default is not NO_DEFAULT:
d6983cb4
PH
1239 return default
1240 elif fatal:
f1a9d64e 1241 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 1242 else:
6a39ee13 1243 self.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
1244 return None
1245
f0bc6e20 1246 def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
8b7fb8b6 1247 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
b7c47b74 1248 """Searches string for the JSON object specified by start_pattern"""
1249 # NB: end_pattern is only used to reduce the size of the initial match
f0bc6e20 1250 if default is NO_DEFAULT:
1251 default, has_default = {}, False
1252 else:
1253 fatal, has_default = False, True
1254
1255 json_string = self._search_regex(
8b7fb8b6 1256 rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
f0bc6e20 1257 string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1258 if not json_string:
1259 return default
1260
1261 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1262 try:
1263 return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1264 except ExtractorError as e:
1265 if fatal:
1266 raise ExtractorError(
1267 f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1268 elif not has_default:
1269 self.report_warning(
1270 f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1271 return default
b7c47b74 1272
c342041f 1273 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1274 """
1275 Like _search_regex, but strips HTML tags and unescapes entities.
1276 """
711ede6e 1277 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
08e29b9f 1278 if isinstance(res, tuple):
edfc7725 1279 return tuple(map(clean_html, res))
1280 return clean_html(res)
d6983cb4 1281
2118fdd1
RA
1282 def _get_netrc_login_info(self, netrc_machine=None):
1283 username = None
1284 password = None
1285 netrc_machine = netrc_machine or self._NETRC_MACHINE
1286
a06916d9 1287 if self.get_param('usenetrc', False):
2118fdd1 1288 try:
0001fcb5 1289 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1290 if os.path.isdir(netrc_file):
1291 netrc_file = os.path.join(netrc_file, '.netrc')
1292 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1293 if info is not None:
1294 username = info[0]
1295 password = info[2]
1296 else:
dcce092e
S
1297 raise netrc.NetrcParseError(
1298 'No authenticators for %s' % netrc_machine)
86e5f3ed 1299 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1300 self.report_warning(
dcce092e 1301 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1302
dcce092e 1303 return username, password
2118fdd1 1304
1b6712ab 1305 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1306 """
cf0649f8 1307 Get the login info as (username, password)
32443dd3
S
1308 First look for the manually specified credentials using username_option
1309 and password_option as keys in params dictionary. If no such credentials
1310 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1311 value.
fc79158d
JMF
1312 If there's no info available, return (None, None)
1313 """
fc79158d
JMF
1314
1315 # Attempt to use provided username and password or .netrc data
a06916d9 1316 username = self.get_param(username_option)
1317 if username is not None:
1318 password = self.get_param(password_option)
2118fdd1 1319 else:
1b6712ab 1320 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1321
2133565c 1322 return username, password
fc79158d 1323
e64b7569 1324 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1325 """
1326 Get the two-factor authentication info
1327 TODO - asking the user will be required for sms/phone verify
1328 currently just uses the command line option
1329 If there's no info available, return None
1330 """
83317f69 1331
a06916d9 1332 tfa = self.get_param('twofactor')
1333 if tfa is not None:
1334 return tfa
83317f69 1335
ac668111 1336 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1337
46720279
JMF
1338 # Helper functions for extracting OpenGraph info
1339 @staticmethod
ab2d5247 1340 def _og_regexes(prop):
45b2ee6f 1341 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
fbfde1c3
F
1342 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1343 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1344 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1345 return [
78fb87b2
JMF
1346 template % (property_re, content_re),
1347 template % (content_re, property_re),
ab2d5247 1348 ]
46720279 1349
864f24bd
S
1350 @staticmethod
1351 def _meta_regex(prop):
1352 return r'''(?isx)<meta
8b9848ac 1353 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1354 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1355
3c4e6d83 1356 def _og_search_property(self, prop, html, name=None, **kargs):
6606817a 1357 prop = variadic(prop)
46720279 1358 if name is None:
b070564e
S
1359 name = 'OpenGraph %s' % prop[0]
1360 og_regexes = []
1361 for p in prop:
1362 og_regexes.extend(self._og_regexes(p))
1363 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1364 if escaped is None:
1365 return None
1366 return unescapeHTML(escaped)
46720279
JMF
1367
1368 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1369 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1370
1371 def _og_search_description(self, html, **kargs):
1372 return self._og_search_property('description', html, fatal=False, **kargs)
1373
04f3fd2c 1374 def _og_search_title(self, html, *, fatal=False, **kargs):
1375 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1376
8ffa13e0 1377 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1378 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1379 if secure:
1380 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1381 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1382
78338f71
JMF
1383 def _og_search_url(self, html, **kargs):
1384 return self._og_search_property('url', html, **kargs)
1385
04f3fd2c 1386 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1387 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1388
40c696e5 1389 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
6606817a 1390 name = variadic(name)
59040888 1391 if display_name is None:
88d9f6c0 1392 display_name = name[0]
59040888 1393 return self._html_search_regex(
88d9f6c0 1394 [self._meta_regex(n) for n in name],
711ede6e 1395 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1396
def _dc_search_uploader(self, html):
    """Look up the 'dc.creator' meta tag, reported as 'uploader'."""
    meta_name, display_name = 'dc.creator', 'uploader'
    return self._html_search_meta(meta_name, html, display_name)
1399
8f97a15d 1400 @staticmethod
1401 def _rta_search(html):
8dbe9899
PH
1402 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1403 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1404 r' content="RTA-5042-1996-1400-1577-RTA"',
1405 html):
1406 return 18
8f97a15d 1407
1408 # And then there are the jokers who advertise that they use RTA, but actually don't.
1409 AGE_LIMIT_MARKERS = [
1410 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
32a84bcf
SS
1411 r'>[^<]*you acknowledge you are at least (\d+) years old',
1412 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
8f97a15d 1413 ]
32a84bcf
SS
1414
1415 age_limit = 0
1416 for marker in AGE_LIMIT_MARKERS:
1417 mobj = re.search(marker, html)
1418 if mobj:
1419 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1420 return age_limit
8dbe9899 1421
59040888
PH
def _media_rating_search(self, html):
    """
    Map the 'rating' meta tag of html to a number.

    Returns None when the tag is missing/empty or its lower-cased value
    is not one of the known ratings.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }

    rating = self._html_search_meta('rating', html)
    return RATING_TABLE.get(rating.lower()) if rating else None
59040888 1437
def _family_friendly_search(self, html):
    """
    Map the 'isFamilyFriendly' meta tag of html to a number.

    '1'/'true' give 0 and '0'/'false' give 18 (compared lower-cased);
    a missing tag or any other value gives None.
    """
    # See http://schema.org/VideoObject
    RATING_TABLE = {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }

    family_friendly = self._html_search_meta(
        'isFamilyFriendly', html, default=None)
    return RATING_TABLE.get(family_friendly.lower()) if family_friendly else None
69319969 1453
0c708f11
JMF
def _twitter_search_player(self, html):
    """Look up the 'twitter:player' meta tag, reported as 'twitter card player'."""
    meta_name, display_name = 'twitter:player', 'twitter card player'
    return self._html_search_meta(meta_name, html, display_name)
0c708f11 1457
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html"""
    # Supplying a default turns parse errors non-fatal
    effective_fatal = fatal if default is NO_DEFAULT else False
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=effective_fatal)
        # Only dict entries are yielded; anything else is dropped
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1467
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html"""
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False

    candidates = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(candidates, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info

    # Nothing usable was found: default wins, then fatal, then warn
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
4ca2a3cf 1484
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build an info dict from JSON-LD data.

    @param json_ld        A JSON string (parsed with _parse_json), or already
                          parsed data: a dict or an iterable of entries
    @param video_id       Passed to _parse_json when json_ld is a string
    @param fatal          Passed to _parse_json when json_ld is a string
    @param expected_type  If given, top-level entries whose '@type' does not
                          include it are skipped, and traversal stops after
                          the first entry that is processed
    @returns              The collected fields passed through filter_dict;
                          {} when json_ld is empty
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    # Shared accumulator: every nested helper below writes into this dict
    info = {}

    # Maps the last path component of an interactionType to the prefix of
    # the '<prefix>_count' field it populates
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # '@type' may be a single value or several; match if any expected type is present
        type = variadic(traverse_obj(e, '@type'))
        return any(x in type for x in expected_types)

    def extract_interaction_type(e):
        # interactionType is either given directly or as a dict carrying '@type'
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Fill '<kind>_count' fields from InteractionCounter entries;
        # the first value found for each field is kept
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # Chapters come from 'hasPart' entries whose '@type' is 'Clip'
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
        # Missing end/start times are borrowed from the neighbouring chapters.
        # zip() stops at its shortest input (chapters[1:]), so the last chapter
        # is not visited here; its end_time is handled after the loop
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # Relies on 'duration' already being present in info; extract_video_object
            # sets that key before calling this helper
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        # Copy the media fields of a VideoObject/AudioObject-like entry into info
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        extract_interaction_statistic(e)
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            # Top-level entries must carry '@context'
            if at_top_level and '@context' not in e:
                continue
            # An entry holding nothing but '@context' and '@graph' is a container:
            # descend into the graph without the '@context' requirement
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                # With an expected type, the first processed entry ends the traversal
                if expected_type is None:
                    continue
                else:
                    break
            # Any entry type may embed a VideoObject under 'video'
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)
4ca2a3cf 1654
def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
    """
    Locate the <script id="__NEXT_DATA__"> payload in webpage and parse it as JSON.

    kw is forwarded to _search_regex; transform_source and fatal are
    forwarded to _parse_json.
    """
    next_data_re = r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>'
    payload = self._search_regex(next_data_re, webpage, 'next.js data', fatal=fatal, **kw)
    return self._parse_json(payload, video_id, transform_source=transform_source, fatal=fatal)
f98709af 1661
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
    escaped_context = re.escape(context_name)
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    patterns = (
        rf'<script>\s*window\.{escaped_context}={FUNCTION_RE}\s*\)\s*;?\s*</script>',
        rf'{escaped_context}\(.*?{FUNCTION_RE}',
    )
    not_found = NO_DEFAULT if fatal else (None, None, None)
    js, arg_keys, arg_vals = self._search_regex(
        patterns, webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), default=not_found)
    if js is None:
        return {}

    # Pair each parameter name of the wrapper function with the JSON text of
    # the value it was called with, so js_to_json can substitute them
    param_names = arg_keys.split(',')
    param_values = self._parse_json(
        f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()
    js_vars = dict(zip(param_names, map(json.dumps, param_values)))

    to_json = functools.partial(js_to_json, vars=js_vars)
    parsed = self._parse_json(js, video_id, transform_source=to_json, fatal=fatal)
    return traverse_obj(parsed, traverse) or {}
66f4c04e 1678
@staticmethod
def _hidden_inputs(html):
    """
    Collect name -> value for the <input> tags of type 'hidden' or 'submit' in html.

    HTML comments are stripped first. The key is the tag's 'name', falling
    back to its 'id'; tags without a key or without a 'value' attribute are
    skipped. Later tags overwrite earlier ones with the same key.
    """
    without_comments = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    collected = {}
    for tag in re.findall(r'(?i)(<input[^>]+>)', without_comments):
        attrs = extract_attributes(tag)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        key = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if key and value is not None:
            collected[key] = value
    return collected
27713812 1694
cf61d96d
S
def _form_hidden_inputs(self, form_id, html):
    """
    Return the hidden inputs of the <form> whose id attribute is form_id.

    form_id is interpolated into the search regex as-is (it is not escaped).
    """
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
1700
@classproperty(cache=True)
def FormatSort(cls):
    """
    Deprecated accessor that returns a FormatSorter subclass.

    Emits a deprecation warning pointing at yt_dlp.utils.FormatSorter
    each time this function body runs.
    """
    class FormatSort(FormatSorter):
        # NOTE(review): the first parameter is named `ie` but is the FormatSort
        # instance itself; `ie._downloader` is therefore read from that
        # instance - confirm this is intended
        def __init__(ie, *args, **kwargs):
            super().__init__(ie._downloader, *args, **kwargs)

    deprecation_warning(
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')
    return FormatSort
eb8a4433 1711
def _sort_formats(self, formats, field_preference=[]):
    """
    Deprecated; only emits a deprecation warning through the downloader.

    When field_preference is non-empty it is additionally stored under
    '__sort_fields' on the first format (if there is one).
    """
    warn = self._downloader.deprecation_warning
    if field_preference:
        warn(
            'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
            'Return _format_sort_fields in the info_dict instead')
        if formats:
            formats[0]['__sort_fields'] = field_preference
    else:
        warn('yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
59040888 1722
96a53167
S
def _check_formats(self, formats, video_id):
    """Remove, in place, every format whose 'url' is rejected by _is_valid_url."""
    if not formats:
        return

    def describe(fmt):
        # Label used in the "Checking ... URL" message
        format_id = fmt.get('format_id')
        return '%s video format' % format_id if format_id else 'video'

    formats[:] = [
        fmt for fmt in formats
        if self._is_valid_url(fmt['url'], video_id, item=describe(fmt))]
1730
f5bdb444
S
1731 @staticmethod
1732 def _remove_duplicate_formats(formats):
1733 format_urls = set()
1734 unique_formats = []
1735 for f in formats:
1736 if f['url'] not in format_urls:
1737 format_urls.add(f['url'])
1738 unique_formats.append(f)
1739 formats[:] = unique_formats
1740
def _is_valid_url(self, url, video_id, item='video', headers={}):
    """
    Check a URL by requesting it.

    Returns True when the request succeeds or the URL is not http(s);
    returns False (after reporting to screen) when the request raises
    ExtractorError.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not url.startswith(('http://', 'https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
    except ExtractorError as err:
        self.to_screen(
            '%s: %s URL is invalid, skipping: %s'
            % (video_id, item, error_to_compat_str(err.cause)))
        return False
    return True
96a53167 1754
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1761
def _proto_relative_url(self, url, scheme=None):
    """
    Pass url through sanitize_url with the given scheme.

    scheme must end with ':' (e.g. 'https:'); when falsy, http_scheme()
    is used. The trailing colon is removed before calling sanitize_url.
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    bare_scheme = scheme[:-1]
    return sanitize_url(url, scheme=bare_scheme)
57c7411f 1766
4094b6e3
PH
def _sleep(self, timeout, video_id, msg_template=None):
    """
    Print a waiting message and sleep for timeout seconds.

    msg_template is %-formatted with the keys 'video_id' and 'timeout'.
    """
    template = '%(video_id)s: Waiting for %(timeout)s seconds' if msg_template is None else msg_template
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
1773
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None, data=None, headers={}, query={}):
    """
    Download an f4m manifest and parse it with _parse_f4m_formats.

    Errors are non-fatal when the 'ignore_no_formats_error' param is set.
    Returns [] if the download yields False; otherwise the manifest is
    parsed relative to the final URL reported by the response handle.
    """
    must_succeed = False if self.get_param('ignore_no_formats_error') else fatal

    download = self._download_xml_handle(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=must_succeed, data=data, headers=headers, query=query)
    if download is False:
        return []

    manifest, urlh = download
    final_url = urlh.geturl()

    return self._parse_f4m_formats(
        manifest, final_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
        transform_source=transform_source, fatal=must_succeed, m3u8_id=m3u8_id)
0fdbb332 1796
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """
    Build a list of formats from a parsed f4m manifest.

    @param manifest       The manifest as an xml.etree.ElementTree.Element
    @param manifest_url   URL of this manifest; relative media URLs are
                          resolved against it when the manifest gives no base URL
    @param f4m_id         Prefix for the generated format ids
    @param m3u8_id        Passed on when a media entry points to an m3u8 playlist
    @param fatal          Passed on to nested manifest downloads; when false, a
                          manifest that is not an Element yields []
    @returns              A list of format dicts ([] if nothing usable was found)
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An element without text has .text == None; treat that as "no challenge"
    akamai_pv_text = (akamai_pv.text or '') if akamai_pv is not None else ''
    if ';' in akamai_pv_text:
        playerVerificationChallenge = akamai_pv_text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL reported for this media entry. Kept separate from manifest_url so
        # that every relative media URL is resolved against the parent manifest
        # and not against the URL of the previously processed entry
        media_manifest_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            media_manifest_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(media_manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    media_manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_manifest_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': media_manifest_url,
            'manifest_url': media_manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
1898
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """
    Build the format dict describing the m3u8 playlist URL itself.

    Its preference is the given preference lowered by 100, or -100 when
    no (or a zero) preference is given.
    """
    meta_format = {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
    }
    meta_format['preference'] = preference - 100 if preference else -100
    meta_format.update({
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    })
    return meta_format
1910
def _report_ignoring_subs(self, name):
    """Warn, with only_once=True, that subtitle tracks of the named manifest type are being ignored."""
    message = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(message, only_once=True)
1916
a0c3b2d5
F
def _extract_m3u8_formats(self, *args, **kwargs):
    """
    Like _extract_m3u8_formats_and_subtitles, but return only the formats.

    A warning is reported if the manifest also provided subtitles.
    """
    formats, subtitles = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('HLS')
    return formats
1922
def _extract_m3u8_formats_and_subtitles(
        self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, note=None,
        errnote=None, fatal=True, live=False, data=None, headers={},
        query={}):
    """
    Download an m3u8 playlist and parse it into (formats, subtitles).

    Errors are non-fatal when the 'ignore_no_formats_error' param is set.
    A falsy m3u8_url raises ExtractorError (fatal) or reports a warning,
    unless errnote is False, in which case it is silent. ([], {}) is
    returned whenever no playlist could be obtained.
    """
    must_succeed = False if self.get_param('ignore_no_formats_error') else fatal

    if not m3u8_url:
        if errnote is False:
            return [], {}
        message = errnote or 'Failed to obtain m3u8 URL'
        if must_succeed:
            raise ExtractorError(message, video_id=video_id)
        self.report_warning(f'{message}{bug_reports_message()}')
        return [], {}

    download_note = 'Downloading m3u8 information' if note is None else note
    download_errnote = 'Failed to download m3u8 information' if errnote is None else errnote
    download = self._download_webpage_handle(
        m3u8_url, video_id, note=download_note, errnote=download_errnote,
        fatal=must_succeed, data=data, headers=headers, query=query)
    if download is False:
        return [], {}

    m3u8_doc, urlh = download
    # Use the final URL reported by the response handle from here on
    final_url = urlh.geturl()

    return self._parse_m3u8_formats_and_subtitles(
        m3u8_doc, final_url, ext=ext, entry_protocol=entry_protocol,
        preference=preference, quality=quality, m3u8_id=m3u8_id,
        note=note, errnote=errnote, fatal=must_succeed, live=live, data=data,
        headers=headers, query=query, video_id=video_id)
cb252080 1957
def _parse_m3u8_formats_and_subtitles(
        self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, live=False, note=None,
        errnote=None, fatal=True, data=None, headers={}, query={},
        video_id=None):
    """
    Parse the text of an m3u8 playlist into (formats, subtitles).

    @param m3u8_doc   The playlist text
    @param m3u8_url   URL of the playlist, used to resolve relative URIs. For a
                      media playlist without URL, the document itself is
                      embedded as a data URI
    @param m3u8_id    Prefix for the generated format ids
    @param live       When true, the stream name/bitrate is left out of format ids
    @param fatal, data, headers, video_id
                      Used only for the extra playlist downloads made when the
                      'hls_split_discontinuity' param is set
    @returns          A tuple (formats, subtitles): a list of format dicts and a
                      dict mapping language to a list of subtitle dicts
    """
    formats, subtitles = [], {}

    # Stored as a plain bool: the value is copied into every format dict, and a
    # re.Match object there is not JSON-serialisable
    has_drm = bool(re.search('|'.join([
        r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
        r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
    ]), m3u8_doc))

    def format_url(url):
        return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

    if self.get_param('hls_split_discontinuity', False):
        def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
            # One index per section delimited by #EXT-X-DISCONTINUITY; the
            # playlist is downloaded first if only its URL is known
            if not m3u8_doc:
                if not manifest_url:
                    return []
                m3u8_doc = self._download_webpage(
                    manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                    note=False, errnote='Failed to download m3u8 playlist information')
                if m3u8_doc is False:
                    return []
            return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

    else:
        def _extract_m3u8_playlist_indices(*args, **kwargs):
            return [None]

    # References:
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
    # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
    # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

    # We should try extracting formats only from master playlists [1, 4.3.4],
    # i.e. playlists that describe available qualities. On the other hand
    # media playlists [1, 4.3.3] should be returned as is since they contain
    # just the media without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
    # master playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
    # media playlist and MUST NOT appear in master playlist thus we can
    # clearly detect media playlist with this criterion.

    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        formats = [{
            'format_id': join_nonempty(m3u8_id, idx),
            'format_index': idx,
            'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
            'quality': quality,
            'has_drm': has_drm,
        } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

        return formats, subtitles

    groups = {}
    last_stream_inf = {}

    def extract_media(x_media_line):
        media = parse_m3u8_attributes(x_media_line)
        # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
        media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
        if not (media_type and group_id and name):
            return
        groups.setdefault(group_id, []).append(media)
        # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
        if media_type == 'SUBTITLES':
            # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
            # EXT-X-MEDIA tag if the media type is SUBTITLES.
            # However, lack of URI has been spotted in the wild.
            # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
            if not media.get('URI'):
                return
            url = format_url(media['URI'])
            sub_info = {
                'url': url,
                'ext': determine_ext(url),
            }
            if sub_info['ext'] == 'm3u8':
                # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                # files may contain is WebVTT:
                # <https://tools.ietf.org/html/rfc8216#section-3.1>
                sub_info['ext'] = 'vtt'
                sub_info['protocol'] = 'm3u8_native'
            lang = media.get('LANGUAGE') or 'und'
            subtitles.setdefault(lang, []).append(sub_info)
        if media_type not in ('VIDEO', 'AUDIO'):
            return
        media_url = media.get('URI')
        if media_url:
            manifest_url = format_url(media_url)
            formats.extend({
                'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                'format_note': name,
                'format_index': idx,
                'url': manifest_url,
                'manifest_url': m3u8_url,
                'language': media.get('LANGUAGE'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
                'vcodec': 'none' if media_type == 'AUDIO' else None,
            } for idx in _extract_m3u8_playlist_indices(manifest_url))

    def build_stream_name():
        # Despite specification does not mention NAME attribute for
        # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
        # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
        # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
        stream_name = last_stream_inf.get('NAME')
        if stream_name:
            return stream_name
        # If there is no NAME in EXT-X-STREAM-INF it will be obtained
        # from corresponding rendition group
        stream_group_id = last_stream_inf.get('VIDEO')
        if not stream_group_id:
            return
        stream_group = groups.get(stream_group_id)
        if not stream_group:
            return stream_group_id
        rendition = stream_group[0]
        return rendition.get('NAME') or stream_group_id

    # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
    # chance to detect video only formats when EXT-X-STREAM-INF tags
    # precede EXT-X-MEDIA tags in HLS manifest such as [3].
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)

    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_stream_inf = parse_m3u8_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            tbr = float_or_none(
                last_stream_inf.get('AVERAGE-BANDWIDTH')
                or last_stream_inf.get('BANDWIDTH'), scale=1000)
            manifest_url = format_url(line.strip())

            for idx in _extract_m3u8_playlist_indices(manifest_url):
                format_id = [m3u8_id, None, idx]
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    stream_name = build_stream_name()
                    format_id[1] = stream_name or '%d' % (tbr or len(formats))
                f = {
                    'format_id': join_nonempty(*format_id),
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected. E.g. [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                if not f.get('ext'):
                    f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

            last_stream_inf = {}
    return formats, subtitles
704df56d 2180
3cf4b91d
C
def _extract_m3u8_vod_duration(
        self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """Download an m3u8 VOD playlist and return its duration.

    The playlist is fetched with ``fatal=False``; a falsy download result
    is replaced by an empty string before being handed to
    ``_parse_m3u8_vod_duration``, whose result is returned unchanged.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if note is None:
        note = 'Downloading m3u8 VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    playlist = self._download_webpage(
        m3u8_vod_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if not playlist:
        playlist = ''

    return self._parse_m3u8_vod_duration(playlist, video_id)
2191
2192 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
5ab3534d 2193 if '#EXT-X-ENDLIST' not in m3u8_vod:
3cf4b91d
C
2194 return None
2195
2196 return int(sum(
2197 float(line[len('#EXTINF:'):].split(',')[0])
2198 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2199
def _extract_mpd_vod_duration(
        self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """Download an MPD manifest and return its mediaPresentationDuration.

    The manifest is fetched with ``fatal=False``; a falsy download result
    is replaced by an empty dict so the attribute lookup yields None.
    The attribute is passed through ``parse_duration`` and ``int_or_none``.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if note is None:
        note = 'Downloading MPD VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    manifest = self._download_xml(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if not manifest:
        manifest = {}

    raw_duration = manifest.get('mediaPresentationDuration')
    return int_or_none(parse_duration(raw_duration))
2209
a107193e
S
2210 @staticmethod
2211 def _xpath_ns(path, namespace=None):
2212 if not namespace:
2213 return path
2214 out = []
2215 for c in path.split('/'):
2216 if not c or c == '.':
2217 out.append(c)
2218 else:
2219 out.append('{%s}%s' % (namespace, c))
2220 return '/'.join(out)
2221
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download a SMIL document and return ``(formats, subtitles)``.

    ``fatal`` is forced to False when the 'ignore_no_formats_error' param
    is set. When the download returns False, ``([], {})`` is returned.
    Otherwise the formats and subtitles are parsed against the URL
    reported by the response handle.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    downloaded = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if downloaded is False:
        assert not fatal
        return [], {}

    smil_doc, handle = downloaded
    final_url = handle.geturl()
    ns = self._parse_smil_namespace(smil_doc)

    return (
        self._parse_smil_formats(
            smil_doc, final_url, video_id, namespace=ns, f4m_params=f4m_params),
        self._parse_smil_subtitles(smil_doc, namespace=ns),
    )
2242
def _extract_smil_formats(self, *args, **kwargs):
    """Formats-only wrapper around ``_extract_smil_formats_and_subtitles``.

    Any subtitles found are dropped after reporting it via
    ``_report_ignoring_subs('SMIL')``.
    """
    formats, dropped_subtitles = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('SMIL')
    return formats
a107193e
S
2248
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the info dict built by ``_parse_smil``.

    Returns an empty dict when the download returns False. The document
    is parsed against the URL reported by the response handle.
    """
    downloaded = self._download_smil(smil_url, video_id, fatal=fatal)
    if downloaded is False:
        return {}

    smil_doc, handle = downloaded
    return self._parse_smil(
        smil_doc, handle.geturl(), video_id, f4m_params=f4m_params)
2258
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch a SMIL file through ``_download_xml_handle`` and return its result."""
    return self._download_xml_handle(
        smil_url, video_id,
        note='Downloading SMIL file',
        errnote='Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
a107193e
S
2263
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    The returned 'id' is the basename of ``smil_url`` without extension
    (the ``video_id`` argument is only used while extracting formats).
    Title, description and upload date come from the first non-empty
    matching ``./head/meta`` element; the title falls back to the id.
    """
    ns = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=ns)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', ns)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        # Only the first usable value of each kind is kept
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            upload_date = upload_date or unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', ns)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2303
17712eeb
S
def _parse_smil_namespace(self, smil):
    """Extract the namespace from the root tag of a SMIL document.

    The root tag is matched case-insensitively against '{namespace}smil'
    via ``_search_regex`` with ``default=None``.
    """
    root_tag = smil.tag
    namespace_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(namespace_re, root_tag, 'namespace', default=None)
2307
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Collect format dicts from the media elements of a SMIL document.

    @param smil                Parsed SMIL document (root element)
    @param smil_url            URL of the document; used as the base for
                               relative sources unless a ./head/meta element
                               carries a 'base' or 'httpBase' attribute
    @param video_id            Passed through to the download helpers
    @param namespace           XML namespace applied to all element lookups
    @param f4m_params          Query parameters appended to f4m manifest
                               URLs; a built-in default is used when falsy
    @param transform_rtmp_url  Optional callable (streamer, src) ->
                               (streamer, src) applied to RTMP formats
    @returns                   List of format dicts. <video>/<audio>
                               elements are handled first, followed by
                               <imagestream> storyboards. Each distinct
                               'src' is processed only once.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        # Only fall back to a HEAD request when neither the URL nor the
        # 'ext' attribute reveals the extension
        src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
            self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A single-variant playlist inherits the metadata declared
            # on the SMIL element itself
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Validate the resolved URL that is actually emitted below, not the
        # raw 'src' attribute (which may be relative or carry whitespace)
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats
2422
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitles from the <textstream> elements of a SMIL document.

    Returns a dict mapping language to a list of {'url', 'ext'} dicts.
    Each distinct 'src' is used once. The language is taken from the
    element's 'systemLanguage', 'systemLanguageName' or 'lang' attribute,
    falling back to ``subtitles_lang``.
    """
    seen_srcs = []
    subtitles = {}
    for stream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = stream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.append(src)

        ext = (
            stream.get('ext')
            or mimetype2ext(stream.get('type'))
            or determine_ext(src))
        lang = (
            stream.get('systemLanguage')
            or stream.get('systemLanguageName')
            or stream.get('lang')
            or subtitles_lang)

        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
63757032 2438
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return its entries.

    Returns an empty list when the download returns False. Otherwise the
    document is parsed by ``_parse_xspf`` against the URL reported by the
    response handle.
    """
    res = self._download_xml_handle(
        xspf_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if res is False:
        return []

    xspf, urlh = res
    xspf_url = urlh.geturl()

    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))
8d6765cf 2452
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn the tracks of a parsed XSPF document into a list of entries.

    Every entry uses ``playlist_id`` as its 'id'; the title falls back to
    ``playlist_id`` as well. The track duration is scaled down by 1000.
    Locations whose URL cannot be joined with ``xspf_base_url`` are skipped.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        entry = {
            'id': playlist_id,
            'title': xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id),
            'description': xpath_text(track, ns('./xspf:annotation'), 'description'),
            'thumbnail': xpath_text(track, ns('./xspf:image'), 'thumbnail'),
            'duration': float_or_none(
                xpath_text(track, ns('./xspf:duration'), 'duration'), 1000),
            'formats': [],
        }

        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                entry['formats'].append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })

        entries.append(entry)
    return entries
2492
171e59ed
F
def _extract_mpd_formats(self, *args, **kwargs):
    """Formats-only wrapper around ``_extract_mpd_formats_and_subtitles``.

    Any subtitles found are dropped after reporting it via
    ``_report_ignoring_subs('DASH')``.
    """
    formats, dropped_subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2498
def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """Download an MPD manifest and return ``(formats, subtitles)``.

    ``fatal`` is forced to False when the 'ignore_no_formats_error' param
    is set. ``([], {})`` is returned when the download returns False or
    the parsed document is None.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False
    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    downloaded = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if downloaded is False:
        return [], {}
    manifest, handle = downloaded
    if manifest is None:
        return [], {}

    # The request may have been redirected, so use the URL the manifest
    # was finally served from, both as manifest URL and to derive the base.
    final_url = handle.geturl()
    return self._parse_mpd_formats_and_subtitles(
        manifest, mpd_id, base_url(final_url), final_url)
2d2fa82d 2523
171e59ed
F
def _parse_mpd_formats(self, *args, **kwargs):
    """Formats-only wrapper around ``_parse_mpd_formats_and_subtitles``.

    Any subtitles found are dropped after reporting it via
    ``_report_ignoring_subs('DASH')``.
    """
    formats, dropped_subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2529
def _parse_mpd_formats_and_subtitles(
        self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.
    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

    @param mpd_doc       Parsed MPD document (root element)
    @param mpd_id        Optional prefix for the generated format ids
    @param mpd_base_url  Base that relative BaseURL values are resolved against
    @param mpd_url       URL of the manifest itself; may be empty when the
                         manifest was parsed from a string
    @returns             (formats, subtitles). Video, audio and JPEG
                         storyboard representations become formats; text
                         representations are grouped by language into
                         subtitles. ([], {}) is returned for a 'dynamic'
                         manifest when the 'dynamic_mpd' param is disabled.
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            return [], {}

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent level's info so that values set
        # on this element override inherited ones without mutating them
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats, subtitles = [], {}
    # Counts how many formats share the same URL, per URL
    stream_numbers = collections.defaultdict(int)
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Assemble the base URL from the innermost element outwards,
                # stopping as soon as it becomes absolute
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First of, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    t = ''
                    in_template = False
                    for c in tmpl:
                        t += c
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            t += c
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally, unescape $$ to a literal $. str.replace returns
                    # a new string, so the result has to be assigned back.
                    t = t.replace('$$', '$')
                    return t

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for s in representation_ms_info['s']:
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for _ in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for _ in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    formats.append(f)
                elif content_type == 'text':
                    subtitles.setdefault(lang or 'und', []).append(f)

    return formats, subtitles
17b598d3 2867
fd76a142
F
def _extract_ism_formats(self, *args, **kwargs):
    """Formats-only wrapper around ``_extract_ism_formats_and_subtitles``.

    Any subtitles found are dropped after reporting it via
    ``_report_ignoring_subs('ISM')``.
    """
    formats, dropped_subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('ISM')
    return formats
2873
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and return ``(formats, subtitles)``.

    ``fatal`` is forced to False when the 'ignore_no_formats_error' param
    is set. ``([], {})`` is returned when the download returns False or
    the parsed document is None. The manifest is parsed against the URL
    reported by the response handle.

    @param note     Progress message; a default is used when None
    @param errnote  Error message; a default is used when None
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False
    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    downloaded = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if downloaded is False:
        return [], {}
    manifest, handle = downloaded
    if manifest is None:
        return [], {}

    return self._parse_ism_formats_and_subtitles(manifest, handle.geturl(), ism_id)
b2758123 2890
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.
    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx

    @param ism_doc  Root element of the parsed manifest
    @param ism_url  URL of the manifest; used as the URL of every
                    format and as the base for the fragment URLs
    @param ism_id   Optional prefix for the generated format ids
    @returns        (formats, subtitles); both empty for live manifests
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
    # Fallback FourCC values for tracks that only declare an AudioTag
    KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time of
                    # the NEXT fragment in the list (not from children of the
                    # current <c> element), falling back to the end of the
                    # presentation when there is no next fragment or it
                    # carries no start time
                    next_fragment_time = None
                    if stream_fragment_index + 1 < len(stream_fragments):
                        next_fragment_time = int_or_none(
                            stream_fragments[stream_fragment_index + 1].get('t'))
                    if next_fragment_time is None:
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    # A Protection element in the manifest marks the content as DRM protected
                    'has_drm': ism_doc.find('Protection') is not None,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
b2758123 3001
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from <video>/<audio> style tags found in webpage.

    Returns a list of dicts, one per media tag that yielded at least one
    format or subtitle. Each dict has 'formats', 'subtitles' and
    'thumbnail' keys. Relative URLs are resolved against base_url, which
    is also set as the Referer header of every format.
    """
    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Split a "type" attribute into mimetype and optional codecs and
        # turn them into format fields; {} when absent or unparsable
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
        # by the dedicated extractors, anything else becomes a single
        # direct-URL format
        type_info = type_info or {}
        full_url = absolute_url(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
                'ext': ext,
            }]
        return is_plain_url, formats

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags have no content, hence the '' appended as 4th item
    media_tags = [(media_tag, media_tag_name, media_type, '')
                  for media_tag, media_tag_name, media_type
                  in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if is_plain_url:
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        s_attr.get(lbl)
                        for lbl in ('label', 'title')
                        if str_or_none(s_attr.get(lbl))
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = (int_or_none(s_attr.get('height'))
                              or int_or_none(s_attr.get('res')))
                    if not width or not height:
                        # Fill in missing dimensions from the labels
                        for lbl in labels:
                            resolution = parse_resolution(lbl)
                            if not resolution:
                                continue
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                    # Take the bitrate from the first label that provides one;
                    # the for/else leaves tbr as None when none does
                    for lbl in labels:
                        tbr = parse_bitrate(lbl)
                        if tbr:
                            break
                    else:
                        tbr = None
                    f.update({
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                    })
                    # The direct-URL format fields (url, vcodec, ext) take
                    # precedence over what was derived from the attributes
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are treated like subtitles
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        for f in media_info['formats']:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
3127
f6a1d69a
F
3128 def _extract_akamai_formats(self, *args, **kwargs):
3129 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3130 if subs:
b5ae35ee 3131 self._report_ignoring_subs('akamai')
f6a1d69a
F
3132 return fmts
3133
3134 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3135 signed = 'hdnea=' in manifest_url
3136 if not signed:
3137 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3138 manifest_url = re.sub(
3139 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3140 '', manifest_url).strip('?')
3141
c7c43a93 3142 formats = []
f6a1d69a 3143 subtitles = {}
70c5802b 3144
e71a4509 3145 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3146 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3147 hds_host = hosts.get('hds')
3148 if hds_host:
3149 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3150 if 'hdcore=' not in f4m_url:
3151 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3152 f4m_formats = self._extract_f4m_formats(
3153 f4m_url, video_id, f4m_id='hds', fatal=False)
3154 for entry in f4m_formats:
3155 entry.update({'extra_param_to_segment_url': hdcore_sign})
3156 formats.extend(f4m_formats)
70c5802b 3157
c4251b9a
RA
3158 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3159 hls_host = hosts.get('hls')
3160 if hls_host:
3161 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3162 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3163 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3164 m3u8_id='hls', fatal=False)
3165 formats.extend(m3u8_formats)
f6a1d69a 3166 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3167
3168 http_host = hosts.get('http')
29f7c58a 3169 if http_host and m3u8_formats and not signed:
3170 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3171 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3172 qualities_length = len(qualities)
29f7c58a 3173 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3174 i = 0
29f7c58a 3175 for f in m3u8_formats:
3176 if f['vcodec'] != 'none':
70c5802b 3177 for protocol in ('http', 'https'):
3178 http_f = f.copy()
3179 del http_f['manifest_url']
3180 http_url = re.sub(
86e5f3ed 3181 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3182 http_f.update({
3183 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3184 'url': http_url,
3185 'protocol': protocol,
3186 })
29f7c58a 3187 formats.append(http_f)
70c5802b 3188 i += 1
70c5802b 3189
f6a1d69a 3190 return formats, subtitles
c7c43a93 3191
6ad02195 3192 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3193 query = urllib.parse.urlparse(url).query
6ad02195 3194 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3195 mobj = re.search(
3196 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3197 url_base = mobj.group('url')
3198 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3199 formats = []
044eeb14
S
3200
3201 def manifest_url(manifest):
86e5f3ed 3202 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3203 if query:
3204 m_url += '?%s' % query
3205 return m_url
3206
6ad02195
RA
3207 if 'm3u8' not in skip_protocols:
3208 formats.extend(self._extract_m3u8_formats(
044eeb14 3209 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3210 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3211 if 'f4m' not in skip_protocols:
3212 formats.extend(self._extract_f4m_formats(
044eeb14 3213 manifest_url('manifest.f4m'),
6ad02195 3214 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3215 if 'dash' not in skip_protocols:
3216 formats.extend(self._extract_mpd_formats(
044eeb14 3217 manifest_url('manifest.mpd'),
0384932e 3218 video_id, mpd_id='dash', fatal=False))
6ad02195 3219 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3220 if 'smil' not in skip_protocols:
3221 rtmp_formats = self._extract_smil_formats(
044eeb14 3222 manifest_url('jwplayer.smil'),
6ad02195
RA
3223 video_id, fatal=False)
3224 for rtmp_format in rtmp_formats:
3225 rtsp_format = rtmp_format.copy()
3226 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3227 del rtsp_format['play_path']
3228 del rtsp_format['ext']
3229 rtsp_format.update({
3230 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3231 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3232 'protocol': 'rtsp',
3233 })
3234 formats.extend([rtmp_format, rtsp_format])
3235 else:
3236 for protocol in ('rtmp', 'rtsp'):
3237 if protocol not in skip_protocols:
3238 formats.append({
86e5f3ed 3239 'url': f'{protocol}:{url_base}',
6ad02195
RA
3240 'format_id': protocol,
3241 'protocol': protocol,
3242 })
3243 return formats
3244
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Return the options passed to jwplayer(...).setup(...) in webpage as a dict.

    Returns None when no setup call is found, when its options cannot be
    parsed, or when they do not parse into a dict.
    """
    setup_call = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if not setup_call:
        return None
    try:
        options = self._parse_json(
            setup_call.group('options'), video_id=video_id, transform_source=transform_source)
    except ExtractorError:
        return None
    if isinstance(options, dict):
        return options
    return None
a4a554a7
YCH
3259
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup options in webpage and parse them.

    Extra arguments are forwarded to _parse_jwplayer_data, whose result
    is returned.
    """
    setup_options = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(setup_options, video_id, *args, **kwargs)
3265
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer setup options into info dict(s).

    Returns an empty list when jwplayer_data is not a dict, the single
    entry when exactly one playlist item was parsed, and
    self.playlist_result(entries) otherwise. With require_title set, a
    playlist item without 'title' raises KeyError; so does an item without
    'mediaid' when video_id is not given.
    """
    entries = []
    if not isinstance(jwplayer_data, dict):
        return entries

    playlist_items = jwplayer_data.get('playlist')
    # JWPlayer backward compatibility: single playlist item/flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if not isinstance(playlist_items, list):
        playlist_items = (playlist_items or jwplayer_data, )

    for video_data in playlist_items:
        if not isinstance(video_data, dict):
            continue
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        # NOTE: this mutates the caller's dict, making the item its own only source
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                if not isinstance(track, dict):
                    continue
                track_kind = track.get('kind')
                if not track_kind or not isinstance(track_kind, str):
                    continue
                if track_kind.lower() not in ('captions', 'subtitles'):
                    continue
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                # Tracks without a label are filed under 'en'
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                })

        entry = {
            'id': this_video_id,
            'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
            'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
            'genre': clean_html(video_data.get('genre')),
            'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
            'release_year': int_or_none(video_data.get('releasedate')),
            'age_limit': int_or_none(video_data.get('age_restriction')),
        }
        # A single YouTube URL is delegated as a url_transparent entry instead of being used as a format
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            entry['formats'] = formats
        entries.append(entry)
    if len(entries) == 1:
        return entries[0]
    else:
        return self.playlist_result(entries)
3340
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert a list of JWPlayer "sources" into formats.

    Entries that are not dicts, have no usable 'file' URL or repeat an
    already seen URL are skipped. HLS, DASH and SMIL sources are expanded
    through the respective manifest extractors (non-fatally); all other
    sources become a single format each. rtmp_params, when given, is
    merged into every RTMP format.
    """
    urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in urls:
            continue
        urls.add(source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
        elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        elif source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
        else:
            format_id = str_or_none(source.get('label'))
            height = int_or_none(source.get('height'))
            if height is None and format_id:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = parse_resolution(format_id).get('height')
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate'), scale=1000),
                'filesize': int_or_none(source.get('filesize')),
                'ext': ext,
                'format_id': format_id
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                # maxsplit is passed by keyword: passing it positionally
                # is deprecated since Python 3.13
                rtmp_url_parts = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                if len(rtmp_url_parts) == 3:
                    rtmp_url, prefix, play_path = rtmp_url_parts
                    a_format.update({
                        'url': rtmp_url,
                        'play_path': prefix + play_path,
                    })
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
    return formats
3405
f4b1c7ad 3406 def _live_title(self, name):
39ca3b5c 3407 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3408 return name
f4b1c7ad 3409
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Parse v with int_or_none (kwargs are forwarded).

    On failure, raise ExtractorError if fatal; otherwise report a warning
    and return None.
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3419
def _float(self, v, name, fatal=False, **kwargs):
    """Parse v with float_or_none (kwargs are forwarded).

    On failure, raise ExtractorError if fatal; otherwise report a warning
    and return None.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3429
40e41780
TF
3430 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3431 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3432 cookie = http.cookiejar.Cookie(
4ed2d7b7 3433 0, name, value, port, port is not None, domain, True,
40e41780
TF
3434 domain.startswith('.'), path, True, secure, expire_time,
3435 discard, None, None, rest)
9809740b 3436 self.cookiejar.set_cookie(cookie)
42939b61 3437
def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    calculated = self._downloader._calc_cookies(url)
    return LenientSimpleCookie(calculated)
799207e8 3441
e3c1266f 3442 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3443 """
3444 Apply first Set-Cookie header instead of the last. Experimental.
3445
3446 Some sites (e.g. [1-3]) may serve two cookies under the same name
3447 in Set-Cookie header and expect the first (old) one to be set rather
3448 than second (new). However, as of RFC6265 the newer one cookie
3449 should be set into cookie store what actually happens.
3450 We will workaround this issue by resetting the cookie to
3451 the first one manually.
3452 1. https://new.vk.com/
3453 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3454 3. https://learning.oreilly.com/
3455 """
e3c1266f
S
3456 for header, cookies in url_handle.headers.items():
3457 if header.lower() != 'set-cookie':
3458 continue
cfb0511d 3459 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3460 cookie_value = re.search(
3461 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3462 if cookie_value:
3463 value, domain = cookie_value.groups()
3464 self._set_cookie(domain, cookie, value)
3465 break
3466
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the test cases declared directly on this class, then those of cls.__wrapped__ (if any).

    Each yielded dict gets its 'name' set to cls.ie_key(). Tests marked
    'only_matching' are skipped unless include_onlymatching is set.
    """
    # Do not look in super classes
    own_attributes = vars(cls)
    single_test = own_attributes.get('_TEST')
    if single_test:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        candidates = [single_test]
    else:
        candidates = own_attributes.get('_TESTS', [])

    for testcase in candidates:
        if testcase.get('only_matching', False) and not include_onlymatching:
            continue
        testcase['name'] = cls.ie_key()
        yield testcase

    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from wrapped.get_testcases(include_onlymatching)
05900629 3483
@classmethod
def get_webpage_testcases(cls):
    """Yield the _WEBPAGE_TESTS declared directly on this class, then those of cls.__wrapped__ (if any).

    Each yielded dict gets its 'name' set to cls.ie_key().
    """
    for testcase in vars(cls).get('_WEBPAGE_TESTS', []):
        testcase['name'] = cls.ie_key()
        yield testcase
    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from wrapped.get_webpage_testcases()
f2e8dbcc 3492
@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases"""
    testcases = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # Looks at the info_dict of each test and of its first playlist item
    age_limits = traverse_obj(
        testcases, (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
    return max(age_limits or [0])
3499
@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    def is_playlist_test(test):
        return any(key.startswith('playlist') for key in test)

    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None
    if not any(map(is_playlist_test, tests)):
        return 'video'
    if all(map(is_playlist_test, tests)):
        return 'playlist'
    return 'any'
3511
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    assert cls.suitable(url), 'The URL must be suitable for the extractor'
    return_type = cls._RETURN_TYPE
    if return_type == 'video':
        return True
    if return_type == 'playlist':
        return False
    return None
3517
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
05900629 3522
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    details = []
    netrc_machine = cls._NETRC_MACHINE
    if netrc_machine:
        if markdown:
            details.append(f' [<abbr title="netrc machine"><em>{netrc_machine}</em></abbr>]')
        else:
            details.append(f' [{netrc_machine}]')

    if cls.IE_DESC is False:
        details.append(' [HIDDEN]')
    elif cls.IE_DESC:
        details.append(f' {cls.IE_DESC}')

    if cls.SEARCH_KEY:
        separator = ';' if cls.IE_DESC else ''
        details.append(f'{separator} "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            count = random.choice(_COUNTS)
            example = random.choice(search_examples)
            details.append(f' (e.g. "{cls.SEARCH_KEY}{count}:{example}")')

    if not cls.working():
        details.append(' (**Currently broken**)' if markdown else ' (Currently broken)')

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME

    desc = ''.join(details)
    if not desc:
        return name
    return f'{name}:{desc}'
3547
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(...) if subtitles were requested, else {}.

    Subtitles count as requested when either the 'writesubtitles' or the
    'listsubtitles' param is set.
    """
    requested = (self.get_param('writesubtitles', False)
                 or self.get_param('listsubtitles'))
    if not requested:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3553
3554 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3555 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3556
0cf643b2
M
class CommentsDisabled(Exception):
    """Raise in _get_comments if comments are disabled for the video

    extract_comments catches this and reports both 'comments' and
    'comment_count' as None.
    """
3559
def extract_comments(self, *args, **kwargs):
    """Return a callable that collects comments lazily, or None if comments were not requested.

    The arguments are forwarded to self._get_comments, whose result is
    consumed with next() when the returned callable is invoked. That
    callable returns a dict with 'comments' and 'comment_count';
    'comment_count' is None unless the generator was exhausted normally.
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        comments = []
        # Assume an incomplete extraction until the generator is exhausted
        interrupted = True
        try:
            while True:
                comments.append(next(generator))
        except StopIteration:
            interrupted = False
        except KeyboardInterrupt:
            # Keep the comments gathered so far
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            # Only swallow the error when 'ignoreerrors' is exactly True
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        comment_count = len(comments)
        self.to_screen(f'Extracted {comment_count} comments')
        return {
            'comments': comments,
            'comment_count': None if interrupted else comment_count
        }
    return extractor
3588
3589 def _get_comments(self, *args, **kwargs):
3590 raise NotImplementedError('This method must be implemented by subclasses')
3591
912e0b7e
YCH
3592 @staticmethod
3593 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3594 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3595 will be dropped. """
86e5f3ed 3596 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3597 ret = list(subtitle_list1)
a44ca5a4 3598 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3599 return ret
3600
3601 @classmethod
46890374 3602 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3603 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3604 if target is None:
3605 target = {}
3606 for d in dicts:
3607 for lang, subs in d.items():
3608 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3609 return target
912e0b7e 3610
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(...) if automatic captions were requested, else {}.

    They count as requested when either the 'writeautomaticsub' or the
    'listsubtitles' param is set.
    """
    requested = (self.get_param('writeautomaticsub', False)
                 or self.get_param('listsubtitles'))
    if not requested:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3616
3617 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3618 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3619
2762dbb1 3620 @functools.cached_property
24146491 3621 def _cookies_passed(self):
3622 """Whether cookies have been passed to YoutubeDL"""
3623 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3624
def mark_watched(self, *args, **kwargs):
    """Call self._mark_watched(...) when the 'mark_watched' param is set and credentials are available.

    Credentials are available when the extractor supports login and login
    info is present, or when cookies have been passed.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3630
3631 def _mark_watched(self, *args, **kwargs):
3632 raise NotImplementedError('This method must be implemented by subclasses')
3633
38cce791
YCH
def geo_verification_headers(self):
    """Return a headers dict carrying the 'geo_verification_proxy' param as 'Ytdl-request-proxy'.

    The dict is empty when the param is not set.
    """
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3640
8f97a15d 3641 @staticmethod
3642 def _generic_id(url):
14f25df2 3643 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3644
def _generic_title(self, url='', webpage='', *, default=None):
    """Pick a title: OpenGraph title, then HTML title, then the unquoted base name of url, then default."""
    title = self._og_search_title(webpage, default=None)
    if not title:
        title = self._html_extract_title(webpage, default=None)
    if not title:
        title = urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
    return title or default
98763ee3 3650
c224251a 3651 @staticmethod
b0089e89 3652 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3653 all_known = all(map(
3654 lambda x: x is not None,
3655 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3656 return (
3657 'private' if is_private
3658 else 'premium_only' if needs_premium
3659 else 'subscriber_only' if needs_subscription
3660 else 'needs_auth' if needs_auth
3661 else 'unlisted' if is_unlisted
3662 else 'public' if all_known
3663 else None)
3664
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of the values given for the extractor argument "key",
                        or "default" when the lookup yields nothing
    @param default      Value returned when the key is not present (default: [])
    @param ie_key       Extractor key as a string, or an object providing ie_key();
                        this extractor is used when it is not given
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is not None:
        return list(values) if casesense else [value.lower() for value in values]
    if default is NO_DEFAULT:
        return []
    return default
5d3a0e79 3677
f40ee5e9 3678 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3679 if not playlist_id or not video_id:
3680 return not video_id
3681
3682 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3683 if no_playlist is not None:
3684 return not no_playlist
3685
3686 video_id = '' if video_id is True else f' {video_id}'
3687 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3688 if self.get_param('noplaylist'):
3689 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3690 return False
3691 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3692 return True
3693
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Forward ``err`` to ``RetryManager.report_retry`` using this extractor's reporters.

    When ``_count`` is falsy, ``int(fatal)`` is passed as the attempt count.
    ``report_warning`` is supplied as the error callback only when ``fatal``
    is false; otherwise no error callback (None) is passed.
    """
    attempt = _count or int(fatal)
    error_callback = None if fatal else self.report_warning
    sleep_func = self.get_param('retry_sleep_functions', {}).get('extractor')
    RetryManager.report_retry(
        err, attempt, _retries,
        info=self.to_screen, warn=self.report_warning, error=error_callback,
        sleep_func=sleep_func)
be5c1ae8 3699
def RetryManager(self, **kwargs):
    """Build a RetryManager using the 'extractor_retries' parameter (default 3).

    ``_error_or_warning`` is passed as the second argument and ``kwargs`` are
    forwarded unchanged.
    """
    # Inside a method body the name below resolves to the module-level
    # RetryManager, not to this method of the same name.
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3702
def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Delegate embed extraction for ``url`` to the 'Generic' extractor.

    ``note`` is printed first, prefixed using the 'display_id' (or 'id') of
    ``info_dict``. The URL is smuggled with this extractor's key listed under
    'block_ies'. ``info_dict`` is only read here, never modified.
    """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    prefix = format_field(display_id, None, "%s: ")
    self.to_screen(f'{prefix}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    smuggled = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return generic_ie._extract_embeds(smuggled, *args, **kwargs)
3708
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of ``_extract_from_webpage`` after adding default extra info.

    The class itself is used when ``_extract_from_webpage`` is a bound method
    on the class; otherwise an extractor instance is obtained from ``ydl``.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
8f97a15d 3717
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a ``url_result`` for each embed URL from ``_extract_embed_urls``.

    The URLs are passed through ``orderedSet`` first. The result is tied to
    this extractor unless ``_VALID_URL`` is False.
    """
    embed_urls = orderedSet(cls._extract_embed_urls(url, webpage) or [], lazy=True)
    for embed_url in embed_urls:
        target_ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, target_ie)
8f97a15d 3723
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile the patterns once per class; looking in the class's own namespace
    # keeps a subclass from reusing the compiled patterns of its parent.
    if '_EMBED_URL_RE' not in vars(cls):
        patterns = cls._EMBED_REGEX
        assert isinstance(patterns, (list, tuple))
        for position, pattern in enumerate(patterns):
            assert pattern.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{position}] must have exactly 1 url group\n\t{pattern}'
        cls._EMBED_URL_RE = tuple(re.compile(pattern) for pattern in patterns)

    for compiled in cls._EMBED_URL_RE:
        for match in compiled.finditer(webpage):
            # Resolve the (HTML-unescaped) match relative to the page URL.
            candidate = urllib.parse.urljoin(url, unescapeHTML(match.group('url')))
            if cls._VALID_URL is False or cls.suitable(candidate):
                yield candidate
3739
class StopExtraction(Exception):
    """Plain Exception subclass; presumably raised to abort an extraction early (confirm against callers)."""
3742
@classmethod
def _extract_url(cls, webpage):  # TODO: Remove
    """Only for compatibility with some older extractors"""
    # Return the first embed URL found on the page, or None when there is none.
    for embed_url in cls._extract_embed_urls(None, webpage) or []:
        return embed_url
    return None
3747
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Register a subclass created with ``plugin_name`` as an override of its parent.

    The subclass remembers its direct parent in ``__wrapped__``, takes over
    the parent's ``ie_key`` and gets an ``IE_NAME`` of the form
    '<parent IE_NAME>+<plugin_name>'. It then replaces the innermost wrapped
    class in that class's module and is recorded in ``_PLUGIN_OVERRIDES``.
    """
    if plugin_name:
        resolution_order = inspect.getmro(cls)
        parent = resolution_order[resolution_order.index(cls) + 1]
        cls.__wrapped__ = parent
        cls.PLUGIN_NAME = plugin_name
        cls.ie_key = parent.ie_key
        cls.IE_NAME = f'{parent.IE_NAME}+{plugin_name}'
        # Follow the chain of earlier overrides down to the class that was
        # originally defined.
        original = parent
        while getattr(original, '__wrapped__', None):
            original = original.__wrapped__
        owner_module = sys.modules[original.__module__]
        setattr(owner_module, original.__name__, cls)
        _PLUGIN_OVERRIDES[original].append(cls)

    return super().__init_subclass__(**kwargs)
3761
8dbe9899 3762
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|N):{query}, where N is a
    positive integer giving the number of results to fetch.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the search prefix and fetch the matching number of results.

        An empty prefix requests one result, 'all' requests ``_MAX_RESULTS``,
        and a number requests that many (capped at ``_MAX_RESULTS`` with a
        warning).
        """
        match = self._match_valid_url(query)
        prefix, query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        found = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper bound".
        stop = None if n == float('inf') else n
        limited = itertools.islice(found, 0, stop)
        return self.playlist_result(limited, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
fe7866d0 3806
3807
class UnsupportedURLIE(InfoExtractor):
    # The pattern matches any URL; extraction always fails with UnsupportedError.
    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        """Always raise UnsupportedError for ``url``."""
        raise UnsupportedError(url)
e756f45b
M
3815
3816
3817_PLUGIN_OVERRIDES = collections.defaultdict(list)