]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[cookies] Move `YoutubeDLCookieJar` to cookies module (#7091)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
2314b4d8 8import inspect
cc16383f 9import itertools
3d3538e4 10import json
f8271158 11import math
4094b6e3 12import netrc
d6983cb4 13import os
773f291d 14import random
6929b41a 15import re
d6983cb4 16import sys
4094b6e3 17import time
8f97a15d 18import types
14f25df2 19import urllib.parse
ac668111 20import urllib.request
f8271158 21import xml.etree.ElementTree
d6983cb4 22
6929b41a 23from ..compat import functools # isort: split
14f25df2 24from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
8817a80d 25from ..cookies import LenientSimpleCookie
f8271158 26from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 27from ..utils import (
8f97a15d 28 IDENTITY,
f8271158 29 JSON_LD_RE,
30 NO_DEFAULT,
31 ExtractorError,
d0d74b71 32 FormatSorter,
f8271158 33 GeoRestrictedError,
34 GeoUtils,
cb73b846 35 HEADRequest,
b7c47b74 36 LenientJSONDecoder,
f8271158 37 RegexNotFoundError,
be5c1ae8 38 RetryManager,
f8271158 39 UnsupportedError,
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
82d02080 43 classproperty,
d6983cb4 44 clean_html,
d0d74b71 45 deprecation_warning,
70f0f5a8 46 determine_ext,
d493f15c 47 dict_get,
42676437 48 encode_data_uri,
9b9c5355 49 error_to_compat_str,
46b18f23 50 extract_attributes,
90137ca4 51 filter_dict,
97f4aecf 52 fix_xml_ampersands,
b14f3a4c 53 float_or_none,
b868936c 54 format_field,
31bb8d3f 55 int_or_none,
34921b43 56 join_nonempty,
a4a554a7 57 js_to_json,
46b18f23 58 mimetype2ext,
3158150c 59 network_exceptions,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
46b18f23
JH
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
46b18f23 67 sanitize_filename,
8f97a15d 68 sanitize_url,
b868936c 69 sanitized_Request,
ade1fa70 70 smuggle_url,
d493f15c 71 str_or_none,
ce5b9040 72 str_to_int,
f856816b 73 strip_or_none,
5d3a0e79 74 traverse_obj,
71df9b7f 75 truncate_string,
47046464 76 try_call,
ffa89477 77 try_get,
f38de77f 78 unescapeHTML,
647eab45 79 unified_strdate,
6b3a3098 80 unified_timestamp,
46b18f23 81 update_Request,
09d02ea4 82 update_url_query,
a107193e 83 url_basename,
bebef109 84 url_or_none,
7e68567e 85 urlhandle_detect_ext,
b868936c 86 urljoin,
6606817a 87 variadic,
a6571f10 88 xpath_element,
8d6765cf
S
89 xpath_text,
90 xpath_with_ns,
d6983cb4 91)
c342041f 92
d6983cb4 93
86e5f3ed 94class InfoExtractor:
d6983cb4
PH
95 """Information Extractor class.
96
97 Information extractors are the classes that, given a URL, extract
98 information about the video (or videos) the URL refers to. This
99 information includes the real video URL, the video title, author and
100 others. The information is stored in a dictionary which is then
5d380852 101 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
102 information possibly downloading the video to the file system, among
103 other possible outcomes.
104
cf0649f8 105 The type field determines the type of the result.
fed5d032
PH
106 By far the most common value (and the default if _type is missing) is
107 "video", which indicates a single video.
108
109 For a video, the dictionaries must include the following fields:
d6983cb4
PH
110
111 id: Video identifier.
d4736fdb 112 title: Video title, unescaped. Set to an empty string if video has
113 no title as opposed to "None" which signifies that the
114 extractor failed to obtain a title
d67b0b15 115
f49d89ee 116 Additionally, it must contain either a formats entry or a url one:
d67b0b15 117
f49d89ee
PH
118 formats: A list of dictionaries for each format available, ordered
119 from worst to best quality.
120
121 Potential fields:
c790e93a
S
122 * url The mandatory URL representing the media:
123 for plain file media - HTTP URL of this file,
124 for RTMP - RTMP URL,
125 for HLS - URL of the M3U8 media playlist,
126 for HDS - URL of the F4M manifest,
79d2077e
S
127 for DASH
128 - HTTP URL to plain file media (in case of
129 unfragmented media)
130 - URL of the MPD manifest or base URL
131 representing the media if MPD manifest
8ed7a233 132 is parsed from a string (in case of
79d2077e 133 fragmented media)
c790e93a 134 for MSS - URL of the ISM manifest.
f34804b2 135 * request_data Data to send in POST request to the URL
86f4d14f
S
136 * manifest_url
137 The URL of the manifest file in case of
c790e93a
S
138 fragmented media:
139 for HLS - URL of the M3U8 master playlist,
140 for HDS - URL of the F4M manifest,
141 for DASH - URL of the MPD manifest,
142 for MSS - URL of the ISM manifest.
a44ca5a4 143 * manifest_stream_number (For internal use only)
144 The index of the stream in the manifest file
10952eb2 145 * ext Will be calculated from URL if missing
d67b0b15
PH
146 * format A human-readable description of the format
147 ("mp4 container with h264/opus").
148 Calculated from the format_id, width, height,
149 and format_note fields if missing.
150 * format_id A short description of the format
5d4f3985
PH
151 ("mp4_h264_opus" or "19").
152 Technically optional, but strongly recommended.
d67b0b15
PH
153 * format_note Additional info about the format
154 ("3D" or "DASH video")
155 * width Width of the video, if known
156 * height Height of the video, if known
105bfd90 157 * aspect_ratio Aspect ratio of the video, if known
158 Automatically calculated from width and height
f49d89ee 159 * resolution Textual description of width and height
105bfd90 160 Automatically calculated from width and height
176f1866 161 * dynamic_range The dynamic range of the video. One of:
162 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 163 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
164 * abr Average audio bitrate in KBit/s
165 * acodec Name of the audio codec in use
dd27fd17 166 * asr Audio sampling rate in Hertz
b8ed0f15 167 * audio_channels Number of audio channels
d67b0b15 168 * vbr Average video bitrate in KBit/s
fbb21cf5 169 * fps Frame rate
d67b0b15 170 * vcodec Name of the video codec in use
1394ce65 171 * container Name of the container format
d67b0b15 172 * filesize The number of bytes, if known in advance
9732d77e 173 * filesize_approx An estimate for the number of bytes
d67b0b15 174 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 175 * protocol The protocol that will be used for the actual
adbc4ec4
THD
176 download, lower-case. One of "http", "https" or
177 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
178 * fragment_base_url
179 Base URL for fragments. Each fragment's path
180 value (if present) will be relative to
181 this URL.
182 * fragments A list of fragments of a fragmented media.
183 Each fragment entry must contain either an url
184 or a path. If an url is present it should be
185 considered by a client. Otherwise both path and
186 fragment_base_url must be present. Here is
187 the list of all potential fields:
188 * "url" - fragment's URL
189 * "path" - fragment's path relative to
190 fragment_base_url
a0d5077c
S
191 * "duration" (optional, int or float)
192 * "filesize" (optional, int)
adbc4ec4
THD
193 * is_from_start Is a live format that can be downloaded
194 from the start. Boolean
f49d89ee 195 * preference Order number of this format. If this field is
08d13955 196 present and not None, the formats get sorted
38d63d84 197 by this field, regardless of all other values.
f49d89ee
PH
198 -1 for default (order by other properties),
199 -2 or smaller for less than default.
e65566a9
PH
200 < -1000 to hide the format (if there is
201 another one which is strictly better)
32f90364
PH
202 * language Language code, e.g. "de" or "en-US".
203 * language_preference Is this in the language mentioned in
204 the URL?
aff2f4f4
PH
205 10 if it's what the URL is about,
206 -1 for default (don't know),
207 -10 otherwise, other values reserved for now.
5d73273f
PH
208 * quality Order number of the video quality of this
209 format, irrespective of the file format.
210 -1 for default (order by other properties),
211 -2 or smaller for less than default.
c64ed2a3
PH
212 * source_preference Order number for this video source
213 (quality takes higher priority)
214 -1 for default (order by other properties),
215 -2 or smaller for less than default.
d769be6c
PH
216 * http_headers A dictionary of additional HTTP headers
217 to add to the request.
6271f1ca 218 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
219 video's pixels are not square.
220 width : height ratio as float.
221 * no_resume The server does not support resuming the
222 (HTTP or RTMP) download. Boolean.
88acdbc2 223 * has_drm The format has DRM and cannot be downloaded. Boolean
7e68567e 224 * extra_param_to_segment_url A query string to append to each
225 fragment's URL, or to update each existing query string
226 with. Only applied by the native HLS/DASH downloaders.
227 * hls_aes A dictionary of HLS AES-128 decryption information
228 used by the native HLS downloader to override the
229 values in the media playlist when an '#EXT-X-KEY' tag
230 is present in the playlist:
231 * uri The URI from which the key will be downloaded
232 * key The key (as hex) used to decrypt fragments.
233 If `key` is given, any key URI will be ignored
234 * iv The IV (as hex) used to decrypt fragments
0a5a191a 235 * downloader_options A dictionary of downloader options
236 (For internal use only)
237 * http_chunk_size Chunk size for HTTP downloads
238 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 239 RTMP formats can also have the additional fields: page_url,
240 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
241 rtmp_protocol, rtmp_real_time
3dee7826 242
c0ba0f48 243 url: Final video URL.
d6983cb4 244 ext: Video filename extension.
d67b0b15
PH
245 format: The video format, defaults to ext (used for --get-format)
246 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 247
d6983cb4
PH
248 The following fields are optional:
249
08d30158 250 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 251 alt_title: A secondary title of the video.
0afef30b
PH
252 display_id: An alternative identifier for the video, not necessarily
253 unique, but available before title. Typically, id is
254 something like "4234987", title "Dancing naked mole rats",
255 and display_id "dancing-naked-mole-rats"
d5519808 256 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 257 * "id" (optional, string) - Thumbnail format ID
d5519808 258 * "url"
cfb56d1a 259 * "preference" (optional, int) - quality of the image
d5519808
PH
260 * "width" (optional, int)
261 * "height" (optional, int)
5e1c39ac 262 * "resolution" (optional, string "{width}x{height}",
d5519808 263 deprecated)
2de624fd 264 * "filesize" (optional, int)
297e9952 265 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 266 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 267 description: Full video description.
d6983cb4 268 uploader: Full name of the video uploader.
2bc0c46f 269 license: License name the video is licensed under.
8a92e51c 270 creator: The creator of the video.
10db0d2f 271 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 272 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 273 If not explicitly set, calculated from timestamp
274 release_timestamp: UNIX timestamp of the moment the video was released.
275 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 276 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 277 If not explicitly set, calculated from release_timestamp
278 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 279 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 280 If not explicitly set, calculated from modified_timestamp
d6983cb4 281 uploader_id: Nickname or id of the video uploader.
7bcd2830 282 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 283 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 284 Note that channel fields may or may not repeat uploader
6f1f59f3
S
285 fields. This depends on a particular extractor.
286 channel_id: Id of the channel.
287 channel_url: Full URL to a channel webpage.
6c73052c 288 channel_follower_count: Number of followers of the channel.
da9ec3b9 289 location: Physical location where the video was filmed.
a504ced0 290 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
291 {tag: subformats}. "tag" is usually a language code, and
292 "subformats" is a list sorted from lower to higher
293 preference, each element is a dictionary with the "ext"
294 entry and one of:
a504ced0 295 * "data": The subtitles file contents
10952eb2 296 * "url": A URL pointing to the subtitles file
2412044c 297 It can optionally also have:
298 * "name": Name or description of the subtitles
08d30158 299 * "http_headers": A dictionary of additional HTTP headers
297e9952 300 to add to the request.
4bba3716 301 "ext" will be calculated from URL if missing
e167860c 302 automatic_captions: Like 'subtitles'; contains automatically generated
303 captions instead of normal subtitles
62d231c0 304 duration: Length of the video in seconds, as an integer or float.
f3d29461 305 view_count: How many users have watched the video on the platform.
867c66ff 306 concurrent_view_count: How many users are currently watching the video on the platform.
19e3dfc9
PH
307 like_count: Number of positive ratings of the video
308 dislike_count: Number of negative ratings of the video
02835c6b 309 repost_count: Number of reposts of the video
2d30521a 310 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 311 comment_count: Number of comments on the video
dd622d7c
PH
312 comments: A list of comments, each with one or more of the following
313 properties (all but one of text or html optional):
314 * "author" - human-readable name of the comment author
315 * "author_id" - user ID of the comment author
a1c5d2ca 316 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
317 * "id" - Comment ID
318 * "html" - Comment as HTML
319 * "text" - Plain text of the comment
320 * "timestamp" - UNIX timestamp of comment
321 * "parent" - ID of the comment this one is replying to.
322 Set to "root" to indicate that this is a
323 comment to the original video.
a1c5d2ca
M
324 * "like_count" - Number of positive ratings of the comment
325 * "dislike_count" - Number of negative ratings of the comment
326 * "is_favorited" - Whether the comment is marked as
327 favorite by the video uploader
328 * "author_is_uploader" - Whether the comment is made by
329 the video uploader
8dbe9899 330 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 331 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
332 should allow to get the same result again. (It will be set
333 by YoutubeDL if it's missing)
ad3bc6ac
PH
334 categories: A list of categories that the video falls in, for example
335 ["Sports", "Berlin"]
864f24bd 336 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 337 cast: A list of the video cast
7267bd53
PH
338 is_live: True, False, or None (=unknown). Whether this video is a
339 live stream that goes on instead of a fixed-length video.
f76ede8e 340 was_live: True, False, or None (=unknown). Whether this video was
341 originally a live stream.
0647d925 342 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
e325a21a 343 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 344 If absent, automatically set from is_live, was_live
7c80519c 345 start_time: Time in seconds where the reproduction should start, as
10952eb2 346 specified in the URL.
297a564b 347 end_time: Time in seconds where the reproduction should end, as
10952eb2 348 specified in the URL.
55949fed 349 chapters: A list of dictionaries, with the following entries:
350 * "start_time" - The start time of the chapter in seconds
351 * "end_time" - The end time of the chapter in seconds
352 * "title" (optional, string)
5caf30db
A
353 heatmap: A list of dictionaries, with the following entries:
354 * "start_time" - The start time of the data point in seconds
355 * "end_time" - The end time of the data point in seconds
356 * "value" - The normalized value of the data point (float between 0 and 1)
6cfda058 357 playable_in_embed: Whether this video is allowed to play in embedded
358 players on other sites. Can be True (=always allowed),
359 False (=never allowed), None (=unknown), or a string
62b58c09 360 specifying the criteria for embedability; e.g. 'whitelist'
c224251a
M
361 availability: Under what condition the video is available. One of
362 'private', 'premium_only', 'subscriber_only', 'needs_auth',
363 'unlisted' or 'public'. Use 'InfoExtractor._availability'
364 to set it
1e8fe57e 365 _old_archive_ids: A list of old archive ids needed for backward compatibility
784320c9 366 _format_sort_fields: A list of fields to use for sorting formats
277d6ff5 367 __post_extractor: A function to be called just before the metadata is
368 written to either disk, logger or console. The function
369 must return a dict which will be added to the info_dict.
370 This is useful for additional information that is
371 time-consuming to extract. Note that the fields thus
372 extracted will not be available to output template and
373 match_filter. So, only "comments" and "comment_count" are
374 currently allowed to be extracted via this method.
d6983cb4 375
7109903e
S
376 The following fields should only be used when the video belongs to some logical
377 chapter or section:
378
379 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
380 chapter_number: Number of the chapter the video belongs to, as an integer.
381 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
382
383 The following fields should only be used when the video is an episode of some
8d76bdf1 384 series, programme or podcast:
7109903e
S
385
386 series: Title of the series or programme the video episode belongs to.
9ac24e23 387 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 388 season: Title of the season the video episode belongs to.
27bfd4e5
S
389 season_number: Number of the season the video episode belongs to, as an integer.
390 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
391 episode: Title of the video episode. Unlike mandatory video title field,
392 this field should denote the exact title of the video episode
393 without any kind of decoration.
27bfd4e5
S
394 episode_number: Number of the video episode within a season, as an integer.
395 episode_id: Id of the video episode, as a unicode string.
7109903e 396
7a93ab5f
S
397 The following fields should only be used when the media is a track or a part of
398 a music album:
399
400 track: Title of the track.
401 track_number: Number of the track within an album or a disc, as an integer.
402 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
403 as a unicode string.
404 artist: Artist(s) of the track.
405 genre: Genre(s) of the track.
406 album: Title of the album the track belongs to.
407 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
408 album_artist: List of all artists appeared on the album (e.g.
409 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
410 and compilations).
411 disc_number: Number of the disc or other physical medium the track belongs to,
412 as an integer.
413 release_year: Year (YYYY) when the album was released.
8bcd4048 414 composer: Composer of the piece
7a93ab5f 415
3975b4d2 416 The following fields should only be set for clips that should be cut from the original video:
417
418 section_start: Start time of the section in seconds
419 section_end: End time of the section in seconds
420
45e8a04e 421 The following fields should only be set for storyboards:
422 rows: Number of rows in each storyboard fragment, as an integer
423 columns: Number of columns in each storyboard fragment, as an integer
424
deefc05b 425 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 426
d838b1bd
PH
427 Unless mentioned otherwise, None is equivalent to absence of information.
428
fed5d032
PH
429
430 _type "playlist" indicates multiple videos.
b82f815f
PH
431 There must be a key "entries", which is a list, an iterable, or a PagedList
432 object, each element of which is a valid dictionary by this specification.
fed5d032 433
962ffcf8 434 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 435 attributes with the same semantics as videos (see above).
fed5d032 436
f0d785d3 437 It can also have the following optional fields:
438
439 playlist_count: The total number of videos in a playlist. If not given,
440 YoutubeDL tries to calculate it from "entries"
441
fed5d032
PH
442
443 _type "multi_video" indicates that there are multiple videos that
444 form a single show, for example multiple acts of an opera or TV episode.
445 It must have an entries key like a playlist and contain all the keys
446 required for a video at the same time.
447
448
449 _type "url" indicates that the video must be extracted from another
450 location, possibly by a different extractor. Its only required key is:
451 "url" - the next URL to extract.
f58766ce
PH
452 The key "ie_key" can be set to the class name (minus the trailing "IE",
453 e.g. "Youtube") if the extractor class is known in advance.
454 Additionally, the dictionary may have any properties of the resolved entity
455 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
456 known ahead of time.
457
458
459 _type "url_transparent" entities have the same specification as "url", but
460 indicate that the given additional information is more precise than the one
461 associated with the resolved URL.
462 This is useful when a site employs a video service that hosts the video and
463 its technical metadata, but that video service does not embed a useful
464 title, description etc.
465
466
8f97a15d 467 Subclasses of this should also be added to the list of extractors and
468 should define a _VALID_URL regexp, and re-define the _real_extract() and
469 (optionally) _real_initialize() methods.
d6983cb4 470
e6f21b3d 471 Subclasses may also override suitable() if necessary, but ensure the function
472 signature is preserved and that this function imports everything it needs
52efa4b3 473 (except other extractors), so that lazy_extractors works correctly.
474
8f97a15d 475 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
476 the HTML of Generic webpages. It may also override _extract_embed_urls
477 or _extract_from_webpage as necessary. While these are normally classmethods,
478 _extract_from_webpage is allowed to be an instance method.
479
480 _extract_from_webpage may raise self.StopExtraction() to stop further
481 processing of the webpage and obtain exclusive rights to it. This is useful
62b58c09
L
482 when the extractor cannot reliably be matched using just the URL,
483 e.g. invidious/peertube instances
8f97a15d 484
485 Embed-only extractors can be defined by setting _VALID_URL = False.
486
52efa4b3 487 To support username + password (or netrc) login, the extractor must define a
488 _NETRC_MACHINE and re-define _perform_login(username, password) and
489 (optionally) _initialize_pre_login() methods. The _perform_login method will
490 be called between _initialize_pre_login and _real_initialize if credentials
491 are passed by the user. In cases where it is necessary to have the login
492 process as part of the extraction rather than initialization, _perform_login
493 can be left undefined.
e6f21b3d 494
4248dad9 495 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
496 geo restriction bypass mechanisms for a particular extractor.
497 Though it won't disable explicit geo restriction bypass based on
504f20dd 498 country code provided with geo_bypass_country.
4248dad9
S
499
500 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
501 countries for this extractor. One of these countries will be used by
502 geo restriction bypass mechanism right away in order to bypass
504f20dd 503 geo restriction, of course, if the mechanism is not disabled.
773f291d 504
5f95927a
S
505 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
506 IP blocks in CIDR notation for this extractor. One of these IP blocks
507 will be used by geo restriction bypass mechanism similarly
504f20dd 508 to _GEO_COUNTRIES.
3ccdde8c 509
fe7866d0 510 The _ENABLED attribute should be set to False for IEs that
511 are disabled by default and must be explicitly enabled.
512
e6f21b3d 513 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
514 in order to warn the users and skip the tests.
515 """
516
517 _ready = False
518 _downloader = None
773f291d 519 _x_forwarded_for_ip = None
4248dad9
S
520 _GEO_BYPASS = True
521 _GEO_COUNTRIES = None
5f95927a 522 _GEO_IP_BLOCKS = None
d6983cb4 523 _WORKING = True
fe7866d0 524 _ENABLED = True
52efa4b3 525 _NETRC_MACHINE = None
231025c4 526 IE_DESC = None
8dcce6a8 527 SEARCH_KEY = None
8f97a15d 528 _VALID_URL = None
529 _EMBED_REGEX = []
d6983cb4 530
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return the hint telling the user how to authenticate.

    method: Which hint to return; one of None (empty hint), 'any',
            'password' or 'cookies'. When left as NO_DEFAULT, 'any' is
            used if supports_login() is true and 'cookies' otherwise.
            Any other value raises KeyError.
    netrc:  Machine name to mention for --netrc; falls back to
            _NETRC_MACHINE when empty.
    """
    machine = netrc or self._NETRC_MACHINE
    password_hint = f'--username and --password, or --netrc ({machine}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }
    if method is NO_DEFAULT:
        # No explicit request: pick the broadest hint this extractor can honour
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]
9d5d4d64 541
def __init__(self, downloader=None):
    """Create the extractor, optionally bound to a downloader.

    downloader: A YoutubeDL instance, or None. When it is not given here,
                it must be supplied through "set_downloader()" before
                "extract()" is called.
    """
    # Per-instance state; these shadow the class-level defaults
    self._printed_messages = set()
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)
550
@classmethod
def _match_valid_url(cls, url):
    """Match url against this class's _VALID_URL and return the match object.

    Returns None when the URL does not match, or when _VALID_URL is False.
    The compiled pattern is cached on the class as _VALID_URL_RE.
    """
    pattern = cls._VALID_URL
    if pattern is False:
        return None
    # Look in the class's own __dict__ on purpose instead of has/getattr:
    # we need to know whether the regexp was cached for *this* class, while
    # getattr would also pick up a regexp cached by a superclass
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(pattern)
    return compiled.match(url)
561
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True or False)."""
    # NB: This function must import everything it needs (except other
    # extractors), so that lazy_extractors works correctly
    mobj = cls._match_valid_url(url)
    return mobj is not None
d6983cb4 568
ed9266db
PH
@classmethod
def _match_id(cls, url):
    """Return the "id" group captured by _VALID_URL for the given URL.

    No error handling is done here: a URL that does not match surfaces as
    AttributeError (there is no match object) and a pattern without an "id"
    group as IndexError.
    """
    mobj = cls._match_valid_url(url)
    return mobj.group('id')
ed9266db 572
@classmethod
def get_temp_id(cls, url):
    """Best-effort lookup of the video id from the URL alone.

    Returns whatever _match_id() yields, or None if it fails with
    IndexError or AttributeError.
    """
    try:
        temp_id = cls._match_id(url)
    except (AttributeError, IndexError):
        temp_id = None
    return temp_id
579
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the _WORKING attribute of this IE."""
    is_working = cls._WORKING
    return is_working
584
@classmethod
def supports_login(cls):
    """Whether a (non-empty) _NETRC_MACHINE is set for this IE."""
    return True if cls._NETRC_MACHINE else False
588
d6983cb4
PH
def initialize(self):
    """Prepare this instance for extraction (geo bypass, authentication, etc).

    The printed-messages set is reset and geo bypass initialization is
    attempted on every call. The pre-login hook, the login itself and
    _real_initialize() only run while the instance is not marked as ready.
    """
    self._printed_messages = set()
    geo_bypass_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_bypass_context)
    if self._ready:
        return

    self._initialize_pre_login()
    if not self.supports_login():
        # A username was passed to an extractor that has no password login
        if self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
            self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    else:
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    self._real_initialize()
    self._ready = True
606
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    HTTP requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    Nothing is done when a fake IP has already been selected or when the
    user disabled the mechanism with the "geo_bypass" param.
    """
    # A fake IP was already selected earlier, keep it
    if self._x_forwarded_for_ip:
        return

    # Geo bypass mechanism is explicitly disabled by user
    if not self.get_param('geo_bypass', True):
        return

    if not geo_bypass_context:
        geo_bypass_context = {}

    # Backward compatibility: previously _initialize_geo_bypass
    # expected a list of countries, some 3rd party code may still use
    # it this way
    if isinstance(geo_bypass_context, (list, tuple)):
        geo_bypass_context = {
            'countries': geo_bypass_context,
        }

    # The whole point of geo bypass mechanism is to fake IP
    # as X-Forwarded-For HTTP header based on some IP block or
    # country code.

    # Path 1: bypassing based on IP block in CIDR notation

    # Explicit IP block specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    ip_block = self.get_param('geo_bypass_ip_block', None)

    # Otherwise use random IP block from geo bypass context but only
    # if extractor is known as geo bypassable
    if not ip_block:
        ip_blocks = geo_bypass_context.get('ip_blocks')
        if self._GEO_BYPASS and ip_blocks:
            ip_block = random.choice(ip_blocks)

    if ip_block:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
        self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
        return

    # Path 2: bypassing based on country code

    # Explicit country code specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    country = self.get_param('geo_bypass_country', None)

    # Otherwise use random country code from geo bypass context but
    # only if extractor is known as geo bypassable
    if not country:
        countries = geo_bypass_context.get('countries')
        if self._GEO_BYPASS and countries:
            country = random.choice(countries)

    if country:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
        # Report through the extractor's own write_debug, same as Path 1,
        # instead of reaching into self._downloader directly
        self.write_debug(
            f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
689
def extract(self, url):
    """
    Initialize the extractor, run _real_extract() on url and return its result.

    Returns None if _real_extract() returns None. When a GeoRestrictedError is
    raised and a fake X-Forwarded-For IP can be set up, extraction is retried once.

    Raises ExtractorError (annotated with video id, extractor name and traceback
    when those are missing); IncompleteRead, KeyError and StopIteration are
    wrapped into ExtractorError.
    """
    try:
        for _ in range(2):
            try:
                self.initialize()
                self.to_screen('Extracting URL: %s' % (
                    url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for lang in ('live_chat', 'comments', 'danmaku'):
                        subtitles.pop(lang, None)
                return ie_result
            except GeoRestrictedError as e:
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        # NB: No trailing commas here; they would silently turn
        # the video id and the extractor name into 1-tuples
        e.video_id = e.video_id or self.get_temp_id(url)
        e.ie = e.ie or self.IE_NAME
        e.traceback = e.traceback or sys.exc_info()[2]
        raise
    except http.client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 723
4248dad9 724 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 725 if (not self.get_param('geo_bypass_country', None)
3089bc74 726 and self._GEO_BYPASS
a06916d9 727 and self.get_param('geo_bypass', True)
3089bc74
S
728 and not self._x_forwarded_for_ip
729 and countries):
eea0716c
S
730 country_code = random.choice(countries)
731 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
732 if self._x_forwarded_for_ip:
733 self.report_warning(
eea0716c
S
734 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
735 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
736 return True
737 return False
738
def set_downloader(self, downloader):
    """Sets a YoutubeDL instance as the downloader for this IE (stored as self._downloader)."""
    self._downloader = downloader
742
@property
def cache(self):
    """The cache attribute of the attached downloader."""
    return self._downloader.cache
746
@property
def cookiejar(self):
    """The cookiejar attribute of the attached downloader."""
    return self._downloader.cookiejar
750
52efa4b3 751 def _initialize_pre_login(self):
962ffcf8 752 """ Initialization before login. Redefine in subclasses."""
52efa4b3 753 pass
754
755 def _perform_login(self, username, password):
756 """ Login with username and password. Redefine in subclasses."""
757 pass
758
d6983cb4
PH
759 def _real_initialize(self):
760 """Real initialization process. Redefine in subclasses."""
761 pass
762
def _real_extract(self, url):
    """
    Real extraction process. Redefine in subclasses.

    The base implementation always raises NotImplementedError.
    """
    raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 766
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # The class name without its last two characters (presumably the "IE" suffix)
    return cls.__name__[:-2]
56c73665 771
@classproperty
def IE_NAME(cls):
    """Default extractor name: the class name without its last two characters (presumably the "IE" suffix)"""
    return cls.__name__[:-2]
d6983cb4 775
d391b7e2
S
776 @staticmethod
777 def __can_accept_status_code(err, expected_status):
ac668111 778 assert isinstance(err, urllib.error.HTTPError)
d391b7e2
S
779 if expected_status is None:
780 return False
d391b7e2
S
781 elif callable(expected_status):
782 return expected_status(err.code) is True
783 else:
6606817a 784 return err.code in variadic(expected_status)
d391b7e2 785
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """
    Build a request from either a URL or an existing urllib Request.

    An existing Request is updated with data/headers/query; otherwise the
    query (if any) is merged into the URL and a sanitized request is created.
    """
    if not isinstance(url_or_request, urllib.request.Request):
        target_url = update_url_query(url_or_request, query) if query else url_or_request
        return sanitized_Request(target_url, data, headers or {})
    return update_Request(url_or_request, data=data, headers=headers, query=query)
f95b9dee 792
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
    """
    Return the response handle.

    Returns False instead when the request fails and either errnote is False
    or fatal is False (in the latter case a warning is reported).

    See _download_webpage_handle docstring for arguments specification.
    """
    # Sleep between requests, but never before the very first one
    if not self._downloader._first_webpage_request:
        sleep_interval = self.get_param('sleep_interval_requests') or 0
        if sleep_interval > 0:
            self.to_screen('Sleeping %s seconds ...' % sleep_interval)
            time.sleep(sleep_interval)
    else:
        self._downloader._first_webpage_request = False

    # note=None prints the default message; note=False prints nothing
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen(str(note))
        else:
            self.to_screen(f'{video_id}: {note}')

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        # Copy so that the caller's dict is not modified
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    try:
        return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
    except network_exceptions as err:
        if isinstance(err, urllib.error.HTTPError):
            if self.__can_accept_status_code(err, expected_status):
                # Retain reference to error to prevent file object from
                # being closed before it can be read. Works around the
                # effects of <https://bugs.python.org/issue15002>
                # introduced in Python 3.4.1.
                err.fp._error = err
                return err.fp

        # errnote=False silences the failure completely, even when fatal
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = f'{errnote}: {error_to_compat_str(err)}'
        if fatal:
            raise ExtractorError(errmsg, cause=err)
        else:
            self.report_warning(errmsg)
            return False
d6983cb4 847
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                             encoding=None, data=None, headers={}, query={}, expected_status=None):
    """
    Return a tuple (page content as string, URL handle).

    Returns False instead if the request failed and _request_webpage
    did not raise.

    Arguments:
    url_or_request -- plain text URL as a string or
        a urllib.request.Request object
    video_id -- Video/playlist/item identifier (string)

    Keyword arguments:
    note -- note printed before downloading (string)
    errnote -- note printed in case of an error (string)
    fatal -- flag denoting whether error should be considered fatal,
        i.e. whether it should cause ExtractorError to be raised,
        otherwise a warning will be reported and extraction continued
    encoding -- encoding for a page content decoding, guessed automatically
        when not explicitly specified
    data -- POST data (bytes)
    headers -- HTTP headers (dict)
    query -- URL query (dict)
    expected_status -- allows to accept failed HTTP requests (non 2xx
        status code) by explicitly specifying a set of accepted status
        codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
        Note that this argument does not affect success status codes (2xx)
        which are always accepted.
    """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, str):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
    if urlh is False:
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
892
c9a77969
YCH
893 @staticmethod
894 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
895 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
896 if m:
897 encoding = m.group(1)
898 else:
0d75ae2c 899 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
900 webpage_bytes[:1024])
901 if m:
902 encoding = m.group(1).decode('ascii')
b60016e8
PH
903 elif webpage_bytes.startswith(b'\xff\xfe'):
904 encoding = 'utf-16'
f143d86a
PH
905 else:
906 encoding = 'utf-8'
c9a77969
YCH
907
908 return encoding
909
4457823d
S
def __check_blocked(self, content):
    """
    Raise an expected ExtractorError if content looks like one of the known
    "access blocked" pages (Websense, Indian censorship, Russian blocklist).

    Returns None when none of the known markers is found.
    """
    # Some markers are only looked for at the beginning of the page
    first_block = content[:512]
    if ('<title>Access to this site is blocked</title>' in content
            and 'Websense' in first_block):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    if '<title>The URL you requested has been blocked</title>' in first_block:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)
    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
            and 'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
937
def _request_dump_filename(self, url, video_id):
    """
    Return the name of the dump file used for the page at url.

    Names longer than the "trim_file_name" param (240 if unset) are cut
    and suffixed with an MD5 digest of the untrimmed name.
    """
    stem = f'{video_id}_{url}'
    max_length = self.get_param('trim_file_name') or 240
    if len(stem) > max_length:
        digest = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:max_length - len(digest)] + digest
    dump_name = sanitize_filename(f'{stem}.dump', restricted=True)
    if compat_os_name != 'nt':
        return dump_name
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    full_path = os.path.abspath(dump_name)
    return fR'\\?\{full_path}' if len(full_path) > 259 else dump_name
952
953 def __decode_webpage(self, webpage_bytes, encoding, headers):
954 if not encoding:
955 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
956 try:
957 return webpage_bytes.decode(encoding, 'replace')
958 except LookupError:
959 return webpage_bytes.decode('utf-8', 'replace')
960
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the whole response from urlh and return it decoded as a string.

    prefix (bytes), if given, is prepended to the data read. Depending on the
    "dump_intermediate_pages" and "write_pages" params, the raw bytes are
    printed base64-encoded and/or saved to a dump file. Raises ExtractorError
    if the page is a known "access blocked" page.
    """
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self.get_param('write_pages'):
        filename = self._request_dump_filename(urlh.geturl(), video_id)
        self.to_screen(f'Saving request to {filename}')
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
    self.__check_blocked(content)

    return content
d6983cb4 979
6edf2808 980 def __print_error(self, errnote, fatal, video_id, err):
981 if fatal:
c6e07cf1 982 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 983 elif errnote:
c6e07cf1 984 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 985
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """
    Parse xml_string (after applying transform_source, if given) into an element.

    On a parse error, ExtractorError is raised if fatal; otherwise the error
    is reported and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = errnote if errnote is not None else 'Failed to parse XML'
        self.__print_error(message, fatal, video_id, ve)
267ed0c5 993
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """
    Parse json_string with LenientJSONDecoder; parser_kwargs are passed to the decoder.

    On a ValueError, ExtractorError is raised if fatal; otherwise the error
    is reported and None is returned.
    """
    try:
        parsed = json.loads(
            json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
    except ValueError as ve:
        message = errnote if errnote is not None else 'Failed to parse JSON'
        self.__print_error(message, fatal, video_id, ve)
    else:
        return parsed
3d3538e4 1000
6edf2808 1001 def _parse_socket_response_as_json(self, data, *args, **kwargs):
1002 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 1003
617f658b 1004 def __create_download_methods(name, parser, note, errnote, return_value):
1005
6edf2808 1006 def parse(ie, content, *args, errnote=errnote, **kwargs):
617f658b 1007 if parser is None:
1008 return content
6edf2808 1009 if errnote is False:
1010 kwargs['errnote'] = errnote
617f658b 1011 # parser is fetched by name so subclasses can override it
1012 return getattr(ie, parser)(content, *args, **kwargs)
1013
c4910024 1014 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1015 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1016 res = self._download_webpage_handle(
1017 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1018 data=data, headers=headers, query=query, expected_status=expected_status)
617f658b 1019 if res is False:
1020 return res
1021 content, urlh = res
6edf2808 1022 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
617f658b 1023
f95b9dee 1024 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
c4910024 1025 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
f95b9dee 1026 if self.get_param('load_pages'):
1027 url_or_request = self._create_request(url_or_request, data, headers, query)
1028 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1029 self.to_screen(f'Loading request from {filename}')
1030 try:
1031 with open(filename, 'rb') as dumpf:
1032 webpage_bytes = dumpf.read()
1033 except OSError as e:
1034 self.report_warning(f'Unable to load request from disk: {e}')
1035 else:
1036 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
6edf2808 1037 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
c4910024 1038 kwargs = {
1039 'note': note,
1040 'errnote': errnote,
1041 'transform_source': transform_source,
1042 'fatal': fatal,
1043 'encoding': encoding,
1044 'data': data,
1045 'headers': headers,
1046 'query': query,
1047 'expected_status': expected_status,
1048 }
617f658b 1049 if parser is None:
c4910024 1050 kwargs.pop('transform_source')
617f658b 1051 # The method is fetched by name so subclasses can override _download_..._handle
c4910024 1052 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
617f658b 1053 return res if res is False else res[0]
1054
1055 def impersonate(func, name, return_value):
1056 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1057 func.__doc__ = f'''
1058 @param transform_source Apply this transformation before parsing
1059 @returns {return_value}
1060
1061 See _download_webpage_handle docstring for other arguments specification
1062 '''
1063
1064 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1065 impersonate(download_content, f'_download_{name}', f'{return_value}')
1066 return download_handle, download_content
1067
# Generate the _download_<name>_handle/_download_<name> method pairs.
# Arguments: name, parser method name, default note, default errnote, description of the return value
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# Only the content-returning method is kept here; it is wrapped by _download_webpage
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1075
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries; only http.client.IncompleteRead is retried
    timeout -- sleep interval between tries (5 when not given)

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries and timeout are practically unused, but no deprecation warning
    # is issued for them. The default must still be resolved here, since
    # otherwise a retry would try to sleep for NO_DEFAULT
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except http.client.IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
adddc50c 1107
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """
    Report a warning prefixed with '[ie_name]' and, if given, the video id.

    With only_once, a message that was already recorded in
    self._printed_messages is not reported again.
    """
    id_prefix = format_field(video_id, None, '%s: ')
    full_msg = f'[{self.IE_NAME}] {id_prefix}{msg}'
    if only_once:
        seen_key = f'WARNING: {full_msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(full_msg, *args, **kwargs)
f45f96f8 1116
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'; other arguments go to the downloader's to_screen"""
    self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1120
def write_debug(self, msg, *args, **kwargs):
    """Pass msg, prefixed with '[ie_name]', to the downloader's write_debug"""
    self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1123
def get_param(self, name, default=None, *args, **kwargs):
    """Look up name in the downloader's params; return default if there is no downloader or no such param."""
    ydl = self._downloader
    if not ydl:
        return default
    return ydl.params.get(name, default, *args, **kwargs)
d6983cb4 1128
def report_drm(self, video_id, partial=NO_DEFAULT):
    """
    Report that the video is DRM protected, via raise_no_formats (expected=True).

    partial is no longer accepted; passing it only triggers a deprecation warning.
    """
    if partial is not NO_DEFAULT:
        self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1133
d6983cb4
PH
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
1137
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
1141
def report_age_confirmation(self):
    """Report attempt to confirm age (prints a fixed message via to_screen)."""
    self.to_screen('Confirming age')
d6983cb4 1145
fc79158d
JMF
def report_login(self):
    """Report attempt to log in (prints a fixed message via to_screen)."""
    self.to_screen('Logging in')
fc79158d 1149
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """
    Raise an expected ExtractorError telling that login is required.

    If metadata_available and either the "ignore_no_formats_error" or the
    "wait_for_video" param is set, only a warning is reported instead.
    The login hint for method (if any) is appended to the error message.
    """
    warn_only = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if warn_only:
        self.report_warning(msg)
        return
    full_msg = msg + format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(full_msg, expected=True)
43e7d3c9 1159
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """
    Raise GeoRestrictedError with the given message and countries.

    If metadata_available and either the "ignore_no_formats_error" or the
    "wait_for_video" param is set, only a warning is reported instead.
    """
    warn_only = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if not warn_only:
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
1168
def raise_no_formats(self, msg, expected=False, video_id=None):
    """
    Raise an ExtractorError because no formats are available.

    If expected and either the "ignore_no_formats_error" or the
    "wait_for_video" param is set, only a warning is reported instead.
    msg may itself be an ExtractorError, which is then raised as is.
    """
    warn_only = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if warn_only:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1177
5f6a1245 1178 # Methods for following #608
c0d0b01f 1179 @staticmethod
311b6615 1180 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1181 """Returns a URL that points to a page that should be processed"""
311b6615 1182 if ie is not None:
1183 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
7012b23c 1184 if video_id is not None:
311b6615 1185 kwargs['id'] = video_id
830d53bf 1186 if video_title is not None:
311b6615 1187 kwargs['title'] = video_title
1188 return {
1189 **kwargs,
1190 '_type': 'url_transparent' if url_transparent else 'url',
1191 'url': url,
1192 }
1193
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """
    Returns a playlist with one url_result per unique value of getter(match).

    video_kwargs are passed to each url_result; the remaining keyword
    arguments are passed to playlist_result. Entries are generated lazily.
    """
    entries = (
        cls.url_result(match, ie, **(video_kwargs or {}))
        for match in orderedSet(map(getter, matches), lazy=True))
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
46b18f23 1200
c0d0b01f 1201 @staticmethod
311b6615 1202 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
d6983cb4 1203 """Returns a playlist"""
d6983cb4 1204 if playlist_id:
311b6615 1205 kwargs['id'] = playlist_id
d6983cb4 1206 if playlist_title:
311b6615 1207 kwargs['title'] = playlist_title
ecc97af3 1208 if playlist_description is not None:
311b6615 1209 kwargs['description'] = playlist_description
1210 return {
1211 **kwargs,
1212 '_type': 'multi_video' if multi_video else 'playlist',
1213 'entries': entries,
1214 }
d6983cb4 1215
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    group may be a single group or a list/tuple of groups; for the latter
    a tuple of the requested groups is returned.
    A string of None or an empty list of patterns counts as a failure.
    """
    # Start from "no match" so that an empty list of patterns is handled
    # like any other failed search instead of leaving mobj undefined
    mobj = None
    if string is not None:
        patterns = [pattern] if isinstance(pattern, (str, re.Pattern)) else pattern
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
1250
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """
    Searches string for the JSON object specified by start_pattern

    contains_pattern is the regex for the JSON text itself. Passing default
    makes the search non-fatal and silent, returning default on any failure;
    without it, {} is returned on non-fatal failure. Extra keyword arguments
    are passed to _parse_json.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    if default is NO_DEFAULT:
        default, has_default = {}, False
    else:
        fatal, has_default = False, True

    json_string = self._search_regex(
        rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        # ignore_extra is forwarded to the JSON decoder by _parse_json
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        elif not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
b7c47b74 1277
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html
    (each element separately when the result is a tuple).
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not isinstance(found, tuple):
        return clean_html(found)
    return tuple(clean_html(part) for part in found)
d6983cb4 1286
2118fdd1
RA
1287 def _get_netrc_login_info(self, netrc_machine=None):
1288 username = None
1289 password = None
1290 netrc_machine = netrc_machine or self._NETRC_MACHINE
1291
a06916d9 1292 if self.get_param('usenetrc', False):
2118fdd1 1293 try:
0001fcb5 1294 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1295 if os.path.isdir(netrc_file):
1296 netrc_file = os.path.join(netrc_file, '.netrc')
1297 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1298 if info is not None:
1299 username = info[0]
1300 password = info[2]
1301 else:
dcce092e
S
1302 raise netrc.NetrcParseError(
1303 'No authenticators for %s' % netrc_machine)
86e5f3ed 1304 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1305 self.report_warning(
dcce092e 1306 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1307
dcce092e 1308 return username, password
2118fdd1 1309
1b6712ab 1310 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1311 """
cf0649f8 1312 Get the login info as (username, password)
32443dd3
S
1313 First look for the manually specified credentials using username_option
1314 and password_option as keys in params dictionary. If no such credentials
1315 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1316 value.
fc79158d
JMF
1317 If there's no info available, return (None, None)
1318 """
fc79158d
JMF
1319
1320 # Attempt to use provided username and password or .netrc data
a06916d9 1321 username = self.get_param(username_option)
1322 if username is not None:
1323 password = self.get_param(password_option)
2118fdd1 1324 else:
1b6712ab 1325 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1326
2133565c 1327 return username, password
fc79158d 1328
e64b7569 1329 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1330 """
1331 Get the two-factor authentication info
1332 TODO - asking the user will be required for sms/phone verify
1333 currently just uses the command line option
1334 If there's no info available, return None
1335 """
83317f69 1336
a06916d9 1337 tfa = self.get_param('twofactor')
1338 if tfa is not None:
1339 return tfa
83317f69 1340
ac668111 1341 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1342
46720279
JMF
1343 # Helper functions for extracting OpenGraph info
1344 @staticmethod
ab2d5247 1345 def _og_regexes(prop):
45b2ee6f 1346 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
fbfde1c3
F
1347 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1348 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1349 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1350 return [
78fb87b2
JMF
1351 template % (property_re, content_re),
1352 template % (content_re, property_re),
ab2d5247 1353 ]
46720279 1354
864f24bd
S
1355 @staticmethod
1356 def _meta_regex(prop):
1357 return r'''(?isx)<meta
8b9848ac 1358 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1359 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1360
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for the OpenGraph property prop (or the first found of several).

    name defaults to 'OpenGraph <first property>'. Extra keyword arguments are
    passed to _search_regex. The result is HTML-unescaped, unless it is None.
    """
    props = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % props[0]
    og_regexes = [regex for p in props for regex in self._og_regexes(p)]
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
46720279
JMF
1372
def _og_search_thumbnail(self, html, **kargs):
    """Search html for the OpenGraph "image" property, non-fatally"""
    return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1375
def _og_search_description(self, html, **kargs):
    """Search html for the OpenGraph "description" property, non-fatally"""
    return self._og_search_property('description', html, fatal=False, **kargs)
1378
def _og_search_title(self, html, *, fatal=False, **kargs):
    """Search html for the OpenGraph "title" property (non-fatal unless fatal is passed)"""
    return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1381
8ffa13e0 1382 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1383 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1384 if secure:
1385 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1386 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1387
78338f71
JMF
def _og_search_url(self, html, **kargs):
    """Search html for the OpenGraph "url" property; keyword arguments are passed to _og_search_property"""
    return self._og_search_property('url', html, **kargs)
1390
def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
    """Return the cleaned content of the <title> element of html (non-fatal unless fatal is passed)"""
    return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1393
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Return the `content` of the first <meta> tag matching any of `name`.

    `name` may be a single value or several; each is turned into a pattern
    with `_meta_regex`. `display_name` defaults to the first name given.
    """
    names = variadic(name)
    label = names[0] if display_name is None else display_name
    patterns = [self._meta_regex(candidate) for candidate in names]
    return self._html_search_regex(
        patterns, html, label, fatal=fatal, group='content', **kwargs)
59040888
PH
1401
1402 def _dc_search_uploader(self, html):
1403 return self._html_search_meta('dc.creator', html, 'uploader')
1404
@staticmethod
def _rta_search(html):
    """Guess an age limit from RTA labelling and similar markers in `html`.

    Returns 18 when the RTA rating <meta> tag is present. Otherwise returns
    the highest limit implied by the textual markers below, or 0 when none
    of them matches.
    """
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    rta_meta = re.search(
        r'(?ix)<meta\s+name="rating"\s+'
        r'     content="RTA-5042-1996-1400-1577-RTA"',
        html)
    if rta_meta:
        return 18

    # And then there are the jokers who advertise that they use RTA, but actually don't.
    AGE_LIMIT_MARKERS = [
        r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
        r'>[^<]*you acknowledge you are at least (\d+) years old',
        r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
    ]

    matches = (re.search(marker, html) for marker in AGE_LIMIT_MARKERS)
    # A marker without a capture group falls back to the default of 18
    limits = [int(traverse_obj(found, 1, default=18)) for found in matches if found]
    return max([0, *limits])
8dbe9899 1426
59040888
PH
1427 def _media_rating_search(self, html):
1428 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1429 rating = self._html_search_meta('rating', html)
1430
1431 if not rating:
1432 return None
1433
1434 RATING_TABLE = {
1435 'safe for kids': 0,
1436 'general': 8,
1437 '14 years': 14,
1438 'mature': 17,
1439 'restricted': 19,
1440 }
d800609c 1441 return RATING_TABLE.get(rating.lower())
59040888 1442
69319969 1443 def _family_friendly_search(self, html):
6ca7732d 1444 # See http://schema.org/VideoObject
ac8491fc
S
1445 family_friendly = self._html_search_meta(
1446 'isFamilyFriendly', html, default=None)
69319969
NJ
1447
1448 if not family_friendly:
1449 return None
1450
1451 RATING_TABLE = {
1452 '1': 0,
1453 'true': 0,
1454 '0': 18,
1455 'false': 18,
1456 }
d800609c 1457 return RATING_TABLE.get(family_friendly.lower())
69319969 1458
0c708f11
JMF
1459 def _twitter_search_player(self, html):
1460 return self._html_search_meta('twitter:player', html,
9e1a5b84 1461 'twitter card player')
0c708f11 1462
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html"""
    # Supplying a default makes JSON parsing non-fatal
    parse_fatal = fatal and default is NO_DEFAULT
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=parse_fatal)
        # A script may hold one object or a list of them; only dicts are yielded
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1472
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html"""
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False

    json_ld_objects = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(json_ld_objects, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info

    # Nothing usable was found: default, then hard failure, then warning
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
4ca2a3cf 1489
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """Build an info dict from JSON-LD data.

    `json_ld` may be a JSON string (parsed with `_parse_json`) or already
    parsed data (a dict or a list of dicts). Entries are visited in order
    and the fields found are accumulated in a single dict; when
    `expected_type` is given, only entries whose `@type` includes it are
    considered. Returns the accumulated dict passed through `filter_dict`,
    or {} when there is nothing to parse.
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    # Shared accumulator mutated by all helpers below
    info = {}

    # Last path component of schema.org interactionType -> info dict count prefix
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # `@type` may be a single value or a list; match if any expected type is present
        type = variadic(traverse_obj(e, '@type'))
        return any(x in type for x in expected_types)

    def extract_interaction_type(e):
        # interactionType is either a plain value or an object carrying `@type`
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Fill `<kind>_count` fields from InteractionCounter entries
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # The first value found for a given counter wins
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # Chapters come from `hasPart` entries of type Clip
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
        # Fill missing boundaries from the neighbouring chapters. zip() stops at
        # the shortest input (chapters[1:]), so the last chapter is not visited here
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # The last chapter falls back to the duration stored by extract_video_object
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        # Copy the media fields of a VideoObject/AudioObject entry into `info`
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        # Must run after the update above: chapters read info['duration']
        extract_interaction_statistic(e)
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            # Top-level entries without `@context` are ignored
            if at_top_level and '@context' not in e:
                continue
            # A pure `@graph` wrapper: descend into its members instead
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                # Articles may embed their video as the first `video` or `subjectOf` item
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                # With an expected type, stop at the first matching entry;
                # otherwise keep merging the remaining entries
                if expected_type is None:
                    continue
                else:
                    break
            # Any other (or already handled) entry may still carry a nested `video` object
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)
4ca2a3cf 1659
135dfa2c 1660 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1661 return self._parse_json(
1662 self._search_regex(
1663 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1664 webpage, 'next.js data', fatal=fatal, **kw),
1665 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1666
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
    rectx = re.escape(context_name)
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    patterns = (
        rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>',
        rf'{rectx}\(.*?{FUNCTION_RE}',
    )
    not_found = NO_DEFAULT if fatal else (None, None, None)
    js, arg_keys, arg_vals = self._search_regex(
        patterns, webpage, context_name,
        group=('js', 'arg_keys', 'arg_vals'), default=not_found)
    if js is None:
        return {}

    # Pair every parameter name of the wrapper function with its call
    # argument, re-serialised as JSON, so that js_to_json can substitute it
    values = self._parse_json(
        f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()
    args = {key: json.dumps(value) for key, value in zip(arg_keys.split(','), values)}

    to_json = functools.partial(js_to_json, vars=args)
    ret = self._parse_json(js, video_id, transform_source=to_json, fatal=fatal)
    return traverse_obj(ret, traverse) or {}
66f4c04e 1683
@staticmethod
def _hidden_inputs(html):
    """Collect name -> value of the hidden and submit <input> tags in `html`.

    HTML comments are stripped first. An input is keyed by its `name`, or by
    its `id` when there is no name, and is skipped when it has neither a key
    nor a `value` attribute.
    """
    without_comments = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    collected = {}
    for tag in re.findall(r'(?i)(<input[^>]+>)', without_comments):
        attrs = extract_attributes(tag)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        key = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if key and value is not None:
            collected[key] = value
    return collected
27713812 1699
cf61d96d
S
1700 def _form_hidden_inputs(self, form_id, html):
1701 form = self._search_regex(
73eb13df 1702 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1703 html, '%s form' % form_id, group='form')
1704 return self._hidden_inputs(form)
1705
@classproperty(cache=True)
def FormatSort(cls):
    """Deprecated alias: a FormatSorter subclass bound to the extractor's downloader.

    A deprecation warning is emitted when the property is evaluated.
    """
    message = (
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')

    class FormatSort(FormatSorter):
        def __init__(ie, *args, **kwargs):
            # The first argument is expected to expose `_downloader`
            super().__init__(ie._downloader, *args, **kwargs)

    deprecation_warning(message)
    return FormatSort
eb8a4433 1716
1717 def _sort_formats(self, formats, field_preference=[]):
9f14daf2 1718 if not field_preference:
1719 self._downloader.deprecation_warning(
1720 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1721 return
1722 self._downloader.deprecation_warning(
1723 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1724 'Return _format_sort_fields in the info_dict instead')
1725 if formats:
784320c9 1726 formats[0]['__sort_fields'] = field_preference
59040888 1727
96a53167
S
1728 def _check_formats(self, formats, video_id):
1729 if formats:
1730 formats[:] = filter(
1731 lambda f: self._is_valid_url(
1732 f['url'], video_id,
1733 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1734 formats)
1735
f5bdb444
S
1736 @staticmethod
1737 def _remove_duplicate_formats(formats):
1738 format_urls = set()
1739 unique_formats = []
1740 for f in formats:
1741 if f['url'] not in format_urls:
1742 format_urls.add(f['url'])
1743 unique_formats.append(f)
1744 formats[:] = unique_formats
1745
45024183 1746 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1747 url = self._proto_relative_url(url, scheme='http:')
1748 # For now assume non HTTP(S) URLs always valid
1749 if not (url.startswith('http://') or url.startswith('https://')):
1750 return True
96a53167 1751 try:
45024183 1752 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1753 return True
8bdd16b4 1754 except ExtractorError as e:
25e911a9 1755 self.to_screen(
8bdd16b4 1756 '%s: %s URL is invalid, skipping: %s'
1757 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1758 return False
96a53167 1759
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1766
def _proto_relative_url(self, url, scheme=None):
    """Pass `url` through `sanitize_url` with a scheme.

    `scheme` must end with ':' (e.g. 'https:'); it defaults to `http_scheme()`.
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    bare_scheme = scheme[:-1]
    return sanitize_url(url, scheme=bare_scheme)
57c7411f 1771
4094b6e3
PH
1772 def _sleep(self, timeout, video_id, msg_template=None):
1773 if msg_template is None:
f1a9d64e 1774 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1775 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1776 self.to_screen(msg)
1777 time.sleep(timeout)
1778
f983b875 1779 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1780 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1781 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
0b5546c7 1782 if self.get_param('ignore_no_formats_error'):
1783 fatal = False
1784
a076c1f9 1785 res = self._download_xml_handle(
f036a632 1786 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1787 'Unable to download f4m manifest',
1788 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1789 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1790 transform_source=transform_source,
7360c06f 1791 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 1792 if res is False:
8d29e47f 1793 return []
31bb8d3f 1794
a076c1f9
E
1795 manifest, urlh = res
1796 manifest_url = urlh.geturl()
1797
0fdbb332 1798 return self._parse_f4m_formats(
f983b875 1799 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1800 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1801
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Build the list of formats described by a parsed f4m manifest.

    `manifest` is the manifest's XML root element and `manifest_url` the URL
    it was fetched from. Media entries that point at further f4m or m3u8
    manifests are extracted recursively; all other entries become `f4m`
    protocol formats. Returns [] when `manifest` is not an XML element and
    `fatal` is false, when the manifest carries a player verification
    challenge, or when no usable media entries remain.
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An empty <pv-2.0/> element has text None; treat it as having no challenge
    akamai_pv_text = (akamai_pv.text or '') if akamai_pv is not None else ''
    if ';' in akamai_pv_text:
        playerVerificationChallenge = akamai_pv_text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)
    # Base for relative media URLs. Computed once from the manifest's own URL
    # so that every media entry is resolved against the same location
    relative_base = manifest_base_url or '/'.join(manifest_url.split('/')[:-1])

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL used for this entry; stays the manifest URL for stream-level manifests
        media_manifest_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            media_manifest_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else relative_base + '/' + media_url)
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(media_manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    media_manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_manifest_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': media_manifest_url,
            'manifest_url': media_manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
1903
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """Build the 'meta' format entry that points at the m3u8 URL itself.

    Its preference is 100 below the given one (or -100 when none is given).
    """
    meta_preference = preference - 100 if preference else -100
    meta_format = {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    return meta_format
1915
def _report_ignoring_subs(self, name):
    """Warn (with only_once=True) that subtitle tracks of the `name` manifest are ignored."""
    prefix = (
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(bug_reports_message(prefix), only_once=True)
1921
a0c3b2d5
F
1922 def _extract_m3u8_formats(self, *args, **kwargs):
1923 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1924 if subs:
b5ae35ee 1925 self._report_ignoring_subs('HLS')
a0c3b2d5
F
1926 return fmts
1927
1928 def _extract_m3u8_formats_and_subtitles(
177877c5 1929 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1930 preference=None, quality=None, m3u8_id=None, note=None,
1931 errnote=None, fatal=True, live=False, data=None, headers={},
1932 query={}):
1933
0b5546c7 1934 if self.get_param('ignore_no_formats_error'):
1935 fatal = False
1936
71df9b7f 1937 if not m3u8_url:
1938 if errnote is not False:
1939 errnote = errnote or 'Failed to obtain m3u8 URL'
1940 if fatal:
1941 raise ExtractorError(errnote, video_id=video_id)
1942 self.report_warning(f'{errnote}{bug_reports_message()}')
1943 return [], {}
1944
dbd82a1d 1945 res = self._download_webpage_handle(
81515ad9 1946 m3u8_url, video_id,
37a3bb66 1947 note='Downloading m3u8 information' if note is None else note,
1948 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1949 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1950
dbd82a1d 1951 if res is False:
a0c3b2d5 1952 return [], {}
cb252080 1953
dbd82a1d 1954 m3u8_doc, urlh = res
37113045 1955 m3u8_url = urlh.geturl()
9cdffeeb 1956
a0c3b2d5 1957 return self._parse_m3u8_formats_and_subtitles(
cb252080 1958 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1959 preference=preference, quality=quality, m3u8_id=m3u8_id,
1960 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1961 headers=headers, query=query, video_id=video_id)
cb252080 1962
a0c3b2d5 1963 def _parse_m3u8_formats_and_subtitles(
42676437 1964 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1965 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1966 errnote=None, fatal=True, data=None, headers={}, query={},
1967 video_id=None):
60755938 1968 formats, subtitles = [], {}
a0c3b2d5 1969
6b993ca7 1970 has_drm = re.search('|'.join([
1971 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
1972 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
1973 ]), m3u8_doc)
a0c3b2d5 1974
60755938 1975 def format_url(url):
14f25df2 1976 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
60755938 1977
1978 if self.get_param('hls_split_discontinuity', False):
1979 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1980 if not m3u8_doc:
1981 if not manifest_url:
1982 return []
1983 m3u8_doc = self._download_webpage(
1984 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1985 note=False, errnote='Failed to download m3u8 playlist information')
1986 if m3u8_doc is False:
1987 return []
1988 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 1989
60755938 1990 else:
1991 def _extract_m3u8_playlist_indices(*args, **kwargs):
1992 return [None]
310c2ed2 1993
cb252080
S
1994 # References:
1995 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
1996 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1997 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
1998
1999 # We should try extracting formats only from master playlists [1, 4.3.4],
2000 # i.e. playlists that describe available qualities. On the other hand
2001 # media playlists [1, 4.3.3] should be returned as is since they contain
2002 # just the media without qualities renditions.
9cdffeeb 2003 # Fortunately, master playlist can be easily distinguished from media
cb252080 2004 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
a0566bbf 2005 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2006 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2007 # media playlist and MUST NOT appear in master playlist thus we can
2008 # clearly detect media playlist with this criterion.
2009
9cdffeeb 2010 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2011 formats = [{
34921b43 2012 'format_id': join_nonempty(m3u8_id, idx),
60755938 2013 'format_index': idx,
42676437 2014 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2015 'ext': ext,
2016 'protocol': entry_protocol,
2017 'preference': preference,
2018 'quality': quality,
88acdbc2 2019 'has_drm': has_drm,
60755938 2020 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2021
a0c3b2d5 2022 return formats, subtitles
cb252080
S
2023
2024 groups = {}
2025 last_stream_inf = {}
2026
2027 def extract_media(x_media_line):
2028 media = parse_m3u8_attributes(x_media_line)
2029 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2030 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2031 if not (media_type and group_id and name):
2032 return
2033 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2034 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2035 if media_type == 'SUBTITLES':
3907333c 2036 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2037 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2038 # However, lack of URI has been spotted in the wild.
2039 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2040 if not media.get('URI'):
2041 return
a0c3b2d5
F
2042 url = format_url(media['URI'])
2043 sub_info = {
2044 'url': url,
2045 'ext': determine_ext(url),
2046 }
4a2f19ab
F
2047 if sub_info['ext'] == 'm3u8':
2048 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2049 # files may contain is WebVTT:
2050 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2051 sub_info['ext'] = 'vtt'
2052 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2053 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2054 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2055 if media_type not in ('VIDEO', 'AUDIO'):
2056 return
2057 media_url = media.get('URI')
2058 if media_url:
310c2ed2 2059 manifest_url = format_url(media_url)
60755938 2060 formats.extend({
34921b43 2061 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2062 'format_note': name,
2063 'format_index': idx,
2064 'url': manifest_url,
2065 'manifest_url': m3u8_url,
2066 'language': media.get('LANGUAGE'),
2067 'ext': ext,
2068 'protocol': entry_protocol,
2069 'preference': preference,
2070 'quality': quality,
43a3eaf9 2071 'has_drm': has_drm,
60755938 2072 'vcodec': 'none' if media_type == 'AUDIO' else None,
2073 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2074
2075 def build_stream_name():
2076 # Despite specification does not mention NAME attribute for
3019cb0c
S
2077 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2078 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2079 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2080 stream_name = last_stream_inf.get('NAME')
2081 if stream_name:
2082 return stream_name
2083 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2084 # from corresponding rendition group
2085 stream_group_id = last_stream_inf.get('VIDEO')
2086 if not stream_group_id:
2087 return
2088 stream_group = groups.get(stream_group_id)
2089 if not stream_group:
2090 return stream_group_id
2091 rendition = stream_group[0]
2092 return rendition.get('NAME') or stream_group_id
2093
379306ef 2094 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2095 # chance to detect video only formats when EXT-X-STREAM-INF tags
2096 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2097 for line in m3u8_doc.splitlines():
2098 if line.startswith('#EXT-X-MEDIA:'):
2099 extract_media(line)
2100
704df56d
PH
2101 for line in m3u8_doc.splitlines():
2102 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2103 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2104 elif line.startswith('#') or not line.strip():
2105 continue
2106 else:
9c99bef7 2107 tbr = float_or_none(
3089bc74
S
2108 last_stream_inf.get('AVERAGE-BANDWIDTH')
2109 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2110 manifest_url = format_url(line.strip())
5ef62fc4 2111
60755938 2112 for idx in _extract_m3u8_playlist_indices(manifest_url):
2113 format_id = [m3u8_id, None, idx]
310c2ed2 2114 # Bandwidth of live streams may differ over time thus making
2115 # format_id unpredictable. So it's better to keep provided
2116 # format_id intact.
2117 if not live:
60755938 2118 stream_name = build_stream_name()
34921b43 2119 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2120 f = {
34921b43 2121 'format_id': join_nonempty(*format_id),
60755938 2122 'format_index': idx,
310c2ed2 2123 'url': manifest_url,
2124 'manifest_url': m3u8_url,
2125 'tbr': tbr,
2126 'ext': ext,
2127 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2128 'protocol': entry_protocol,
2129 'preference': preference,
2130 'quality': quality,
43a3eaf9 2131 'has_drm': has_drm,
310c2ed2 2132 }
2133 resolution = last_stream_inf.get('RESOLUTION')
2134 if resolution:
2135 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2136 if mobj:
2137 f['width'] = int(mobj.group('width'))
2138 f['height'] = int(mobj.group('height'))
2139 # Unified Streaming Platform
2140 mobj = re.search(
2141 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2142 if mobj:
2143 abr, vbr = mobj.groups()
2144 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2145 f.update({
2146 'vbr': vbr,
2147 'abr': abr,
2148 })
2149 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2150 f.update(codecs)
2151 audio_group_id = last_stream_inf.get('AUDIO')
2152 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2153 # references a rendition group MUST have a CODECS attribute.
62b58c09 2154 # However, this is not always respected. E.g. [2]
310c2ed2 2155 # contains EXT-X-STREAM-INF tag which references AUDIO
2156 # rendition group but does not have CODECS and despite
2157 # referencing an audio group it represents a complete
2158 # (with audio and video) format. So, for such cases we will
2159 # ignore references to rendition groups and treat them
2160 # as complete formats.
2161 if audio_group_id and codecs and f.get('vcodec') != 'none':
2162 audio_group = groups.get(audio_group_id)
2163 if audio_group and audio_group[0].get('URI'):
2164 # TODO: update acodec for audio only formats with
2165 # the same GROUP-ID
2166 f['acodec'] = 'none'
fc21af50 2167 if not f.get('ext'):
2168 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2169 formats.append(f)
2170
2171 # for DailyMotion
2172 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2173 if progressive_uri:
2174 http_f = f.copy()
2175 del http_f['manifest_url']
2176 http_f.update({
2177 'format_id': f['format_id'].replace('hls-', 'http-'),
2178 'protocol': 'http',
2179 'url': progressive_uri,
2180 })
2181 formats.append(http_f)
5ef62fc4 2182
cb252080 2183 last_stream_inf = {}
a0c3b2d5 2184 return formats, subtitles
704df56d 2185
3cf4b91d
C
def _extract_m3u8_vod_duration(
        self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """
    Download an m3u8 playlist and return the duration computed from it.

    The download is non-fatal. Whatever was fetched (or an empty string when
    nothing usable came back) is handed to _parse_m3u8_vod_duration and its
    result is returned unchanged.
    """
    if note is None:
        note = 'Downloading m3u8 VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    playlist = self._download_webpage(
        m3u8_vod_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if not playlist:
        # A failed download yields a falsy value; the parser expects a string
        playlist = ''
    return self._parse_m3u8_vod_duration(playlist, video_id)
2196
def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
    """
    Return the total duration (whole seconds) of an m3u8 VOD playlist.

    Only playlists containing '#EXT-X-ENDLIST' are considered; for anything
    else None is returned. The duration is the sum of all '#EXTINF:' values,
    truncated to an int; a total of zero is reported as None.
    """
    if '#EXT-X-ENDLIST' not in m3u8_vod:
        return None

    marker = '#EXTINF:'
    total = 0
    for entry in m3u8_vod.splitlines():
        if not entry.startswith(marker):
            continue
        # '#EXTINF:<duration>,<title>' - only the part before the first comma matters
        seconds, _, _ = entry[len(marker):].partition(',')
        total += float(seconds)
    return int(total) or None
2204
def _extract_mpd_vod_duration(
        self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """
    Download an MPD manifest and return its 'mediaPresentationDuration'.

    The download is non-fatal. Returns None when no XML element could be
    obtained; otherwise the root element's 'mediaPresentationDuration'
    attribute is passed through parse_duration and int_or_none.
    """
    mpd_doc = self._download_xml(
        mpd_url, video_id,
        note='Downloading MPD VOD manifest' if note is None else note,
        errnote='Failed to download VOD manifest' if errnote is None else errnote,
        fatal=False, data=data, headers=headers, query=query)
    # Check the type explicitly instead of relying on truthiness: an Element
    # without children evaluates as false, which would silently discard a
    # manifest whose root carries the duration attribute but has no children
    if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
        return None
    return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2214
a107193e
S
@staticmethod
def _xpath_ns(path, namespace=None):
    """
    Qualify every step of a '/'-separated path with an XML namespace.

    Empty steps and '.' are left untouched; every other step becomes
    '{namespace}step'. Without a namespace the path is returned as is.
    """
    if not namespace:
        return path
    return '/'.join(
        step if not step or step == '.' else '{%s}%s' % (namespace, step)
        for step in path.split('/'))
2226
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """
    Download a SMIL document and return a (formats, subtitles) tuple.

    The download is made non-fatal when the 'ignore_no_formats_error' param
    is set. If the download yields False, ([], {}) is returned. Parsing is
    done against the URL reported by the response handle, not the URL
    originally requested.
    """
    fatal = False if self.get_param('ignore_no_formats_error') else fatal

    downloaded = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if downloaded is False:
        assert not fatal
        return [], {}

    smil_doc, handle = downloaded
    final_url = handle.geturl()
    ns = self._parse_smil_namespace(smil_doc)

    formats = self._parse_smil_formats(
        smil_doc, final_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil_doc, namespace=ns)
    return formats, subtitles
2247
def _extract_smil_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_smil_formats_and_subtitles.

    Accepts the same arguments. Any subtitles found are dropped after
    reporting that fact via _report_ignoring_subs('SMIL').
    """
    formats, subtitles = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if not subtitles:
        return formats
    self._report_ignoring_subs('SMIL')
    return formats
a107193e
S
2253
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """
    Download a SMIL document and return the info dict built by _parse_smil.

    Returns an empty dict when the download yields False. The document is
    parsed against the URL reported by the response handle.
    """
    downloaded = self._download_smil(smil_url, video_id, fatal=fatal)
    if downloaded is False:
        return {}

    smil_doc, handle = downloaded
    return self._parse_smil(smil_doc, handle.geturl(), video_id, f4m_params=f4m_params)
2263
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """
    Fetch a SMIL document through _download_xml_handle.

    Uses fixed SMIL-specific progress and error notes and returns whatever
    _download_xml_handle returns.
    """
    note, errnote = 'Downloading SMIL file', 'Unable to download SMIL file'
    return self._download_xml_handle(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
a107193e
S
2268
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document.

    The returned 'id' is the basename of smil_url without its extension; the
    video_id argument is only forwarded to _parse_smil_formats. Title,
    description and upload date are taken from ./head/meta elements that have
    both a name and a content attribute; once a field has a value, later
    elements do not override it. Thumbnails are built from every image
    element that has a src attribute.
    """
    ns = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=ns)

    video_id, _ = os.path.splitext(url_basename(smil_url))

    title = description = upload_date = None
    for meta_el in smil.findall(self._xpath_ns('./head/meta', ns)):
        key = meta_el.attrib.get('name')
        value = meta_el.attrib.get('content')
        if not (key and value):
            continue
        if key == 'title' and not title:
            title = value
        elif key in ('description', 'abstract') and not description:
            description = value
        elif key == 'date' and not upload_date:
            upload_date = unified_strdate(value)

    thumbnails = []
    for image_el in smil.findall(self._xpath_ns('.//image', ns)):
        if not image_el.get('src'):
            continue
        thumbnails.append({
            'id': image_el.get('type'),
            'url': image_el.get('src'),
            'width': int_or_none(image_el.get('width')),
            'height': int_or_none(image_el.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2308
17712eeb
S
def _parse_smil_namespace(self, smil):
    """
    Return the XML namespace of the root 'smil' element.

    The root tag is matched case-insensitively against '{namespace}smil';
    None is the default when it does not match.
    """
    root_tag_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(root_tag_re, smil.tag, 'namespace', default=None)
2312
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Extract formats from a parsed SMIL document.

    @param smil                parsed SMIL root element
    @param smil_url            URL of the SMIL document; used as the base URL
                               unless a ./head/meta element provides a 'base'
                               or 'httpBase' attribute
    @param video_id            identifier passed on to the download helpers
    @param namespace           XML namespace applied to all element lookups
    @param f4m_params          query parameters appended to f4m manifest URLs;
                               a flowplayer default is used when empty
    @param transform_rtmp_url  optional callable taking (streamer, src) and
                               returning the (url, play_path) pair to store
                               for RTMP formats
    @returns                   list of format dicts

    Each distinct src of the video/audio elements is handled once and is
    dispatched to RTMP, HLS, HDS, DASH, ISM or plain HTTP handling. Each
    distinct src of the imagestream elements is then added as a storyboard
    format.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        # The HEAD request is only issued when neither the src itself nor
        # the 'ext' attribute reveals an extension
        src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
            self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A single-variant playlist carries no metadata of its own,
            # so the values from the SMIL element are applied to it
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Validate the resolved URL that is actually stored in the format,
        # not the raw (possibly relative or whitespace-padded) src attribute
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats
2427
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """
    Collect subtitles from the textstream elements of a SMIL document.

    Returns a dict mapping language to a list of {'url', 'ext'} dicts. Each
    distinct src is used once. The language is the first non-empty value of
    the systemLanguage, systemLanguageName and lang attributes, falling back
    to subtitles_lang.
    """
    seen_srcs = set()
    subtitles = {}
    for stream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = stream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)
        track = {
            'url': src,
            'ext': stream.get('ext') or mimetype2ext(stream.get('type')) or determine_ext(src),
        }
        lang = (
            stream.get('systemLanguage')
            or stream.get('systemLanguageName')
            or stream.get('lang')
            or subtitles_lang)
        subtitles.setdefault(lang, []).append(track)
    return subtitles
63757032 2443
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """
    Download an XSPF playlist and return the entries parsed from it.

    Returns an empty list when the download yields False. The playlist is
    parsed against the URL reported by the response handle, which is also
    used to derive the base URL for resolving track locations.
    """
    res = self._download_xml_handle(
        xspf_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if res is False:
        return []

    xspf, urlh = res
    xspf_url = urlh.geturl()

    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))
8d6765cf 2457
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """
    Turn the tracks of a parsed XSPF document into a list of entry dicts.

    Every track produces one entry whose 'id' is playlist_id and whose title
    defaults to playlist_id. Each location of a track that resolves against
    xspf_base_url becomes a format of that entry.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the 'xspf:' / 's1:' prefixes used in the paths below
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        entry = {
            'id': playlist_id,
            'title': xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id),
            'description': xpath_text(track, ns('./xspf:annotation'), 'description'),
            'thumbnail': xpath_text(track, ns('./xspf:image'), 'thumbnail'),
            'duration': float_or_none(
                xpath_text(track, ns('./xspf:duration'), 'duration'), 1000),
        }

        track_formats = []
        for location in track.findall(ns('./xspf:location')):
            resolved_url = urljoin(xspf_base_url, location.text)
            if resolved_url:
                track_formats.append({
                    'url': resolved_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })

        entry['formats'] = track_formats
        entries.append(entry)
    return entries
2497
171e59ed
F
def _extract_mpd_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_mpd_formats_and_subtitles.

    Accepts the same arguments. Any subtitles found are dropped after
    reporting that fact via _report_ignoring_subs('DASH').
    """
    formats, subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if not subtitles:
        return formats
    self._report_ignoring_subs('DASH')
    return formats
2503
def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """
    Download an MPD manifest and return a (formats, subtitles) tuple.

    The download is made non-fatal when the 'ignore_no_formats_error' param
    is set. ([], {}) is returned when the download yields False or the
    parsed document is None.
    """
    fatal = False if self.get_param('ignore_no_formats_error') else fatal
    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    downloaded = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if downloaded is False:
        return [], {}
    manifest, handle = downloaded
    if manifest is None:
        return [], {}

    # The request may have been redirected, so derive the base URL from
    # the URL the manifest was actually served from
    final_url = handle.geturl()
    return self._parse_mpd_formats_and_subtitles(
        manifest, mpd_id, base_url(final_url), final_url)
2d2fa82d 2528
171e59ed
F
def _parse_mpd_formats(self, *args, **kwargs):
    """
    Formats-only variant of _parse_mpd_formats_and_subtitles.

    Accepts the same arguments. Any subtitles found are dropped after
    reporting that fact via _report_ignoring_subs('DASH').
    """
    formats, subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if not subtitles:
        return formats
    self._report_ignoring_subs('DASH')
    return formats
2534
def _parse_mpd_formats_and_subtitles(
        self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.

    @param mpd_doc       parsed MPD root element
    @param mpd_id        optional prefix for the generated format ids
    @param mpd_base_url  base URL that relative BaseURL values are resolved against
    @param mpd_url       URL of the manifest; stored as 'manifest_url' and used
                         as the 'url' of fragmented formats when available
    @returns             (formats, subtitles) tuple; ([], {}) for dynamic
                         manifests when the 'dynamic_mpd' param is disabled

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            return [], {}

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Segment information is inherited: start from a copy of the parent's
        # info and override it with whatever this element declares
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats, subtitles = [], {}
    # Counts how many formats share one URL, to number the streams of a manifest
    stream_numbers = collections.defaultdict(int)
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Representation attributes take precedence over AdaptationSet ones
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Assemble the base URL from the innermost element outwards,
                # stopping as soon as it becomes absolute
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First of, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    t = ''
                    in_template = False
                    for c in tmpl:
                        t += c
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            t += c
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally, unescape $$ to a literal $. str.replace returns a
                    # new string, so the result has to be assigned back
                    t = t.replace('$$', '$')
                    return t

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    # Absolute locations are stored as 'url', relative ones as 'path'
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for num, s in enumerate(representation_ms_info['s']):
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for r in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for r in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    formats.append(f)
                elif content_type == 'text':
                    subtitles.setdefault(lang or 'und', []).append(f)

    return formats, subtitles
17b598d3 2872
fd76a142
F
def _extract_ism_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_ism_formats_and_subtitles.

    Accepts the same arguments. Any subtitles found are dropped after
    reporting that fact via _report_ignoring_subs('ISM').
    """
    formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if not subtitles:
        return formats
    self._report_ignoring_subs('ISM')
    return formats
2878
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """
    Download an ISM manifest and return a (formats, subtitles) tuple.

    The download is made non-fatal when the 'ignore_no_formats_error' param
    is set. ([], {}) is returned when the download yields False or the
    parsed document is None. The manifest is parsed against the URL reported
    by the response handle.
    """
    fatal = False if self.get_param('ignore_no_formats_error') else fatal
    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    downloaded = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if downloaded is False:
        return [], {}
    manifest, handle = downloaded
    if manifest is None:
        return [], {}

    return self._parse_ism_formats_and_subtitles(manifest, handle.geturl(), ism_id)
b2758123 2895
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats and subtitles from ISM manifest.

    ism_doc is the parsed manifest root element, ism_url the manifest URL
    that track URL patterns are resolved against, and ism_id an optional
    prefix for the generated format ids.
    Returns a (formats, subtitles) tuple; both are empty for manifests
    flagged IsLive="TRUE".

    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
    KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        # The <c> timeline is shared by every QualityLevel of the stream
        stream_fragments = stream.findall('c')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time of
                    # the NEXT fragment in the list. Indexing the current
                    # element here (as was done before) always raised
                    # IndexError, so the whole manifest duration was used.
                    try:
                        next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                    except (IndexError, KeyError):
                        # Last fragment, or the next one carries no 't'
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    'language': stream_language,
                    'audio_channels': int_or_none(track.get('Channels')),
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
b2758123 3008
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from <video>/<audio> tags found in webpage.

    Each entry is a dict with 'formats', 'subtitles' and 'thumbnail'.
    Sources come from the tag itself and from nested <source> tags,
    subtitles from nested <track> tags. Relative URLs are resolved
    against base_url, which is also set as the Referer header of every
    format. Tags that yield neither formats nor subtitles are dropped.
    """
    SRC_ATTRIBUTES = ('src', 'data-video-src', 'data-src', 'data-source')

    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Split a "type" attribute into codec info plus the implied extension
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not ctr:
            return {}
        mimetype, codecs = ctr.groups()
        type_info = parse_codecs(codecs)
        type_info['ext'] = mimetype2ext(mimetype)
        return type_info

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats); manifests are expanded, anything
        # else becomes a single direct-URL format
        full_url = absolute_url(src)
        ext = (type_info or {}).get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            return False, self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        if ext == 'mpd':
            return False, self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        return True, [{
            'url': full_url,
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            'ext': ext,
        }]

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags have no inner content
    media_tags = [
        (media_tag, media_tag_name, media_type, '')
        for media_tag, media_tag_name, media_type
        in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))

    for media_tag, _, media_type, media_content in media_tags:
        media_formats = []
        media_subtitles = {}
        media_info = {
            'formats': media_formats,
            'subtitles': media_subtitles,
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(dict_get(media_attributes, SRC_ATTRIBUTES))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            media_formats.extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, SRC_ATTRIBUTES))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if not is_plain_url:
                    media_formats.extend(formats)
                    continue

                # width, height, res, label and title attributes are
                # all not standard but seen several times in the wild
                labels = [
                    s_attr.get(lbl)
                    for lbl in ('label', 'title')
                    if str_or_none(s_attr.get(lbl))
                ]
                width = int_or_none(s_attr.get('width'))
                height = (int_or_none(s_attr.get('height'))
                          or int_or_none(s_attr.get('res')))
                if not width or not height:
                    for lbl in labels:
                        resolution = parse_resolution(lbl)
                        if not resolution:
                            continue
                        width = width or resolution.get('width')
                        height = height or resolution.get('height')
                # First label that parses to a non-zero bitrate wins
                tbr = next(filter(None, map(parse_bitrate, labels)), None)
                f.update({
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'format_id': s_attr.get('label') or s_attr.get('title'),
                })
                f.update(formats[0])
                media_formats.append(f)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if kind and kind not in ('subtitles', 'captions'):
                    continue
                src = strip_or_none(track_attributes.get('src'))
                if not src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                media_subtitles.setdefault(lang, []).append({
                    'url': absolute_url(src),
                })

        for f in media_formats:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if media_formats or media_subtitles:
            entries.append(media_info)
    return entries
3134
f6a1d69a
F
3135 def _extract_akamai_formats(self, *args, **kwargs):
3136 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3137 if subs:
b5ae35ee 3138 self._report_ignoring_subs('akamai')
f6a1d69a
F
3139 return fmts
3140
3141 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3142 signed = 'hdnea=' in manifest_url
3143 if not signed:
3144 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3145 manifest_url = re.sub(
3146 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3147 '', manifest_url).strip('?')
3148
c7c43a93 3149 formats = []
f6a1d69a 3150 subtitles = {}
70c5802b 3151
e71a4509 3152 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3153 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3154 hds_host = hosts.get('hds')
3155 if hds_host:
3156 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3157 if 'hdcore=' not in f4m_url:
3158 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3159 f4m_formats = self._extract_f4m_formats(
3160 f4m_url, video_id, f4m_id='hds', fatal=False)
3161 for entry in f4m_formats:
3162 entry.update({'extra_param_to_segment_url': hdcore_sign})
3163 formats.extend(f4m_formats)
70c5802b 3164
c4251b9a
RA
3165 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3166 hls_host = hosts.get('hls')
3167 if hls_host:
3168 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3169 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3170 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3171 m3u8_id='hls', fatal=False)
3172 formats.extend(m3u8_formats)
f6a1d69a 3173 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3174
3175 http_host = hosts.get('http')
29f7c58a 3176 if http_host and m3u8_formats and not signed:
3177 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3178 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3179 qualities_length = len(qualities)
29f7c58a 3180 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3181 i = 0
29f7c58a 3182 for f in m3u8_formats:
3183 if f['vcodec'] != 'none':
70c5802b 3184 for protocol in ('http', 'https'):
3185 http_f = f.copy()
3186 del http_f['manifest_url']
3187 http_url = re.sub(
86e5f3ed 3188 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3189 http_f.update({
3190 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3191 'url': http_url,
3192 'protocol': protocol,
3193 })
29f7c58a 3194 formats.append(http_f)
70c5802b 3195 i += 1
70c5802b 3196
f6a1d69a 3197 return formats, subtitles
c7c43a93 3198
6ad02195 3199 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3200 query = urllib.parse.urlparse(url).query
6ad02195 3201 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3202 mobj = re.search(
3203 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3204 url_base = mobj.group('url')
3205 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3206 formats = []
044eeb14
S
3207
3208 def manifest_url(manifest):
86e5f3ed 3209 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3210 if query:
3211 m_url += '?%s' % query
3212 return m_url
3213
6ad02195
RA
3214 if 'm3u8' not in skip_protocols:
3215 formats.extend(self._extract_m3u8_formats(
044eeb14 3216 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3217 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3218 if 'f4m' not in skip_protocols:
3219 formats.extend(self._extract_f4m_formats(
044eeb14 3220 manifest_url('manifest.f4m'),
6ad02195 3221 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3222 if 'dash' not in skip_protocols:
3223 formats.extend(self._extract_mpd_formats(
044eeb14 3224 manifest_url('manifest.mpd'),
0384932e 3225 video_id, mpd_id='dash', fatal=False))
6ad02195 3226 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3227 if 'smil' not in skip_protocols:
3228 rtmp_formats = self._extract_smil_formats(
044eeb14 3229 manifest_url('jwplayer.smil'),
6ad02195
RA
3230 video_id, fatal=False)
3231 for rtmp_format in rtmp_formats:
3232 rtsp_format = rtmp_format.copy()
3233 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3234 del rtsp_format['play_path']
3235 del rtsp_format['ext']
3236 rtsp_format.update({
3237 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3238 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3239 'protocol': 'rtsp',
3240 })
3241 formats.extend([rtmp_format, rtsp_format])
3242 else:
3243 for protocol in ('rtmp', 'rtsp'):
3244 if protocol not in skip_protocols:
3245 formats.append({
86e5f3ed 3246 'url': f'{protocol}:{url_base}',
6ad02195
RA
3247 'format_id': protocol,
3248 'protocol': protocol,
3249 })
3250 return formats
3251
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate a jwplayer(...).setup(...) call in webpage and parse its options.

    Returns the parsed options when they form a dict. Returns None when
    no setup call is found, parsing fails with ExtractorError, or the
    parsed value is not a dict.
    """
    setup_match = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if not setup_match:
        return None
    try:
        jwplayer_data = self._parse_json(
            setup_match.group('options'),
            video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        return None
    if isinstance(jwplayer_data, dict):
        return jwplayer_data
    return None
a4a554a7
YCH
3266
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in webpage and parse it.

    Extra positional and keyword arguments are forwarded to
    _parse_jwplayer_data.
    """
    found = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(found, video_id, *args, **kwargs)
3272
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer setup data into an info dict or a playlist result.

    Returns a single entry when exactly one playlist item is usable,
    otherwise the result of playlist_result(); an empty list is returned
    when jwplayer_data is not a dict. When video_id is not given, each
    item's 'mediaid' is used. With require_title, a missing 'title'
    raises KeyError. The passed-in data is not modified.
    """
    entries = []
    if not isinstance(jwplayer_data, dict):
        return entries

    playlist_items = jwplayer_data.get('playlist')
    # JWPlayer backward compatibility: single playlist item/flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if not isinstance(playlist_items, list):
        playlist_items = (playlist_items or jwplayer_data, )

    for video_data in playlist_items:
        if not isinstance(video_data, dict):
            continue
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        # Use a local list rather than writing 'sources' back into the
        # caller's dict, which would make it reference itself
        sources = video_data['sources'] if 'sources' in video_data else [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            sources, video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                if not isinstance(track, dict):
                    continue
                track_kind = track.get('kind')
                if not track_kind or not isinstance(track_kind, str):
                    continue
                if track_kind.lower() not in ('captions', 'subtitles'):
                    continue
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                })

        entry = {
            'id': this_video_id,
            'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
            'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
            'genre': clean_html(video_data.get('genre')),
            'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
            'release_year': int_or_none(video_data.get('releasedate')),
            'age_limit': int_or_none(video_data.get('age_restriction')),
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
            # A lone YouTube URL is delegated to the matching extractor
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            entry['formats'] = formats
        entries.append(entry)
    if len(entries) == 1:
        return entries[0]
    else:
        return self.playlist_result(entries)
3347
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert a list of JWPlayer source dicts into formats.

    Non-dict sources, sources without a usable 'file' URL and repeated
    URLs are skipped. HLS, DASH and SMIL sources are expanded through
    the corresponding manifest extractors (non-fatally); audio sources
    become audio-only formats; everything else becomes a direct format,
    with RTMP URLs split into URL and play path.
    """
    AUDIO_EXTS = ('oga', 'aac', 'mp3', 'mpeg', 'vorbis')
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in seen_urls:
            continue
        seen_urls.add(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in AUDIO_EXTS:
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        format_id = str_or_none(source.get('label'))
        height = int_or_none(source.get('height'))
        if height is None and format_id:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            height = parse_resolution(format_id).get('height')
        a_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate'), scale=1000),
            'filesize': int_or_none(source.get('filesize')),
            'ext': ext,
            'format_id': format_id
        }
        if source_url.startswith('rtmp'):
            a_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            rtmp_url_parts = re.split(
                r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
            if len(rtmp_url_parts) == 3:
                rtmp_url, prefix, play_path = rtmp_url_parts
                a_format.update({
                    'url': rtmp_url,
                    'play_path': prefix + play_path,
                })
            if rtmp_params:
                a_format.update(rtmp_params)
        formats.append(a_format)
    return formats
3412
f4b1c7ad 3413 def _live_title(self, name):
39ca3b5c 3414 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3415 return name
f4b1c7ad 3416
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none, reporting a failed conversion.

    When the result is None, an ExtractorError is raised if fatal is set;
    otherwise a warning is reported and None is returned. Extra keyword
    arguments are passed to int_or_none.
    """
    res = int_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return res
3426
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none, reporting a failed conversion.

    When the result is None, an ExtractorError is raised if fatal is set;
    otherwise a warning is reported and None is returned. Extra keyword
    arguments are passed to float_or_none.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return res
3436
40e41780
TF
3437 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3438 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3439 cookie = http.cookiejar.Cookie(
4ed2d7b7 3440 0, name, value, port, port is not None, domain, True,
40e41780
TF
3441 domain.startswith('.'), path, True, secure, expire_time,
3442 discard, None, None, rest)
9809740b 3443 self.cookiejar.set_cookie(cookie)
42939b61 3444
def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    cookie_header = self._downloader.cookiejar.get_cookie_header(url)
    return LenientSimpleCookie(cookie_header)
799207e8 3448
e3c1266f 3449 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3450 """
3451 Apply first Set-Cookie header instead of the last. Experimental.
3452
3453 Some sites (e.g. [1-3]) may serve two cookies under the same name
3454 in Set-Cookie header and expect the first (old) one to be set rather
3455 than second (new). However, as of RFC6265 the newer one cookie
3456 should be set into cookie store what actually happens.
3457 We will workaround this issue by resetting the cookie to
3458 the first one manually.
3459 1. https://new.vk.com/
3460 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3461 3. https://learning.oreilly.com/
3462 """
e3c1266f
S
3463 for header, cookies in url_handle.headers.items():
3464 if header.lower() != 'set-cookie':
3465 continue
cfb0511d 3466 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3467 cookie_value = re.search(
3468 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3469 if cookie_value:
3470 value, domain = cookie_value.groups()
3471 self._set_cookie(domain, cookie, value)
3472 break
3473
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the testcases declared directly on this class.

    A lone _TEST takes the place of _TESTS. Each yielded test gets its
    'name' set to the extractor key. Tests flagged 'only_matching' are
    skipped unless include_onlymatching is set. Testcases of a
    __wrapped__ class, if present, are yielded afterwards.
    """
    # Do not look in super classes
    own_attrs = vars(cls)
    single_test = own_attrs.get('_TEST')
    if single_test:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        tests = [single_test]
    else:
        tests = own_attrs.get('_TESTS', [])

    for test in tests:
        if t_only_matching := (not include_onlymatching and test.get('only_matching', False)):
            continue
        test['name'] = cls.ie_key()
        yield test

    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from wrapped.get_testcases(include_onlymatching)
05900629 3490
@classmethod
def get_webpage_testcases(cls):
    """Yield the _WEBPAGE_TESTS declared directly on this class.

    Each yielded test gets its 'name' set to the extractor key.
    Webpage testcases of a __wrapped__ class, if present, follow.
    """
    for test in vars(cls).get('_WEBPAGE_TESTS', []):
        test['name'] = cls.ie_key()
        yield test

    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from wrapped.get_webpage_testcases()
f2e8dbcc 3499
@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases"""
    testcases = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # Look at each test's own info_dict and at that of its first playlist entry
    age_limits = traverse_obj(
        testcases, (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
    return max(age_limits or [0])
3506
@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None
    # A test describes a playlist when any of its keys starts with "playlist"
    is_playlist_test = [
        any(key.startswith('playlist') for key in test) for test in tests]
    if not any(is_playlist_test):
        return 'video'
    if all(is_playlist_test):
        return 'playlist'
    return 'any'
3518
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    if not cls.suitable(url):
        return None
    if cls._RETURN_TYPE == 'video':
        return True
    if cls._RETURN_TYPE == 'playlist':
        return False
    return None
171a31db 3524
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
05900629 3529
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    parts = []
    if cls._NETRC_MACHINE:
        parts.append(
            f' [*{cls._NETRC_MACHINE}*](## "netrc machine")' if markdown
            else f' [{cls._NETRC_MACHINE}]')
    if cls.IE_DESC is False:
        parts.append(' [HIDDEN]')
    elif cls.IE_DESC:
        parts.append(f' {cls.IE_DESC}')
    if cls.SEARCH_KEY:
        parts.append(f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            parts.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
    if not cls.working():
        parts.append(' (**Currently broken**)' if markdown else ' (Currently broken)')
    desc = ''.join(parts)

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME
    return f'{name}:{desc}' if desc else name
3554
def extract_subtitles(self, *args, **kwargs):
    """Return the result of _get_subtitles, or {} when subtitles are not wanted.

    Subtitles are wanted when either the 'writesubtitles' or the
    'listsubtitles' param is set.
    """
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3560
3561 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3562 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3563
0cf643b2
M
class CommentsDisabled(Exception):
    """Raise in _get_comments to signal that comments are disabled for the video"""
3566
def extract_comments(self, *args, **kwargs):
    """Return a callable that collects the comments, or None if not requested.

    Nothing is returned unless the 'getcomments' param is set. The
    returned callable drains the _get_comments generator and returns a
    dict with 'comments' and 'comment_count'. The count is None when
    collection did not finish normally, and both values are None when
    CommentsDisabled is raised. Other errors are re-raised unless
    'ignoreerrors' is True, in which case they are reported.
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        # Only cleared when the generator is exhausted
        interrupted = True
        try:
            while True:
                collected.append(next(generator))
        except StopIteration:
            interrupted = False
        except KeyboardInterrupt:
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)

        comment_count = len(collected)
        self.to_screen(f'Extracted {comment_count} comments')
        result = {'comments': collected}
        result['comment_count'] = None if interrupted else comment_count
        return result

    return extractor
3595
3596 def _get_comments(self, *args, **kwargs):
3597 raise NotImplementedError('This method must be implemented by subclasses')
3598
912e0b7e
YCH
3599 @staticmethod
3600 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3601 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3602 will be dropped. """
86e5f3ed 3603 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3604 ret = list(subtitle_list1)
a44ca5a4 3605 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3606 return ret
3607
@classmethod
def _merge_subtitles(cls, *dicts, target=None):
    """ Merge subtitle dictionaries, language by language.

    The dictionaries are merged into target (a new dict when omitted),
    which is also returned. """
    merged = {} if target is None else target
    for subtitle_dict in dicts:
        for lang, items in subtitle_dict.items():
            existing = merged.get(lang, [])
            merged[lang] = cls._merge_subtitle_items(existing, items)
    return merged
912e0b7e 3617
def extract_automatic_captions(self, *args, **kwargs):
    """Return the result of _get_automatic_captions, or {} when not wanted.

    Automatic captions are wanted when either the 'writeautomaticsub' or
    the 'listsubtitles' param is set.
    """
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3623
3624 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3625 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3626
2762dbb1 3627 @functools.cached_property
24146491 3628 def _cookies_passed(self):
3629 """Whether cookies have been passed to YoutubeDL"""
3630 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3631
def mark_watched(self, *args, **kwargs):
    """Call _mark_watched when the 'mark_watched' param is set and credentials exist.

    Credentials exist when login is supported and the first element of
    the login info is not None, or when cookies were passed.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3637
3638 def _mark_watched(self, *args, **kwargs):
3639 raise NotImplementedError('This method must be implemented by subclasses')
3640
38cce791
YCH
def geo_verification_headers(self):
    """Return the headers carrying the 'geo_verification_proxy' param.

    The result is {'Ytdl-request-proxy': <proxy>} when the param is set
    and an empty dict otherwise.
    """
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3647
8f97a15d 3648 @staticmethod
3649 def _generic_id(url):
14f25df2 3650 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3651
def _generic_title(self, url='', webpage='', *, default=None):
    """Pick a title, trying progressively weaker sources.

    The first truthy value among these is returned:
      1. _og_search_title() on the webpage
      2. _html_extract_title() on the webpage
      3. the percent-decoded basename of the URL, without extension
      4. "default"
    """
    og_title = self._og_search_title(webpage, default=None)
    if og_title:
        return og_title

    html_title = self._html_extract_title(webpage, default=None)
    if html_title:
        return html_title

    url_title = urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
    return url_title or default
98763ee3 3657
22ccd542 3658 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3659 if not duration:
3660 return
3661 chapter_list = [{
3662 'start_time': start_function(chapter),
3663 'title': title_function(chapter),
3664 } for chapter in chapter_list or []]
84ffeb7d 3665 if strict:
3666 warn = self.report_warning
3667 else:
3668 warn = self.write_debug
22ccd542 3669 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3670
3671 chapters = [{'start_time': 0}]
3672 for idx, chapter in enumerate(chapter_list):
3673 if chapter['start_time'] is None:
84ffeb7d 3674 warn(f'Incomplete chapter {idx}')
22ccd542 3675 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3676 chapters.append(chapter)
3677 elif chapter not in chapters:
84ffeb7d 3678 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3679 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3680 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
22ccd542 3681 return chapters[1:]
3682
def _extract_chapters_from_description(self, description, duration):
    """Parse chapters out of the lines of a free-text description.

    Two line layouts are tried, in this order:
      1. "<timestamp> <title>"
      2. "<title> <timestamp>"
    The second is only attempted when the first yields no chapters.
    Both passes use the non-strict mode of _extract_chapters_helper().
    """
    duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
    sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
    text = description or ''

    timestamp_first = self._extract_chapters_helper(
        re.findall(sep_re % (duration_re, r'.+?'), text),
        start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
        duration=duration, strict=False)
    if timestamp_first:
        return timestamp_first

    return self._extract_chapters_helper(
        re.findall(sep_re % (r'.+?', duration_re), text),
        start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
        duration=duration, strict=False)
3693
c224251a 3694 @staticmethod
b0089e89 3695 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3696 all_known = all(map(
3697 lambda x: x is not None,
3698 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3699 return (
3700 'private' if is_private
3701 else 'premium_only' if needs_premium
3702 else 'subscriber_only' if needs_subscription
3703 else 'needs_auth' if needs_auth
3704 else 'unlisted' if is_unlisted
3705 else 'public' if all_known
3706 else None)
3707
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Either an IE key string, or an object whose ie_key() is called.
                        When falsy, the ie_key() of this extractor is used
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()

    # The IE key is lower-cased for the lookup in 'extractor_args'
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
5d3a0e79 3720
f40ee5e9 3721 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3722 if not playlist_id or not video_id:
3723 return not video_id
3724
3725 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3726 if no_playlist is not None:
3727 return not no_playlist
3728
3729 video_id = '' if video_id is True else f' {video_id}'
3730 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3731 if self.get_param('noplaylist'):
3732 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3733 return False
3734 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3735 return True
3736
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Report "err" through RetryManager.report_retry.

    @param _count    Attempt number; when falsy, int(fatal) is used instead
    @param _retries  Number of retries, passed through unchanged
    @param fatal     When true, no "error" callback is supplied (None);
                     otherwise report_warning is used as the error callback
    """
    attempt = _count or int(fatal)
    on_error = None if fatal else self.report_warning
    sleep_func = self.get_param('retry_sleep_functions', {}).get('extractor')
    # "RetryManager" here is the module-level name, not the method of this class
    RetryManager.report_retry(
        err, attempt, _retries,
        info=self.to_screen, warn=self.report_warning, error=on_error,
        sleep_func=sleep_func)
be5c1ae8 3742
def RetryManager(self, **kwargs):
    """Create a RetryManager that reports through _error_or_warning().

    The number of retries comes from the 'extractor_retries' param
    (3 when unset); any keyword arguments are passed on unchanged.
    """
    retries = self.get_param('extractor_retries', 3)
    # Inside a method body this name resolves to the module-level RetryManager
    return RetryManager(retries, self._error_or_warning, **kwargs)
3745
def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Hand the URL to the 'Generic' extractor's _extract_embeds().

    @param info_dict  Only read: its 'display_id' (or else 'id') is used
                      to prefix the message printed to screen
    @param note       Message printed to screen before extracting
    Remaining positional/keyword arguments are passed to _extract_embeds().
    The URL is smuggled with 'block_ies' set to this extractor's IE key.
    """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    prefix = format_field(display_id, None, "%s: ")
    self.to_screen(f'{prefix}{note}')

    generic_ie = self._downloader.get_info_extractor('Generic')
    smuggled_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return generic_ie._extract_embeds(smuggled_url, *args, **kwargs)
3751
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of _extract_from_webpage(), with default extra info added.

    When _extract_from_webpage is a bound method on the class itself
    (e.g. a classmethod), the class is used directly; otherwise an
    extractor instance is requested from "ydl".
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())

    for info in ie._extract_from_webpage(url, webpage) or []:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
8f97a15d 3760
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for every distinct embed URL found on the webpage.

    The result is tied to this extractor class, unless _VALID_URL is
    False, in which case no extractor is specified.
    """
    embed_urls = orderedSet(cls._extract_embed_urls(url, webpage) or [], lazy=True)
    for embed_url in embed_urls:
        result_ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, result_ie)
8f97a15d 3766
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile the patterns once per class. cls.__dict__ is checked (rather
    # than hasattr) so that a value inherited from a base class is not reused.
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(regex) for regex in cls._EMBED_REGEX)

    for pattern in cls._EMBED_URL_RE:
        for mobj in pattern.finditer(webpage):
            # Matches are HTML-unescaped and resolved against the page URL
            embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
            if cls._VALID_URL is not False and not cls.suitable(embed_url):
                continue
            yield embed_url
3782
class StopExtraction(Exception):
    """Plain Exception subclass; it adds no behavior of its own"""
3785
@classmethod
def _extract_url(cls, webpage):  # TODO: Remove
    """Only for compatibility with some older extractors

    @returns The first URL from _extract_embed_urls() (called without a
             base URL), or None when there is none
    """
    for embed_url in cls._extract_embed_urls(None, webpage) or []:
        return embed_url
    return None
3790
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Register "cls" as a plugin override when it is declared with plugin_name.

    For a subclass defined with a truthy plugin_name:
      - the next class in its MRO is stored as cls.__wrapped__
      - PLUGIN_NAME is set, and ie_key/IE_NAME are derived from that class
      - the chain of __wrapped__ attributes is followed to its end, and the
        class found there is replaced by "cls" in its defining module and
        gets "cls" appended to its _PLUGIN_OVERRIDES entry
    Without plugin_name, only the parent's __init_subclass__ runs.
    """
    if not plugin_name:
        return super().__init_subclass__(**kwargs)

    mro = inspect.getmro(cls)
    overridden = mro[mro.index(cls) + 1]
    cls.__wrapped__ = overridden
    cls.PLUGIN_NAME = plugin_name
    cls.ie_key = overridden.ie_key
    cls.IE_NAME = f'{overridden.IE_NAME}+{plugin_name}'

    # The overridden class may itself be a plugin override: walk down to
    # the class that does not wrap anything
    base_class = overridden
    while getattr(base_class, '__wrapped__', None):
        base_class = base_class.__wrapped__
    setattr(sys.modules[base_class.__module__], base_class.__name__, cls)
    _PLUGIN_OVERRIDES[base_class].append(cls)

    return super().__init_subclass__(**kwargs)
3804
8dbe9899 3805
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|<positive number>):{query}
    where an empty prefix requests a single result and "all" requests
    _MAX_RESULTS results.
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        # Built on access, from the _SEARCH_KEY of the concrete class
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the search URL into prefix and query and fetch that many results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            # Clamp the request to what the site can return
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice() needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        # Public read-only view of _SEARCH_KEY
        return cls._SEARCH_KEY
fe7866d0 3849
3850
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose pattern matches any URL and whose extraction always raises UnsupportedError"""

    # Matches every URL
    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        """Always raise UnsupportedError for the given URL"""
        raise UnsupportedError(url)
e756f45b
M
3858
3859
3860_PLUGIN_OVERRIDES = collections.defaultdict(list)