]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[extractor/youtube] Extract more metadata for comments (#7179)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
2314b4d8 8import inspect
cc16383f 9import itertools
3d3538e4 10import json
f8271158 11import math
4094b6e3 12import netrc
d6983cb4 13import os
773f291d 14import random
6929b41a 15import re
d6983cb4 16import sys
4094b6e3 17import time
8f97a15d 18import types
14f25df2 19import urllib.parse
ac668111 20import urllib.request
f8271158 21import xml.etree.ElementTree
d6983cb4 22
6929b41a 23from ..compat import functools # isort: split
14f25df2 24from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
8817a80d 25from ..cookies import LenientSimpleCookie
f8271158 26from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 27from ..utils import (
8f97a15d 28 IDENTITY,
f8271158 29 JSON_LD_RE,
30 NO_DEFAULT,
31 ExtractorError,
d0d74b71 32 FormatSorter,
f8271158 33 GeoRestrictedError,
34 GeoUtils,
cb73b846 35 HEADRequest,
b7c47b74 36 LenientJSONDecoder,
f8271158 37 RegexNotFoundError,
be5c1ae8 38 RetryManager,
f8271158 39 UnsupportedError,
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
82d02080 43 classproperty,
d6983cb4 44 clean_html,
d0d74b71 45 deprecation_warning,
70f0f5a8 46 determine_ext,
d493f15c 47 dict_get,
42676437 48 encode_data_uri,
9b9c5355 49 error_to_compat_str,
46b18f23 50 extract_attributes,
90137ca4 51 filter_dict,
97f4aecf 52 fix_xml_ampersands,
b14f3a4c 53 float_or_none,
b868936c 54 format_field,
31bb8d3f 55 int_or_none,
34921b43 56 join_nonempty,
a4a554a7 57 js_to_json,
46b18f23 58 mimetype2ext,
3158150c 59 network_exceptions,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
46b18f23
JH
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
46b18f23 67 sanitize_filename,
8f97a15d 68 sanitize_url,
b868936c 69 sanitized_Request,
ade1fa70 70 smuggle_url,
d493f15c 71 str_or_none,
ce5b9040 72 str_to_int,
f856816b 73 strip_or_none,
5d3a0e79 74 traverse_obj,
71df9b7f 75 truncate_string,
47046464 76 try_call,
ffa89477 77 try_get,
f38de77f 78 unescapeHTML,
647eab45 79 unified_strdate,
6b3a3098 80 unified_timestamp,
46b18f23 81 update_Request,
09d02ea4 82 update_url_query,
a107193e 83 url_basename,
bebef109 84 url_or_none,
7e68567e 85 urlhandle_detect_ext,
b868936c 86 urljoin,
6606817a 87 variadic,
a6571f10 88 xpath_element,
8d6765cf
S
89 xpath_text,
90 xpath_with_ns,
d6983cb4 91)
c342041f 92
d6983cb4 93
86e5f3ed 94class InfoExtractor:
d6983cb4
PH
95 """Information Extractor class.
96
97 Information extractors are the classes that, given a URL, extract
98 information about the video (or videos) the URL refers to. This
99 information includes the real video URL, the video title, author and
100 others. The information is stored in a dictionary which is then
5d380852 101 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
102 information possibly downloading the video to the file system, among
103 other possible outcomes.
104
cf0649f8 105 The type field determines the type of the result.
fed5d032
PH
106 By far the most common value (and the default if _type is missing) is
107 "video", which indicates a single video.
108
109 For a video, the dictionaries must include the following fields:
d6983cb4
PH
110
111 id: Video identifier.
d4736fdb 112 title: Video title, unescaped. Set to an empty string if video has
113 no title as opposed to "None" which signifies that the
114 extractor failed to obtain a title
d67b0b15 115
f49d89ee 116 Additionally, it must contain either a formats entry or a url one:
d67b0b15 117
f49d89ee
PH
118 formats: A list of dictionaries for each format available, ordered
119 from worst to best quality.
120
121 Potential fields:
c790e93a
S
122 * url The mandatory URL representing the media:
123 for plain file media - HTTP URL of this file,
124 for RTMP - RTMP URL,
125 for HLS - URL of the M3U8 media playlist,
126 for HDS - URL of the F4M manifest,
79d2077e
S
127 for DASH
128 - HTTP URL to plain file media (in case of
129 unfragmented media)
130 - URL of the MPD manifest or base URL
131 representing the media if MPD manifest
8ed7a233 132 is parsed from a string (in case of
79d2077e 133 fragmented media)
c790e93a 134 for MSS - URL of the ISM manifest.
f34804b2 135 * request_data Data to send in POST request to the URL
86f4d14f
S
136 * manifest_url
137 The URL of the manifest file in case of
c790e93a
S
138 fragmented media:
139 for HLS - URL of the M3U8 master playlist,
140 for HDS - URL of the F4M manifest,
141 for DASH - URL of the MPD manifest,
142 for MSS - URL of the ISM manifest.
a44ca5a4 143 * manifest_stream_number (For internal use only)
144 The index of the stream in the manifest file
10952eb2 145 * ext Will be calculated from URL if missing
d67b0b15
PH
146 * format A human-readable description of the format
147 ("mp4 container with h264/opus").
148 Calculated from the format_id, width, height,
149 and format_note fields if missing.
150 * format_id A short description of the format
5d4f3985
PH
151 ("mp4_h264_opus" or "19").
152 Technically optional, but strongly recommended.
d67b0b15
PH
153 * format_note Additional info about the format
154 ("3D" or "DASH video")
155 * width Width of the video, if known
156 * height Height of the video, if known
105bfd90 157 * aspect_ratio Aspect ratio of the video, if known
158 Automatically calculated from width and height
f49d89ee 159 * resolution Textual description of width and height
105bfd90 160 Automatically calculated from width and height
176f1866 161 * dynamic_range The dynamic range of the video. One of:
162 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 163 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
164 * abr Average audio bitrate in KBit/s
165 * acodec Name of the audio codec in use
dd27fd17 166 * asr Audio sampling rate in Hertz
b8ed0f15 167 * audio_channels Number of audio channels
d67b0b15 168 * vbr Average video bitrate in KBit/s
fbb21cf5 169 * fps Frame rate
d67b0b15 170 * vcodec Name of the video codec in use
1394ce65 171 * container Name of the container format
d67b0b15 172 * filesize The number of bytes, if known in advance
9732d77e 173 * filesize_approx An estimate for the number of bytes
d67b0b15 174 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 175 * protocol The protocol that will be used for the actual
adbc4ec4
THD
176 download, lower-case. One of "http", "https" or
177 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
178 * fragment_base_url
179 Base URL for fragments. Each fragment's path
180 value (if present) will be relative to
181 this URL.
182 * fragments A list of fragments of a fragmented media.
183 Each fragment entry must contain either an url
184 or a path. If an url is present it should be
185 considered by a client. Otherwise both path and
186 fragment_base_url must be present. Here is
187 the list of all potential fields:
188 * "url" - fragment's URL
189 * "path" - fragment's path relative to
190 fragment_base_url
a0d5077c
S
191 * "duration" (optional, int or float)
192 * "filesize" (optional, int)
adbc4ec4
THD
193 * is_from_start Is a live format that can be downloaded
194 from the start. Boolean
f49d89ee 195 * preference Order number of this format. If this field is
08d13955 196 present and not None, the formats get sorted
38d63d84 197 by this field, regardless of all other values.
f49d89ee
PH
198 -1 for default (order by other properties),
199 -2 or smaller for less than default.
e65566a9
PH
200 < -1000 to hide the format (if there is
201 another one which is strictly better)
32f90364
PH
202 * language Language code, e.g. "de" or "en-US".
203 * language_preference Is this in the language mentioned in
204 the URL?
aff2f4f4
PH
205 10 if it's what the URL is about,
206 -1 for default (don't know),
207 -10 otherwise, other values reserved for now.
5d73273f
PH
208 * quality Order number of the video quality of this
209 format, irrespective of the file format.
210 -1 for default (order by other properties),
211 -2 or smaller for less than default.
c64ed2a3
PH
212 * source_preference Order number for this video source
213 (quality takes higher priority)
214 -1 for default (order by other properties),
215 -2 or smaller for less than default.
d769be6c
PH
216 * http_headers A dictionary of additional HTTP headers
217 to add to the request.
6271f1ca 218 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
219 video's pixels are not square.
220 width : height ratio as float.
221 * no_resume The server does not support resuming the
222 (HTTP or RTMP) download. Boolean.
88acdbc2 223 * has_drm The format has DRM and cannot be downloaded. Boolean
7e68567e 224 * extra_param_to_segment_url A query string to append to each
225 fragment's URL, or to update each existing query string
226 with. Only applied by the native HLS/DASH downloaders.
227 * hls_aes A dictionary of HLS AES-128 decryption information
228 used by the native HLS downloader to override the
229 values in the media playlist when an '#EXT-X-KEY' tag
230 is present in the playlist:
231 * uri The URI from which the key will be downloaded
232 * key The key (as hex) used to decrypt fragments.
233 If `key` is given, any key URI will be ignored
234 * iv The IV (as hex) used to decrypt fragments
0a5a191a 235 * downloader_options A dictionary of downloader options
236 (For internal use only)
237 * http_chunk_size Chunk size for HTTP downloads
238 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 239 RTMP formats can also have the additional fields: page_url,
240 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
241 rtmp_protocol, rtmp_real_time
3dee7826 242
c0ba0f48 243 url: Final video URL.
d6983cb4 244 ext: Video filename extension.
d67b0b15
PH
245 format: The video format, defaults to ext (used for --get-format)
246 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 247
d6983cb4
PH
248 The following fields are optional:
249
08d30158 250 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 251 alt_title: A secondary title of the video.
0afef30b
PH
252 display_id An alternative identifier for the video, not necessarily
253 unique, but available before title. Typically, id is
254 something like "4234987", title "Dancing naked mole rats",
255 and display_id "dancing-naked-mole-rats"
d5519808 256 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 257 * "id" (optional, string) - Thumbnail format ID
d5519808 258 * "url"
cfb56d1a 259 * "preference" (optional, int) - quality of the image
d5519808
PH
260 * "width" (optional, int)
261 * "height" (optional, int)
5e1c39ac 262 * "resolution" (optional, string "{width}x{height}",
d5519808 263 deprecated)
2de624fd 264 * "filesize" (optional, int)
297e9952 265 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 266 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 267 description: Full video description.
d6983cb4 268 uploader: Full name of the video uploader.
2bc0c46f 269 license: License name the video is licensed under.
8a92e51c 270 creator: The creator of the video.
10db0d2f 271 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 272 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 273 If not explicitly set, calculated from timestamp
274 release_timestamp: UNIX timestamp of the moment the video was released.
275 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 276 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 277 If not explicitly set, calculated from release_timestamp
278 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 279 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 280 If not explicitly set, calculated from modified_timestamp
d6983cb4 281 uploader_id: Nickname or id of the video uploader.
7bcd2830 282 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 283 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 284 Note that channel fields may or may not repeat uploader
6f1f59f3
S
285 fields. This depends on a particular extractor.
286 channel_id: Id of the channel.
287 channel_url: Full URL to a channel webpage.
6c73052c 288 channel_follower_count: Number of followers of the channel.
da9ec3b9 289 location: Physical location where the video was filmed.
a504ced0 290 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
291 {tag: subformats}. "tag" is usually a language code, and
292 "subformats" is a list sorted from lower to higher
293 preference, each element is a dictionary with the "ext"
294 entry and one of:
a504ced0 295 * "data": The subtitles file contents
10952eb2 296 * "url": A URL pointing to the subtitles file
2412044c 297 It can optionally also have:
298 * "name": Name or description of the subtitles
08d30158 299 * "http_headers": A dictionary of additional HTTP headers
297e9952 300 to add to the request.
4bba3716 301 "ext" will be calculated from URL if missing
e167860c 302 automatic_captions: Like 'subtitles'; contains automatically generated
303 captions instead of normal subtitles
62d231c0 304 duration: Length of the video in seconds, as an integer or float.
f3d29461 305 view_count: How many users have watched the video on the platform.
867c66ff 306 concurrent_view_count: How many users are currently watching the video on the platform.
19e3dfc9
PH
307 like_count: Number of positive ratings of the video
308 dislike_count: Number of negative ratings of the video
02835c6b 309 repost_count: Number of reposts of the video
2d30521a 310 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 311 comment_count: Number of comments on the video
dd622d7c
PH
312 comments: A list of comments, each with one or more of the following
313 properties (all but one of text or html optional):
314 * "author" - human-readable name of the comment author
315 * "author_id" - user ID of the comment author
a1c5d2ca 316 * "author_thumbnail" - The thumbnail of the comment author
c35448b7 317 * "author_url" - The url to the comment author's page
318 * "author_is_verified" - Whether the author is verified
319 on the platform
320 * "author_is_uploader" - Whether the comment is made by
321 the video uploader
dd622d7c
PH
322 * "id" - Comment ID
323 * "html" - Comment as HTML
324 * "text" - Plain text of the comment
325 * "timestamp" - UNIX timestamp of comment
326 * "parent" - ID of the comment this one is replying to.
327 Set to "root" to indicate that this is a
328 comment to the original video.
a1c5d2ca
M
329 * "like_count" - Number of positive ratings of the comment
330 * "dislike_count" - Number of negative ratings of the comment
331 * "is_favorited" - Whether the comment is marked as
332 favorite by the video uploader
c35448b7 333 * "is_pinned" - Whether the comment is pinned to
334 the top of the comments
8dbe9899 335 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 336 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
337 should allow to get the same result again. (It will be set
338 by YoutubeDL if it's missing)
ad3bc6ac
PH
339 categories: A list of categories that the video falls in, for example
340 ["Sports", "Berlin"]
864f24bd 341 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 342 cast: A list of the video cast
7267bd53
PH
343 is_live: True, False, or None (=unknown). Whether this video is a
344 live stream that goes on instead of a fixed-length video.
f76ede8e 345 was_live: True, False, or None (=unknown). Whether this video was
346 originally a live stream.
0647d925 347 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
e325a21a 348 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 349 If absent, automatically set from is_live, was_live
7c80519c 350 start_time: Time in seconds where the reproduction should start, as
10952eb2 351 specified in the URL.
297a564b 352 end_time: Time in seconds where the reproduction should end, as
10952eb2 353 specified in the URL.
55949fed 354 chapters: A list of dictionaries, with the following entries:
355 * "start_time" - The start time of the chapter in seconds
356 * "end_time" - The end time of the chapter in seconds
357 * "title" (optional, string)
5caf30db
A
358 heatmap: A list of dictionaries, with the following entries:
359 * "start_time" - The start time of the data point in seconds
360 * "end_time" - The end time of the data point in seconds
361 * "value" - The normalized value of the data point (float between 0 and 1)
6cfda058 362 playable_in_embed: Whether this video is allowed to play in embedded
363 players on other sites. Can be True (=always allowed),
364 False (=never allowed), None (=unknown), or a string
62b58c09 365 specifying the criteria for embedability; e.g. 'whitelist'
c224251a
M
366 availability: Under what condition the video is available. One of
367 'private', 'premium_only', 'subscriber_only', 'needs_auth',
368 'unlisted' or 'public'. Use 'InfoExtractor._availability'
369 to set it
1e8fe57e 370 _old_archive_ids: A list of old archive ids needed for backward compatibility
784320c9 371 _format_sort_fields: A list of fields to use for sorting formats
277d6ff5 372 __post_extractor: A function to be called just before the metadata is
373 written to either disk, logger or console. The function
374 must return a dict which will be added to the info_dict.
375 This is useful for additional information that is
376 time-consuming to extract. Note that the fields thus
377 extracted will not be available to output template and
378 match_filter. So, only "comments" and "comment_count" are
379 currently allowed to be extracted via this method.
d6983cb4 380
7109903e
S
381 The following fields should only be used when the video belongs to some logical
382 chapter or section:
383
384 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
385 chapter_number: Number of the chapter the video belongs to, as an integer.
386 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
387
388 The following fields should only be used when the video is an episode of some
8d76bdf1 389 series, programme or podcast:
7109903e
S
390
391 series: Title of the series or programme the video episode belongs to.
9ac24e23 392 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 393 season: Title of the season the video episode belongs to.
27bfd4e5
S
394 season_number: Number of the season the video episode belongs to, as an integer.
395 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
396 episode: Title of the video episode. Unlike mandatory video title field,
397 this field should denote the exact title of the video episode
398 without any kind of decoration.
27bfd4e5
S
399 episode_number: Number of the video episode within a season, as an integer.
400 episode_id: Id of the video episode, as a unicode string.
7109903e 401
7a93ab5f
S
402 The following fields should only be used when the media is a track or a part of
403 a music album:
404
405 track: Title of the track.
406 track_number: Number of the track within an album or a disc, as an integer.
407 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
408 as a unicode string.
409 artist: Artist(s) of the track.
410 genre: Genre(s) of the track.
411 album: Title of the album the track belongs to.
412 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
413 album_artist: List of all artists appeared on the album (e.g.
414 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
415 and compilations).
416 disc_number: Number of the disc or other physical medium the track belongs to,
417 as an integer.
418 release_year: Year (YYYY) when the album was released.
8bcd4048 419 composer: Composer of the piece
7a93ab5f 420
3975b4d2 421 The following fields should only be set for clips that should be cut from the original video:
422
423 section_start: Start time of the section in seconds
424 section_end: End time of the section in seconds
425
45e8a04e 426 The following fields should only be set for storyboards:
427 rows: Number of rows in each storyboard fragment, as an integer
428 columns: Number of columns in each storyboard fragment, as an integer
429
deefc05b 430 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 431
d838b1bd
PH
432 Unless mentioned otherwise, None is equivalent to absence of information.
433
fed5d032
PH
434
435 _type "playlist" indicates multiple videos.
b82f815f
PH
436 There must be a key "entries", which is a list, an iterable, or a PagedList
437 object, each element of which is a valid dictionary by this specification.
fed5d032 438
962ffcf8 439 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 440 attributes with the same semantics as videos (see above).
fed5d032 441
f0d785d3 442 It can also have the following optional fields:
443
444 playlist_count: The total number of videos in a playlist. If not given,
445 YoutubeDL tries to calculate it from "entries"
446
fed5d032
PH
447
448 _type "multi_video" indicates that there are multiple videos that
449 form a single show, for example multiple acts of an opera or TV episode.
450 It must have an entries key like a playlist and contain all the keys
451 required for a video at the same time.
452
453
454 _type "url" indicates that the video must be extracted from another
455 location, possibly by a different extractor. Its only required key is:
456 "url" - the next URL to extract.
f58766ce
PH
457 The key "ie_key" can be set to the class name (minus the trailing "IE",
458 e.g. "Youtube") if the extractor class is known in advance.
459 Additionally, the dictionary may have any properties of the resolved entity
460 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
461 known ahead of time.
462
463
464 _type "url_transparent" entities have the same specification as "url", but
465 indicate that the given additional information is more precise than the one
466 associated with the resolved URL.
467 This is useful when a site employs a video service that hosts the video and
468 its technical metadata, but that video service does not embed a useful
469 title, description etc.
470
471
8f97a15d 472 Subclasses of this should also be added to the list of extractors and
473 should define a _VALID_URL regexp and, re-define the _real_extract() and
474 (optionally) _real_initialize() methods.
d6983cb4 475
e6f21b3d 476 Subclasses may also override suitable() if necessary, but ensure the function
477 signature is preserved and that this function imports everything it needs
52efa4b3 478 (except other extractors), so that lazy_extractors works correctly.
479
8f97a15d 480 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
481 the HTML of Generic webpages. It may also override _extract_embed_urls
482 or _extract_from_webpage as necessary. While these are normally classmethods,
483 _extract_from_webpage is allowed to be an instance method.
484
485 _extract_from_webpage may raise self.StopExtraction() to stop further
486 processing of the webpage and obtain exclusive rights to it. This is useful
62b58c09
L
487 when the extractor cannot reliably be matched using just the URL,
488 e.g. invidious/peertube instances
8f97a15d 489
490 Embed-only extractors can be defined by setting _VALID_URL = False.
491
52efa4b3 492 To support username + password (or netrc) login, the extractor must define a
493 _NETRC_MACHINE and re-define _perform_login(username, password) and
494 (optionally) _initialize_pre_login() methods. The _perform_login method will
495 be called between _initialize_pre_login and _real_initialize if credentials
496 are passed by the user. In cases where it is necessary to have the login
497 process as part of the extraction rather than initialization, _perform_login
498 can be left undefined.
e6f21b3d 499
4248dad9 500 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
501 geo restriction bypass mechanisms for a particular extractor.
502 Though it won't disable explicit geo restriction bypass based on
504f20dd 503 country code provided with geo_bypass_country.
4248dad9
S
504
505 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
506 countries for this extractor. One of these countries will be used by
507 geo restriction bypass mechanism right away in order to bypass
504f20dd 508 geo restriction, of course, if the mechanism is not disabled.
773f291d 509
5f95927a
S
510 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
511 IP blocks in CIDR notation for this extractor. One of these IP blocks
512 will be used by geo restriction bypass mechanism similarly
504f20dd 513 to _GEO_COUNTRIES.
3ccdde8c 514
fe7866d0 515 The _ENABLED attribute should be set to False for IEs that
516 are disabled by default and must be explicitly enabled.
517
e6f21b3d 518 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
519 in order to warn the users and skip the tests.
520 """
521
522 _ready = False
523 _downloader = None
773f291d 524 _x_forwarded_for_ip = None
4248dad9
S
525 _GEO_BYPASS = True
526 _GEO_COUNTRIES = None
5f95927a 527 _GEO_IP_BLOCKS = None
d6983cb4 528 _WORKING = True
fe7866d0 529 _ENABLED = True
52efa4b3 530 _NETRC_MACHINE = None
231025c4 531 IE_DESC = None
8dcce6a8 532 SEARCH_KEY = None
8f97a15d 533 _VALID_URL = None
534 _EMBED_REGEX = []
d6983cb4 535
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a user-facing hint on how to authenticate.

    @param method   One of None, 'any', 'password' or 'cookies'. When left
                    as NO_DEFAULT, 'any' is used if supports_login() is
                    truthy and 'cookies' otherwise.
    @param netrc    Machine name to show in the --netrc part of the hint;
                    falls back to self._NETRC_MACHINE when falsy.
    @returns        The hint text ('' for method=None).
    Raises KeyError for any other value of method.
    """
    password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'

    # Work out which hint is wanted before building the lookup table
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'

    cookies_hint = (
        'Use --cookies-from-browser or --cookies for the authentication. '
        'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies')
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': cookies_hint,
    }
    return hints[method]
9d5d4d64 546
def __init__(self, downloader=None):
    """Create the extractor, optionally attaching a downloader.

    @param downloader   A YoutubeDL instance, or None. If it is not given
                        here, it must be supplied through
                        "set_downloader()" before "extract()" is called.
    """
    # Per-instance state starts out clean; the three fields are independent
    self._printed_messages = set()
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)
555
556 @classmethod
5ad28e7f 557 def _match_valid_url(cls, url):
8f97a15d 558 if cls._VALID_URL is False:
559 return None
79cb2577
PH
560 # This does not use has/getattr intentionally - we want to know whether
561 # we have cached the regexp for *this* class, whereas getattr would also
562 # match the superclass
563 if '_VALID_URL_RE' not in cls.__dict__:
564 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 565 return cls._VALID_URL_RE.match(url)
566
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True or False)."""
    # Keep this function self-contained: it must import everything it needs
    # (except other extractors), so that lazy_extractors works correctly
    matched = cls._match_valid_url(url)
    return matched is not None
d6983cb4 573
ed9266db
PH
574 @classmethod
575 def _match_id(cls, url):
5ad28e7f 576 return cls._match_valid_url(url).group('id')
ed9266db 577
@classmethod
def get_temp_id(cls, url):
    """Best-effort ID for url: the result of _match_id, or None when that
    raises IndexError or AttributeError."""
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        temp_id = None
    return temp_id
584
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of this class's _WORKING attribute."""
    is_working = cls._WORKING
    return is_working
589
@classmethod
def supports_login(cls):
    """Return True when _NETRC_MACHINE is set to a truthy value, else False."""
    return True if cls._NETRC_MACHINE else False
593
d6983cb4
PH
def initialize(self):
    """Prepare the instance for extraction (geo bypass, login, etc).

    The printed-message set is reset and geo bypass is (re)initialized on
    every call; the pre-login/login/_real_initialize steps run only until
    the instance has been marked ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)

    # One-time setup already done on an earlier call
    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # Credentials were supplied but this extractor has no password login
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True
611
5f95927a 612 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
613 """
614 Initialize geo restriction bypass mechanism.
615
616 This method is used to initialize geo bypass mechanism based on faking
617 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 618 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
619 IP will be passed as X-Forwarded-For HTTP header in all subsequent
620 HTTP requests.
e39b5d4a
S
621
622 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
623 during the instance initialization with _GEO_COUNTRIES and
624 _GEO_IP_BLOCKS.
e39b5d4a 625
5f95927a 626 You may also manually call it from extractor's code if geo bypass
e39b5d4a 627 information is not available beforehand (e.g. obtained during
5f95927a
S
628 extraction) or due to some other reason. In this case you should pass
629 this information in geo bypass context passed as first argument. It may
630 contain following fields:
631
632 countries: List of geo unrestricted countries (similar
633 to _GEO_COUNTRIES)
634 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
635 (similar to _GEO_IP_BLOCKS)
636
e39b5d4a 637 """
773f291d 638 if not self._x_forwarded_for_ip:
5f95927a
S
639
640 # Geo bypass mechanism is explicitly disabled by user
a06916d9 641 if not self.get_param('geo_bypass', True):
5f95927a
S
642 return
643
644 if not geo_bypass_context:
645 geo_bypass_context = {}
646
647 # Backward compatibility: previously _initialize_geo_bypass
648 # expected a list of countries, some 3rd party code may still use
649 # it this way
650 if isinstance(geo_bypass_context, (list, tuple)):
651 geo_bypass_context = {
652 'countries': geo_bypass_context,
653 }
654
655 # The whole point of geo bypass mechanism is to fake IP
656 # as X-Forwarded-For HTTP header based on some IP block or
657 # country code.
658
659 # Path 1: bypassing based on IP block in CIDR notation
660
661 # Explicit IP block specified by user, use it right away
662 # regardless of whether extractor is geo bypassable or not
a06916d9 663 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
664
665 # Otherwise use random IP block from geo bypass context but only
666 # if extractor is known as geo bypassable
667 if not ip_block:
668 ip_blocks = geo_bypass_context.get('ip_blocks')
669 if self._GEO_BYPASS and ip_blocks:
670 ip_block = random.choice(ip_blocks)
671
672 if ip_block:
673 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 674 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
5f95927a
S
675 return
676
677 # Path 2: bypassing based on country code
678
679 # Explicit country code specified by user, use it right away
680 # regardless of whether extractor is geo bypassable or not
a06916d9 681 country = self.get_param('geo_bypass_country', None)
5f95927a
S
682
683 # Otherwise use random country code from geo bypass context but
684 # only if extractor is known as geo bypassable
685 if not country:
686 countries = geo_bypass_context.get('countries')
687 if self._GEO_BYPASS and countries:
688 country = random.choice(countries)
689
690 if country:
691 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 692 self._downloader.write_debug(
86e5f3ed 693 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
694
def extract(self, url):
    """Extracts URL information and returns it in list of dicts.

    Initializes the extractor, runs _real_extract and post-processes the
    result. On a GeoRestrictedError the extraction is retried once if a
    fake X-Forwarded-For IP could be picked for one of the allowed countries.

    Raises ExtractorError (annotated with video id, extractor name and
    traceback) for extractor failures, including IncompleteRead, KeyError
    and StopIteration raised during extraction.
    """
    try:
        # At most two attempts: the second only happens after a geo
        # restriction error for which a fake IP has just been selected
        for _ in range(2):
            try:
                self.initialize()
                self.to_screen('Extracting URL: %s' % (
                    url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for lang in ('live_chat', 'comments', 'danmaku'):
                        subtitles.pop(lang, None)
                return ie_result
            except GeoRestrictedError as e:
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        # NB: no trailing commas here - they would turn the values into 1-tuples
        e.video_id = e.video_id or self.get_temp_id(url)
        e.ie = e.ie or self.IE_NAME
        e.traceback = e.traceback or sys.exc_info()[2]
        raise
    except http.client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 728
4248dad9 729 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 730 if (not self.get_param('geo_bypass_country', None)
3089bc74 731 and self._GEO_BYPASS
a06916d9 732 and self.get_param('geo_bypass', True)
3089bc74
S
733 and not self._x_forwarded_for_ip
734 and countries):
eea0716c
S
735 country_code = random.choice(countries)
736 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
737 if self._x_forwarded_for_ip:
738 self.report_warning(
eea0716c
S
739 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
740 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
741 return True
742 return False
743
def set_downloader(self, downloader):
    """Sets a YoutubeDL instance as the downloader for this IE.

    The instance is stored in self._downloader.
    """
    self._downloader = downloader
747
@property
def cache(self):
    """The cache object of the attached downloader (self._downloader.cache)."""
    return self._downloader.cache
751
@property
def cookiejar(self):
    """The cookie jar of the attached downloader (self._downloader.cookiejar)."""
    return self._downloader.cookiejar
755
def _initialize_pre_login(self):
    """ Initialization before login. Redefine in subclasses.

    The base implementation does nothing.
    """
    pass
759
def _perform_login(self, username, password):
    """ Login with username and password. Redefine in subclasses.

    The base implementation does nothing.
    """
    pass
763
d6983cb4
PH
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses.

    The base implementation does nothing.
    """
    pass
767
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    The base implementation always raises NotImplementedError.
    """
    raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 771
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # Class name without its last two characters (presumably the "IE" suffix)
    return cls.__name__[:-2]
56c73665 776
@classproperty
def IE_NAME(cls):
    """Default extractor name: the class name minus its last two characters."""
    return cls.__name__[:-2]
d6983cb4 780
d391b7e2
S
781 @staticmethod
782 def __can_accept_status_code(err, expected_status):
ac668111 783 assert isinstance(err, urllib.error.HTTPError)
d391b7e2
S
784 if expected_status is None:
785 return False
d391b7e2
S
786 elif callable(expected_status):
787 return expected_status(err.code) is True
788 else:
6606817a 789 return err.code in variadic(expected_status)
d391b7e2 790
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Build a request from a URL, or update an existing Request object.

    An existing urllib.request.Request is passed through update_Request;
    a plain URL gets `query` merged in (if given) and is wrapped with
    sanitized_Request.
    """
    if not isinstance(url_or_request, urllib.request.Request):
        target_url = update_url_query(url_or_request, query) if query else url_or_request
        return sanitized_Request(target_url, data, headers or {})
    return update_Request(url_or_request, data=data, headers=headers, query=query)
f95b9dee 797
c043c246 798 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
d391b7e2
S
799 """
800 Return the response handle.
801
802 See _download_webpage docstring for arguments specification.
803 """
1cf376f5 804 if not self._downloader._first_webpage_request:
49a57e70 805 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 806 if sleep_interval > 0:
5ef7d9bd 807 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 808 time.sleep(sleep_interval)
809 else:
810 self._downloader._first_webpage_request = False
811
d6983cb4
PH
812 if note is None:
813 self.report_download_webpage(video_id)
814 elif note is not False:
7cc3570e 815 if video_id is None:
86e5f3ed 816 self.to_screen(str(note))
7cc3570e 817 else:
86e5f3ed 818 self.to_screen(f'{video_id}: {note}')
2132edaa
S
819
820 # Some sites check X-Forwarded-For HTTP header in order to figure out
821 # the origin of the client behind proxy. This allows bypassing geo
822 # restriction by faking this header's value to IP that belongs to some
823 # geo unrestricted country. We will do so once we encounter any
824 # geo restriction error.
825 if self._x_forwarded_for_ip:
c043c246 826 headers = (headers or {}).copy()
827 headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
2132edaa 828
d6983cb4 829 try:
f95b9dee 830 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 831 except network_exceptions as err:
ac668111 832 if isinstance(err, urllib.error.HTTPError):
d391b7e2 833 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
834 # Retain reference to error to prevent file object from
835 # being closed before it can be read. Works around the
836 # effects of <https://bugs.python.org/issue15002>
837 # introduced in Python 3.4.1.
838 err.fp._error = err
d391b7e2
S
839 return err.fp
840
aa94a6d3
PH
841 if errnote is False:
842 return False
d6983cb4 843 if errnote is None:
f1a9d64e 844 errnote = 'Unable to download webpage'
7f8b2714 845
86e5f3ed 846 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 847 if fatal:
497d2fab 848 raise ExtractorError(errmsg, cause=err)
7cc3570e 849 else:
6a39ee13 850 self.report_warning(errmsg)
7cc3570e 851 return False
d6983cb4 852
1890fc63 853 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
854 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2
S
855 """
856 Return a tuple (page content as string, URL handle).
857
617f658b 858 Arguments:
859 url_or_request -- plain text URL as a string or
ac668111 860 a urllib.request.Request object
617f658b 861 video_id -- Video/playlist/item identifier (string)
862
863 Keyword arguments:
864 note -- note printed before downloading (string)
865 errnote -- note printed in case of an error (string)
866 fatal -- flag denoting whether error should be considered fatal,
867 i.e. whether it should cause ExtractionError to be raised,
868 otherwise a warning will be reported and extraction continued
869 encoding -- encoding for a page content decoding, guessed automatically
870 when not explicitly specified
871 data -- POST data (bytes)
872 headers -- HTTP headers (dict)
873 query -- URL query (dict)
874 expected_status -- allows to accept failed HTTP requests (non 2xx
875 status code) by explicitly specifying a set of accepted status
876 codes. Can be any of the following entities:
877 - an integer type specifying an exact failed status code to
878 accept
879 - a list or a tuple of integer types specifying a list of
880 failed status codes to accept
881 - a callable accepting an actual failed status code and
882 returning True if it should be accepted
883 Note that this argument does not affect success status codes (2xx)
884 which are always accepted.
d391b7e2 885 """
617f658b 886
b9d3e163 887 # Strip hashes from the URL (#1038)
14f25df2 888 if isinstance(url_or_request, str):
b9d3e163
PH
889 url_or_request = url_or_request.partition('#')[0]
890
d391b7e2 891 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
892 if urlh is False:
893 assert not fatal
894 return False
c9a77969 895 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
896 return (content, urlh)
897
c9a77969
YCH
898 @staticmethod
899 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
900 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
901 if m:
902 encoding = m.group(1)
903 else:
0d75ae2c 904 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
905 webpage_bytes[:1024])
906 if m:
907 encoding = m.group(1).decode('ascii')
b60016e8
PH
908 elif webpage_bytes.startswith(b'\xff\xfe'):
909 encoding = 'utf-16'
f143d86a
PH
910 else:
911 encoding = 'utf-8'
c9a77969
YCH
912
913 return encoding
914
4457823d
S
915 def __check_blocked(self, content):
916 first_block = content[:512]
3089bc74
S
917 if ('<title>Access to this site is blocked</title>' in content
918 and 'Websense' in first_block):
4457823d
S
919 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
920 blocked_iframe = self._html_search_regex(
921 r'<iframe src="([^"]+)"', content,
922 'Websense information URL', default=None)
923 if blocked_iframe:
924 msg += ' Visit %s for more details' % blocked_iframe
925 raise ExtractorError(msg, expected=True)
926 if '<title>The URL you requested has been blocked</title>' in first_block:
927 msg = (
928 'Access to this webpage has been blocked by Indian censorship. '
929 'Use a VPN or proxy server (with --proxy) to route around it.')
930 block_msg = self._html_search_regex(
931 r'</h1><p>(.*?)</p>',
932 content, 'block message', default=None)
933 if block_msg:
934 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
935 raise ExtractorError(msg, expected=True)
3089bc74
S
936 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
937 and 'blocklist.rkn.gov.ru' in content):
4457823d
S
938 raise ExtractorError(
939 'Access to this webpage has been blocked by decision of the Russian government. '
940 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
941 expected=True)
942
def _request_dump_filename(self, url, video_id):
    """Return the file name used to dump/load the page for `url`.

    The name is built from the video id and URL, shortened to the
    'trim_file_name' param (240 if unset) with an MD5 suffix when too
    long, and sanitized in restricted mode.
    """
    stem = f'{video_id}_{url}'
    max_length = self.get_param('trim_file_name') or 240
    if len(stem) > max_length:
        digest = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:max_length - len(digest)] + digest
    filename = sanitize_filename(f'{stem}.dump', restricted=True)
    if compat_os_name != 'nt':
        return filename
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absfilepath = os.path.abspath(filename)
    return fR'\\?\{absfilepath}' if len(absfilepath) > 259 else filename
957
958 def __decode_webpage(self, webpage_bytes, encoding, headers):
959 if not encoding:
960 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
961 try:
962 return webpage_bytes.decode(encoding, 'replace')
963 except LookupError:
964 return webpage_bytes.decode('utf-8', 'replace')
965
c9a77969 966 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
c9a77969
YCH
967 webpage_bytes = urlh.read()
968 if prefix is not None:
969 webpage_bytes = prefix + webpage_bytes
a06916d9 970 if self.get_param('dump_intermediate_pages', False):
f610dbb0 971 self.to_screen('Dumping request to ' + urlh.geturl())
d6983cb4
PH
972 dump = base64.b64encode(webpage_bytes).decode('ascii')
973 self._downloader.to_screen(dump)
f95b9dee 974 if self.get_param('write_pages'):
e121e3ce 975 filename = self._request_dump_filename(urlh.geturl(), video_id)
f95b9dee 976 self.to_screen(f'Saving request to {filename}')
d41e6efc
PH
977 with open(filename, 'wb') as outf:
978 outf.write(webpage_bytes)
979
f95b9dee 980 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
4457823d 981 self.__check_blocked(content)
2410c43d 982
23be51d8 983 return content
d6983cb4 984
6edf2808 985 def __print_error(self, errnote, fatal, video_id, err):
986 if fatal:
c6e07cf1 987 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 988 elif errnote:
c6e07cf1 989 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 990
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """Parse `xml_string` and return the result of compat_etree_fromstring.

    `transform_source`, if given, is applied to the string first. Parse
    errors are reported with `errnote` (default 'Failed to parse XML'):
    raised when `fatal`, otherwise warned about and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = errnote if errnote is not None else 'Failed to parse XML'
        self.__print_error(message, fatal, video_id, ve)
267ed0c5 998
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """Decode `json_string` with LenientJSONDecoder (non-strict mode).

    `transform_source` and any extra keyword arguments are handed to the
    decoder. Decode errors are reported with `errnote` (default
    'Failed to parse JSON'): raised when `fatal`, otherwise warned about
    and None is returned.
    """
    try:
        decoded = json.loads(
            json_string, cls=LenientJSONDecoder, strict=False,
            transform_source=transform_source, **parser_kwargs)
    except ValueError as ve:
        message = errnote if errnote is not None else 'Failed to parse JSON'
        self.__print_error(message, fatal, video_id, ve)
    else:
        return decoded
3d3538e4 1005
6edf2808 1006 def _parse_socket_response_as_json(self, data, *args, **kwargs):
1007 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 1008
def __create_download_methods(name, parser, note, errnote, return_value):
    # Factory producing the pair (_download_<name>_handle, _download_<name>).
    #   name         -- infix used for the generated method names
    #   parser       -- name of the method used to parse the downloaded
    #                   content, or None to return the content unparsed
    #   note/errnote -- default messages of the generated methods
    #   return_value -- description of the result, used in the generated docstrings

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        if parser is None:
            return content
        # Only an explicit errnote=False is forwarded to the parser
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        # False means the (non-fatal) download failed; pass it through
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With 'load_pages' set, try to read a previously dumped page from
        # disk; on failure warn and fall through to a regular download
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.full_url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        # Without a parser the handle method is called without transform_source
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    def impersonate(func, name, return_value):
        # Give the generated function its final name, qualified name and docstring
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
        @param transform_source     Apply this transformation before parsing
        @returns                    {return_value}

        See _download_webpage_handle docstring for other arguments specification
    '''

    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content
1072
# Generated download helpers: each call yields a (<name>_handle, <name>) pair
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# Only the content variant is kept here; it is wrapped by _download_webpage
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1080
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not specified)

    The download is retried only on http.client.IncompleteRead; once
    `tries` attempts have failed that error is re-raised.

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries and timeout are unused by most callers; should they be deprecated?
    # If so, emit self._downloader.deprecation_warning for non-default values here.

    # Resolve the sentinel so that the NO_DEFAULT object itself is never
    # handed to self._sleep below
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except http.client.IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
adddc50c 1112
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """Report a warning prefixed with '[ie_name]' and the video id, if any.

    With only_once=True a message already recorded in
    self._printed_messages is not reported again.
    """
    idstr = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {idstr}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1121
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'

    Extra arguments are forwarded to the downloader's to_screen.
    """
    self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1125
def write_debug(self, msg, *args, **kwargs):
    """Write a debug message via the downloader, prefixing it with '[ie_name]'"""
    self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1128
def get_param(self, name, default=None, *args, **kwargs):
    """Look up `name` in the downloader's params.

    Returns `default` when no downloader is attached.
    """
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
d6983cb4 1133
def report_drm(self, video_id, partial=NO_DEFAULT):
    """Report that the video is DRM protected via raise_no_formats.

    `partial` is no longer accepted; passing it only triggers a
    deprecation warning.
    """
    if partial is not NO_DEFAULT:
        self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1138
d6983cb4
PH
def report_extraction(self, id_or_name):
    """Report information extraction for the given id or name."""
    self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1142
def report_download_webpage(self, video_id):
    """Report webpage download for the given video id."""
    self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1146
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen('Confirming age')
d6983cb4 1150
fc79158d
JMF
def report_login(self):
    """Report attempt to log in."""
    self.to_screen('Logging in')
fc79158d 1154
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """Signal that login is required.

    Only warns when metadata is available and either the
    'ignore_no_formats_error' or 'wait_for_video' param is set; otherwise
    raises an expected ExtractorError with the login hint appended.
    """
    tolerate_missing_formats = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if tolerate_missing_formats:
        self.report_warning(msg)
        return
    login_hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + login_hint, expected=True)
43e7d3c9 1164
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """Signal a geo restriction.

    Only warns when metadata is available and either the
    'ignore_no_formats_error' or 'wait_for_video' param is set; otherwise
    raises GeoRestrictedError carrying `countries`.
    """
    tolerate_missing_formats = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if not tolerate_missing_formats:
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
1173
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Signal that no formats are available.

    Only warns when `expected` and either the 'ignore_no_formats_error'
    or 'wait_for_video' param is set. Otherwise `msg` is raised as is if
    it already is an ExtractorError, else wrapped in a new one.
    """
    if expected and (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1182
5f6a1245 1183 # Methods for following #608
c0d0b01f 1184 @staticmethod
311b6615 1185 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1186 """Returns a URL that points to a page that should be processed"""
311b6615 1187 if ie is not None:
1188 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
7012b23c 1189 if video_id is not None:
311b6615 1190 kwargs['id'] = video_id
830d53bf 1191 if video_title is not None:
311b6615 1192 kwargs['title'] = video_title
1193 return {
1194 **kwargs,
1195 '_type': 'url_transparent' if url_transparent else 'url',
1196 'url': url,
1197 }
1198
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """Build a playlist result whose entries are url_results of `matches`.

    `getter` is applied to each match and the results are de-duplicated
    with orderedSet; `video_kwargs` are passed to every url_result and
    the remaining keyword arguments to playlist_result.
    """
    entry_kwargs = video_kwargs or {}
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(match, ie, **entry_kwargs) for match in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
46b18f23 1205
c0d0b01f 1206 @staticmethod
311b6615 1207 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
d6983cb4 1208 """Returns a playlist"""
d6983cb4 1209 if playlist_id:
311b6615 1210 kwargs['id'] = playlist_id
d6983cb4 1211 if playlist_title:
311b6615 1212 kwargs['title'] = playlist_title
ecc97af3 1213 if playlist_description is not None:
311b6615 1214 kwargs['description'] = playlist_description
1215 return {
1216 **kwargs,
1217 '_type': 'multi_video' if multi_video else 'playlist',
1218 'entries': entries,
1219 }
d6983cb4 1220
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or report a warning or raise a
    RegexNotFoundError, depending on fatal, specifying the field name.

    `group` selects what is returned on a match: None for the first group
    that is not None, a list/tuple for a tuple of those groups, anything
    else for that single group.
    """
    # Start from "no match" so that a None string or an empty collection of
    # patterns falls through to the default/fatal handling below
    mobj = None
    if string is not None:
        patterns = [pattern] if isinstance(pattern, (str, re.Pattern)) else pattern
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
1255
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """Searches string for the JSON object specified by start_pattern

    Passing `default` makes the search non-fatal and silent; without it
    an empty dict is returned on non-fatal failure. Extra keyword
    arguments are forwarded to _parse_json.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    full_pattern = rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})'
    json_string = self._search_regex(
        full_pattern, string, name, group='json', fatal=fatal,
        default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
b7c47b74 1282
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html
    (each element separately when the result is a tuple).
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return tuple(map(clean_html, found)) if isinstance(found, tuple) else clean_html(found)
d6983cb4 1291
2118fdd1
RA
1292 def _get_netrc_login_info(self, netrc_machine=None):
1293 username = None
1294 password = None
1295 netrc_machine = netrc_machine or self._NETRC_MACHINE
1296
a06916d9 1297 if self.get_param('usenetrc', False):
2118fdd1 1298 try:
0001fcb5 1299 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1300 if os.path.isdir(netrc_file):
1301 netrc_file = os.path.join(netrc_file, '.netrc')
1302 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1303 if info is not None:
1304 username = info[0]
1305 password = info[2]
1306 else:
dcce092e
S
1307 raise netrc.NetrcParseError(
1308 'No authenticators for %s' % netrc_machine)
86e5f3ed 1309 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1310 self.report_warning(
dcce092e 1311 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1312
dcce092e 1313 return username, password
2118fdd1 1314
1b6712ab 1315 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1316 """
cf0649f8 1317 Get the login info as (username, password)
32443dd3
S
1318 First look for the manually specified credentials using username_option
1319 and password_option as keys in params dictionary. If no such credentials
1320 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1321 value.
fc79158d
JMF
1322 If there's no info available, return (None, None)
1323 """
fc79158d
JMF
1324
1325 # Attempt to use provided username and password or .netrc data
a06916d9 1326 username = self.get_param(username_option)
1327 if username is not None:
1328 password = self.get_param(password_option)
2118fdd1 1329 else:
1b6712ab 1330 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1331
2133565c 1332 return username, password
fc79158d 1333
e64b7569 1334 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1335 """
1336 Get the two-factor authentication info
1337 TODO - asking the user will be required for sms/phone verify
1338 currently just uses the command line option
1339 If there's no info available, return None
1340 """
83317f69 1341
a06916d9 1342 tfa = self.get_param('twofactor')
1343 if tfa is not None:
1344 return tfa
83317f69 1345
ac668111 1346 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1347
46720279
JMF
1348 # Helper functions for extracting OpenGraph info
1349 @staticmethod
ab2d5247 1350 def _og_regexes(prop):
45b2ee6f 1351 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
fbfde1c3
F
1352 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1353 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1354 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1355 return [
78fb87b2
JMF
1356 template % (property_re, content_re),
1357 template % (content_re, property_re),
ab2d5247 1358 ]
46720279 1359
864f24bd
S
@staticmethod
def _meta_regex(prop):
    # Verbose, case-insensitive, dot-all regex for a <meta> tag whose
    # itemprop/name/property/id/http-equiv attribute equals `prop`;
    # the value of its content attribute is captured as group "content"
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1365
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search `html` for one or several OpenGraph properties.

    Returns the HTML-unescaped content of the first match, or None.
    `name` defaults to 'OpenGraph <first property>'; extra keyword
    arguments are forwarded to _search_regex.
    """
    prop = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % prop[0]
    og_regexes = [regex for p in prop for regex in self._og_regexes(p)]
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
46720279
JMF
1377
def _og_search_thumbnail(self, html, **kargs):
    """Non-fatal search for the OpenGraph 'image' property (the thumbnail URL)."""
    return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1380
def _og_search_description(self, html, **kargs):
    """Non-fatal search for the OpenGraph 'description' property."""
    return self._og_search_property('description', html, fatal=False, **kargs)
1383
def _og_search_title(self, html, *, fatal=False, **kargs):
    """Search for the OpenGraph 'title' property (non-fatal by default)."""
    return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1386
8ffa13e0 1387 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1388 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1389 if secure:
1390 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1391 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1392
78338f71
JMF
def _og_search_url(self, html, **kargs):
    """Search for the OpenGraph 'url' property."""
    return self._og_search_property('url', html, **kargs)
1395
04f3fd2c 1396 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1397 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1398
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search html for the content of a <meta> tag.

    name may be a single meta name or several; one regex per name is built
    with _meta_regex and the 'content' group of the match is requested.
    display_name defaults to the first name given.
    """
    names = variadic(name)
    label = names[0] if display_name is None else display_name
    patterns = [self._meta_regex(meta_name) for meta_name in names]
    return self._html_search_regex(
        patterns, html, label, fatal=fatal, group='content', **kwargs)
59040888
PH
1406
1407 def _dc_search_uploader(self, html):
1408 return self._html_search_meta('dc.creator', html, 'uploader')
1409
8f97a15d 1410 @staticmethod
1411 def _rta_search(html):
8dbe9899
PH
1412 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1413 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1414 r' content="RTA-5042-1996-1400-1577-RTA"',
1415 html):
1416 return 18
8f97a15d 1417
1418 # And then there are the jokers who advertise that they use RTA, but actually don't.
1419 AGE_LIMIT_MARKERS = [
1420 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
32a84bcf
SS
1421 r'>[^<]*you acknowledge you are at least (\d+) years old',
1422 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
8f97a15d 1423 ]
32a84bcf
SS
1424
1425 age_limit = 0
1426 for marker in AGE_LIMIT_MARKERS:
1427 mobj = re.search(marker, html)
1428 if mobj:
1429 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1430 return age_limit
8dbe9899 1431
59040888
PH
1432 def _media_rating_search(self, html):
1433 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1434 rating = self._html_search_meta('rating', html)
1435
1436 if not rating:
1437 return None
1438
1439 RATING_TABLE = {
1440 'safe for kids': 0,
1441 'general': 8,
1442 '14 years': 14,
1443 'mature': 17,
1444 'restricted': 19,
1445 }
d800609c 1446 return RATING_TABLE.get(rating.lower())
59040888 1447
69319969 1448 def _family_friendly_search(self, html):
6ca7732d 1449 # See http://schema.org/VideoObject
ac8491fc
S
1450 family_friendly = self._html_search_meta(
1451 'isFamilyFriendly', html, default=None)
69319969
NJ
1452
1453 if not family_friendly:
1454 return None
1455
1456 RATING_TABLE = {
1457 '1': 0,
1458 'true': 0,
1459 '0': 18,
1460 'false': 18,
1461 }
d800609c 1462 return RATING_TABLE.get(family_friendly.lower())
69319969 1463
0c708f11
JMF
1464 def _twitter_search_player(self, html):
1465 return self._html_search_meta('twitter:player', html,
9e1a5b84 1466 'twitter card player')
0c708f11 1467
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html

    Each JSON_LD_RE match is parsed with _parse_json; only dict items are
    yielded. Passing any default makes parsing non-fatal.
    """
    parse_fatal = fatal if default is NO_DEFAULT else False
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=parse_fatal)
        # A single script may hold one object or a list of them
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1477
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html

    Returns the info produced by _json_ld when it is non-empty. Otherwise
    returns default if one was given, raises RegexNotFoundError if fatal,
    or reports a warning and returns {}.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    candidates = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(candidates, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
4ca2a3cf 1494
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """Build an info dict from JSON-LD data.

    json_ld may be a JSON string (parsed with _parse_json) or already
    decoded data (a dict or a list of dicts). Returns {} for empty input;
    otherwise the collected fields passed through filter_dict.
    When expected_type is given, entries of other types are skipped and
    traversal stops after the first entry that is processed to the end.
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}

    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(node, *expected_types):
        # '@type' may be a single value or a list of values
        node_types = variadic(traverse_obj(node, '@type'))
        return any(candidate in node_types for candidate in expected_types)

    def extract_interaction_type(counter):
        kind = counter.get('interactionType')
        if isinstance(kind, dict):
            kind = kind.get('@type')
        return str_or_none(kind)

    def extract_interaction_statistic(node):
        counters = node.get('interactionStatistic')
        if isinstance(counters, dict):
            counters = [counters]
        if not isinstance(counters, list):
            return
        for counter in counters:
            if not is_type(counter, 'InteractionCounter'):
                continue
            kind = extract_interaction_type(counter)
            if not kind:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            count = str_to_int(counter.get('userInteractionCount'))
            if count is None:
                continue
            # The type may be a full URL; only its last path component is mapped
            count_kind = INTERACTION_TYPE_MAP.get(kind.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # The first count found for a given kind wins
            if info.get(count_key) is None:
                info[count_key] = count

    def extract_chapter_information(node):
        chapters = [{
            'title': clip.get('name'),
            'start_time': clip.get('startOffset'),
            'end_time': clip.get('endOffset'),
        } for clip in variadic(node.get('hasPart') or []) if clip.get('@type') == 'Clip']
        predecessors = [{'end_time': 0}] + chapters
        # Fill missing boundaries from the neighbouring chapters
        for idx, (previous, current, following) in enumerate(
                zip(predecessors, chapters, chapters[1:])):
            current['end_time'] = current['end_time'] or following['start_time']
            current['start_time'] = current['start_time'] or previous['end_time']
            if None in current.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(node):
        # author can be an instance of 'Organization' or 'Person' types.
        # both types can have 'name' property(inherited from 'Thing' type). [1]
        # however some websites are using 'Text' type instead.
        # 1. https://schema.org/VideoObject
        author = node.get('author')
        if isinstance(author, dict):
            uploader = author.get('name')
        elif isinstance(author, str):
            uploader = author
        else:
            uploader = None
        info.update({
            'url': url_or_none(node.get('contentUrl')),
            'ext': mimetype2ext(node.get('encodingFormat')),
            'title': unescapeHTML(node.get('name')),
            'description': unescapeHTML(node.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(node, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(node.get('duration')),
            'timestamp': unified_timestamp(node.get('uploadDate')),
            'uploader': uploader,
            'artist': traverse_obj(node, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(node.get('contentSize'))),
            'tbr': int_or_none(node.get('bitrate')),
            'width': int_or_none(node.get('width')),
            'height': int_or_none(node.get('height')),
            'view_count': int_or_none(node.get('interactionCount')),
            'tags': try_call(lambda: node.get('keywords').split(',')),
        })
        if is_type(node, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(node.get('bitrate')),
            })
        extract_interaction_statistic(node)
        extract_chapter_information(node)

    def traverse_json_ld(json_ld, at_top_level=True):
        for node in variadic(json_ld):
            if not isinstance(node, dict):
                continue
            if at_top_level and '@context' not in node:
                continue
            if at_top_level and set(node.keys()) == {'@context', '@graph'}:
                # A pure graph container: descend into its members
                traverse_json_ld(node['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(node, expected_type):
                continue
            rating = traverse_obj(node, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(node, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(node.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(node.get('episodeNumber')),
                    'description': unescapeHTML(node.get('description')),
                })
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = node.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = node.get('partOfSeries') or node.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(node, 'Movie'):
                info.update({
                    'title': unescapeHTML(node.get('name')),
                    'description': unescapeHTML(node.get('description')),
                    'duration': parse_duration(node.get('duration')),
                    'timestamp': unified_timestamp(node.get('dateCreated')),
                })
            elif is_type(node, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(node.get('datePublished')),
                    'title': unescapeHTML(node.get('headline')),
                    'description': unescapeHTML(node.get('articleBody') or node.get('description')),
                })
                if is_type(traverse_obj(node, ('video', 0)), 'VideoObject'):
                    extract_video_object(node['video'][0])
                elif is_type(traverse_obj(node, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(node['subjectOf'][0])
            elif is_type(node, 'VideoObject', 'AudioObject'):
                extract_video_object(node)
                if expected_type is not None:
                    break
                continue
            video = node.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is not None:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)
4ca2a3cf 1664
135dfa2c 1665 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1666 return self._parse_json(
1667 self._search_regex(
1668 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1669 webpage, 'next.js data', fatal=fatal, **kw),
1670 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1671
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
    rectx = re.escape(context_name)
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    patterns = (
        rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>',
        rf'{rectx}\(.*?{FUNCTION_RE}',
    )
    not_found = NO_DEFAULT if fatal else (None, None, None)
    js, arg_keys, arg_vals = self._search_regex(
        patterns, webpage, context_name,
        group=('js', 'arg_keys', 'arg_vals'), default=not_found)
    if js is None:
        return {}

    # Pair each function parameter name with its JSON-encoded call argument
    arg_values = self._parse_json(
        f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()
    args = dict(zip(arg_keys.split(','), map(json.dumps, arg_values)))

    # Substitute those arguments while converting the returned JS object to JSON
    substitute_args = functools.partial(js_to_json, vars=args)
    ret = self._parse_json(js, video_id, transform_source=substitute_args, fatal=fatal)
    return traverse_obj(ret, traverse) or {}
66f4c04e 1688
@staticmethod
def _hidden_inputs(html):
    """Collect name -> value of the hidden and submit <input> tags in html.

    HTML comments are stripped first. An input is keyed by its name (or
    its id when it has no name) and kept only when a value attribute is
    present.
    """
    uncommented = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    collected = {}
    # The pattern only yields non-empty tags, so no emptiness check is needed
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', uncommented):
        attrs = extract_attributes(input_tag)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        field_name = attrs.get('name') or attrs.get('id')
        field_value = attrs.get('value')
        if field_name and field_value is not None:
            collected[field_name] = field_value
    return collected
27713812 1704
cf61d96d
S
1705 def _form_hidden_inputs(self, form_id, html):
1706 form = self._search_regex(
73eb13df 1707 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1708 html, '%s form' % form_id, group='form')
1709 return self._hidden_inputs(form)
1710
@classproperty(cache=True)
def FormatSort(cls):
    """Deprecated alias: a FormatSorter subclass built on first access.

    Emits a deprecation warning pointing to yt_dlp.utils.FormatSorter.
    """
    class FormatSort(FormatSorter):
        def __init__(self, *args, **kwargs):
            # The sorter's own _downloader attribute is passed as first argument
            super().__init__(self._downloader, *args, **kwargs)

    deprecation_warning(
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')
    return FormatSort
eb8a4433 1721
1722 def _sort_formats(self, formats, field_preference=[]):
9f14daf2 1723 if not field_preference:
1724 self._downloader.deprecation_warning(
1725 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1726 return
1727 self._downloader.deprecation_warning(
1728 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1729 'Return _format_sort_fields in the info_dict instead')
1730 if formats:
784320c9 1731 formats[0]['__sort_fields'] = field_preference
59040888 1732
96a53167
S
1733 def _check_formats(self, formats, video_id):
1734 if formats:
1735 formats[:] = filter(
1736 lambda f: self._is_valid_url(
1737 f['url'], video_id,
1738 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1739 formats)
1740
f5bdb444
S
1741 @staticmethod
1742 def _remove_duplicate_formats(formats):
1743 format_urls = set()
1744 unique_formats = []
1745 for f in formats:
1746 if f['url'] not in format_urls:
1747 format_urls.add(f['url'])
1748 unique_formats.append(f)
1749 formats[:] = unique_formats
1750
45024183 1751 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1752 url = self._proto_relative_url(url, scheme='http:')
1753 # For now assume non HTTP(S) URLs always valid
1754 if not (url.startswith('http://') or url.startswith('https://')):
1755 return True
96a53167 1756 try:
45024183 1757 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1758 return True
8bdd16b4 1759 except ExtractorError as e:
25e911a9 1760 self.to_screen(
8bdd16b4 1761 '%s: %s URL is invalid, skipping: %s'
1762 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1763 return False
96a53167 1764
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1771
def _proto_relative_url(self, url, scheme=None):
    """Pass url through sanitize_url with the given scheme.

    scheme must end with ':' and defaults to http_scheme(); it is handed
    to sanitize_url without the trailing colon.
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    bare_scheme = scheme[:-1]
    return sanitize_url(url, scheme=bare_scheme)
57c7411f 1776
4094b6e3
PH
1777 def _sleep(self, timeout, video_id, msg_template=None):
1778 if msg_template is None:
f1a9d64e 1779 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1780 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1781 self.to_screen(msg)
1782 time.sleep(timeout)
1783
f983b875 1784 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1785 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1786 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
0b5546c7 1787 if self.get_param('ignore_no_formats_error'):
1788 fatal = False
1789
a076c1f9 1790 res = self._download_xml_handle(
f036a632 1791 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1792 'Unable to download f4m manifest',
1793 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1794 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1795 transform_source=transform_source,
7360c06f 1796 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 1797 if res is False:
8d29e47f 1798 return []
31bb8d3f 1799
a076c1f9
E
1800 manifest, urlh = res
1801 manifest_url = urlh.geturl()
1802
0fdbb332 1803 return self._parse_f4m_formats(
f983b875 1804 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1805 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1806
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Build the list of formats described by a parsed f4m manifest.

    manifest is the manifest's XML element and manifest_url the location it
    was loaded from. Returns [] when manifest is not an XML element and
    fatal is false, when an Akamai player verification challenge is
    present, or when no usable (non-DRM) media entries remain.
    Media entries that point at further f4m/m3u8 manifests are extracted
    recursively via _extract_f4m_formats/_extract_m3u8_formats.
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An element without text has text None; treat it as an empty string
    akamai_pv_text = (akamai_pv.text or '') if akamai_pv is not None else ''
    if ';' in akamai_pv_text:
        playerVerificationChallenge = akamai_pv_text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # For a stream-level manifest the format URL is the manifest itself
        format_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            # Relative media URLs are always resolved against the manifest's
            # own location (or its base URL), never against a previous media
            # entry, so manifest_url must not be overwritten inside this loop
            if media_url.startswith(('http://', 'https://')):
                format_url = media_url
            else:
                format_url = (
                    (manifest_base_url or '/'.join(manifest_url.split('/')[:-1]))
                    + '/' + media_url)
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(format_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    format_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': format_url,
            'manifest_url': format_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
1908
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """Return a format dict for the m3u8 playlist URL itself ('Quality selection URL').

    Its preference is the given one lowered by 100 (-100 when none is given).
    """
    meta_preference = preference - 100 if preference else -100
    meta_format = {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    return meta_format
1920
def _report_ignoring_subs(self, name):
    """Warn (only once) that subtitle tracks of the named manifest type are ignored."""
    notice = (
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(bug_reports_message(notice), only_once=True)
1926
a0c3b2d5
F
1927 def _extract_m3u8_formats(self, *args, **kwargs):
1928 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1929 if subs:
b5ae35ee 1930 self._report_ignoring_subs('HLS')
a0c3b2d5
F
1931 return fmts
1932
1933 def _extract_m3u8_formats_and_subtitles(
177877c5 1934 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1935 preference=None, quality=None, m3u8_id=None, note=None,
1936 errnote=None, fatal=True, live=False, data=None, headers={},
1937 query={}):
1938
0b5546c7 1939 if self.get_param('ignore_no_formats_error'):
1940 fatal = False
1941
71df9b7f 1942 if not m3u8_url:
1943 if errnote is not False:
1944 errnote = errnote or 'Failed to obtain m3u8 URL'
1945 if fatal:
1946 raise ExtractorError(errnote, video_id=video_id)
1947 self.report_warning(f'{errnote}{bug_reports_message()}')
1948 return [], {}
1949
dbd82a1d 1950 res = self._download_webpage_handle(
81515ad9 1951 m3u8_url, video_id,
37a3bb66 1952 note='Downloading m3u8 information' if note is None else note,
1953 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1954 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1955
dbd82a1d 1956 if res is False:
a0c3b2d5 1957 return [], {}
cb252080 1958
dbd82a1d 1959 m3u8_doc, urlh = res
37113045 1960 m3u8_url = urlh.geturl()
9cdffeeb 1961
a0c3b2d5 1962 return self._parse_m3u8_formats_and_subtitles(
cb252080 1963 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1964 preference=preference, quality=quality, m3u8_id=m3u8_id,
1965 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1966 headers=headers, query=query, video_id=video_id)
cb252080 1967
a0c3b2d5 1968 def _parse_m3u8_formats_and_subtitles(
42676437 1969 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1970 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1971 errnote=None, fatal=True, data=None, headers={}, query={},
1972 video_id=None):
60755938 1973 formats, subtitles = [], {}
a0c3b2d5 1974
6b993ca7 1975 has_drm = re.search('|'.join([
1976 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
1977 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
1978 ]), m3u8_doc)
a0c3b2d5 1979
60755938 1980 def format_url(url):
14f25df2 1981 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
60755938 1982
1983 if self.get_param('hls_split_discontinuity', False):
1984 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1985 if not m3u8_doc:
1986 if not manifest_url:
1987 return []
1988 m3u8_doc = self._download_webpage(
1989 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1990 note=False, errnote='Failed to download m3u8 playlist information')
1991 if m3u8_doc is False:
1992 return []
1993 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 1994
60755938 1995 else:
1996 def _extract_m3u8_playlist_indices(*args, **kwargs):
1997 return [None]
310c2ed2 1998
cb252080
S
1999 # References:
2000 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2001 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2002 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2003
2004 # We should try extracting formats only from master playlists [1, 4.3.4],
2005 # i.e. playlists that describe available qualities. On the other hand
2006 # media playlists [1, 4.3.3] should be returned as is since they contain
2007 # just the media without qualities renditions.
9cdffeeb 2008 # Fortunately, master playlist can be easily distinguished from media
cb252080 2009 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
a0566bbf 2010 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2011 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2012 # media playlist and MUST NOT appear in master playlist thus we can
2013 # clearly detect media playlist with this criterion.
2014
9cdffeeb 2015 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2016 formats = [{
34921b43 2017 'format_id': join_nonempty(m3u8_id, idx),
60755938 2018 'format_index': idx,
42676437 2019 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2020 'ext': ext,
2021 'protocol': entry_protocol,
2022 'preference': preference,
2023 'quality': quality,
88acdbc2 2024 'has_drm': has_drm,
60755938 2025 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2026
a0c3b2d5 2027 return formats, subtitles
cb252080
S
2028
2029 groups = {}
2030 last_stream_inf = {}
2031
2032 def extract_media(x_media_line):
2033 media = parse_m3u8_attributes(x_media_line)
2034 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2035 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2036 if not (media_type and group_id and name):
2037 return
2038 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2039 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2040 if media_type == 'SUBTITLES':
3907333c 2041 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2042 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2043 # However, lack of URI has been spotted in the wild.
2044 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2045 if not media.get('URI'):
2046 return
a0c3b2d5
F
2047 url = format_url(media['URI'])
2048 sub_info = {
2049 'url': url,
2050 'ext': determine_ext(url),
2051 }
4a2f19ab
F
2052 if sub_info['ext'] == 'm3u8':
2053 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2054 # files may contain is WebVTT:
2055 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2056 sub_info['ext'] = 'vtt'
2057 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2058 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2059 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2060 if media_type not in ('VIDEO', 'AUDIO'):
2061 return
2062 media_url = media.get('URI')
2063 if media_url:
310c2ed2 2064 manifest_url = format_url(media_url)
60755938 2065 formats.extend({
34921b43 2066 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2067 'format_note': name,
2068 'format_index': idx,
2069 'url': manifest_url,
2070 'manifest_url': m3u8_url,
2071 'language': media.get('LANGUAGE'),
2072 'ext': ext,
2073 'protocol': entry_protocol,
2074 'preference': preference,
2075 'quality': quality,
43a3eaf9 2076 'has_drm': has_drm,
60755938 2077 'vcodec': 'none' if media_type == 'AUDIO' else None,
2078 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2079
2080 def build_stream_name():
2081 # Despite specification does not mention NAME attribute for
3019cb0c
S
2082 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2083 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2084 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2085 stream_name = last_stream_inf.get('NAME')
2086 if stream_name:
2087 return stream_name
2088 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2089 # from corresponding rendition group
2090 stream_group_id = last_stream_inf.get('VIDEO')
2091 if not stream_group_id:
2092 return
2093 stream_group = groups.get(stream_group_id)
2094 if not stream_group:
2095 return stream_group_id
2096 rendition = stream_group[0]
2097 return rendition.get('NAME') or stream_group_id
2098
379306ef 2099 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2100 # chance to detect video only formats when EXT-X-STREAM-INF tags
2101 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2102 for line in m3u8_doc.splitlines():
2103 if line.startswith('#EXT-X-MEDIA:'):
2104 extract_media(line)
2105
704df56d
PH
2106 for line in m3u8_doc.splitlines():
2107 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2108 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2109 elif line.startswith('#') or not line.strip():
2110 continue
2111 else:
9c99bef7 2112 tbr = float_or_none(
3089bc74
S
2113 last_stream_inf.get('AVERAGE-BANDWIDTH')
2114 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2115 manifest_url = format_url(line.strip())
5ef62fc4 2116
60755938 2117 for idx in _extract_m3u8_playlist_indices(manifest_url):
2118 format_id = [m3u8_id, None, idx]
310c2ed2 2119 # Bandwidth of live streams may differ over time thus making
2120 # format_id unpredictable. So it's better to keep provided
2121 # format_id intact.
2122 if not live:
60755938 2123 stream_name = build_stream_name()
34921b43 2124 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2125 f = {
34921b43 2126 'format_id': join_nonempty(*format_id),
60755938 2127 'format_index': idx,
310c2ed2 2128 'url': manifest_url,
2129 'manifest_url': m3u8_url,
2130 'tbr': tbr,
2131 'ext': ext,
2132 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2133 'protocol': entry_protocol,
2134 'preference': preference,
2135 'quality': quality,
43a3eaf9 2136 'has_drm': has_drm,
310c2ed2 2137 }
2138 resolution = last_stream_inf.get('RESOLUTION')
2139 if resolution:
2140 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2141 if mobj:
2142 f['width'] = int(mobj.group('width'))
2143 f['height'] = int(mobj.group('height'))
2144 # Unified Streaming Platform
2145 mobj = re.search(
2146 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2147 if mobj:
2148 abr, vbr = mobj.groups()
2149 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2150 f.update({
2151 'vbr': vbr,
2152 'abr': abr,
2153 })
2154 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2155 f.update(codecs)
2156 audio_group_id = last_stream_inf.get('AUDIO')
2157 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2158 # references a rendition group MUST have a CODECS attribute.
62b58c09 2159 # However, this is not always respected. E.g. [2]
310c2ed2 2160 # contains EXT-X-STREAM-INF tag which references AUDIO
2161 # rendition group but does not have CODECS and despite
2162 # referencing an audio group it represents a complete
2163 # (with audio and video) format. So, for such cases we will
2164 # ignore references to rendition groups and treat them
2165 # as complete formats.
2166 if audio_group_id and codecs and f.get('vcodec') != 'none':
2167 audio_group = groups.get(audio_group_id)
2168 if audio_group and audio_group[0].get('URI'):
2169 # TODO: update acodec for audio only formats with
2170 # the same GROUP-ID
2171 f['acodec'] = 'none'
fc21af50 2172 if not f.get('ext'):
2173 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2174 formats.append(f)
2175
2176 # for DailyMotion
2177 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2178 if progressive_uri:
2179 http_f = f.copy()
2180 del http_f['manifest_url']
2181 http_f.update({
2182 'format_id': f['format_id'].replace('hls-', 'http-'),
2183 'protocol': 'http',
2184 'url': progressive_uri,
2185 })
2186 formats.append(http_f)
5ef62fc4 2187
cb252080 2188 last_stream_inf = {}
a0c3b2d5 2189 return formats, subtitles
704df56d 2190
3cf4b91d
C
def _extract_m3u8_vod_duration(
        self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """
    Download the manifest at m3u8_vod_url and return its duration.

    The download is non-fatal; when it yields nothing, an empty string is
    handed to _parse_m3u8_vod_duration instead. The return value is
    whatever _parse_m3u8_vod_duration returns.
    """
    if note is None:
        note = 'Downloading m3u8 VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    playlist = self._download_webpage(
        m3u8_vod_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if not playlist:
        playlist = ''

    return self._parse_m3u8_vod_duration(playlist, video_id)
2201
2202 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
5ab3534d 2203 if '#EXT-X-ENDLIST' not in m3u8_vod:
3cf4b91d
C
2204 return None
2205
2206 return int(sum(
2207 float(line[len('#EXTINF:'):].split(',')[0])
2208 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2209
def _extract_mpd_vod_duration(
        self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """
    Download the MPD at mpd_url and return its presentation duration.

    The download is non-fatal. The 'mediaPresentationDuration' attribute of
    the result is passed through parse_duration and int_or_none; a falsy
    download result is treated as having no such attribute.
    """
    if note is None:
        note = 'Downloading MPD VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    manifest = self._download_xml(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    # Fall back to an empty mapping so that .get() below is always available
    attrs = manifest or {}
    return int_or_none(parse_duration(attrs.get('mediaPresentationDuration')))
2219
a107193e
S
2220 @staticmethod
2221 def _xpath_ns(path, namespace=None):
2222 if not namespace:
2223 return path
2224 out = []
2225 for c in path.split('/'):
2226 if not c or c == '.':
2227 out.append(c)
2228 else:
2229 out.append('{%s}%s' % (namespace, c))
2230 return '/'.join(out)
2231
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """
    Download a SMIL manifest and parse it into (formats, subtitles).

    fatal is forced to False when the 'ignore_no_formats_error' param is set.
    When the download returns False, ([], {}) is returned.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    downloaded = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if downloaded is False:
        assert not fatal
        return [], {}

    smil_doc, url_handle = downloaded
    # Use the URL reported by the response handle from here on
    final_url = url_handle.geturl()
    ns = self._parse_smil_namespace(smil_doc)

    formats = self._parse_smil_formats(
        smil_doc, final_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil_doc, namespace=ns)
    return formats, subtitles
2252
def _extract_smil_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_smil_formats_and_subtitles.

    Takes the same arguments; if any subtitles were found they are dropped
    after calling _report_ignoring_subs('SMIL').
    """
    formats, subtitles = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('SMIL')
    return formats
a107193e
S
2258
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """
    Download a SMIL manifest and return the dict built by _parse_smil.

    Returns {} when the download returns False.
    """
    downloaded = self._download_smil(smil_url, video_id, fatal=fatal)
    if downloaded is False:
        return {}

    smil_doc, url_handle = downloaded
    # Parse relative to the URL reported by the response handle
    return self._parse_smil(
        smil_doc, url_handle.geturl(), video_id, f4m_params=f4m_params)
2268
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch smil_url through _download_xml_handle and return its result unchanged."""
    note, errnote = 'Downloading SMIL file', 'Unable to download SMIL file'
    return self._download_xml_handle(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
a107193e
S
2273
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document.

    Formats and subtitles come from _parse_smil_formats and
    _parse_smil_subtitles. Title, description and upload date are taken
    from the first matching ./head/meta elements, and thumbnails from
    every <image> element that has a src. The returned 'id' is the
    basename of smil_url without its extension, not the video_id passed in.
    """
    ns = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=ns)

    # From here on the id is derived from the manifest file name
    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', ns)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        # Only the first non-empty value of each kind is kept
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', ns)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2313
17712eeb
S
def _parse_smil_namespace(self, smil):
    """
    Look up the namespace of the root element's tag.

    The tag is matched case-insensitively against '{<namespace>}smil' via
    _search_regex, with None as the default when it does not match.
    """
    root_tag_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(root_tag_re, smil.tag, 'namespace', default=None)
2317
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Build the list of formats described by a parsed SMIL document.

    Every distinct src among the <video> and <audio> elements yields:
      - an RTMP format when proto is 'rtmp' or the streamer/base URL starts
        with 'rtmp';
      - the formats of the referenced HLS / HDS / DASH / ISM manifest when
        the source is recognised as one;
      - otherwise a single plain HTTP format, provided the resolved URL
        passes _is_valid_url.
    Every distinct src among <imagestream> elements is appended as a
    'SMIL storyboards' format.

    @param smil               Root element of the SMIL document
    @param smil_url           URL of the document; base for relative sources
                              unless a head/meta element carries a 'base' or
                              'httpBase' attribute
    @param video_id           Passed on to the nested manifest extractors
    @param namespace          Namespace used to qualify element lookups
    @param f4m_params         Query parameters appended to f4m URLs; a default
                              hdcore/plugin pair is used when falsy
    @param transform_rtmp_url Optional callable (streamer, src) -> (streamer, src)
                              applied to each RTMP format
    @returns                  List of format dicts
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    # Sources already handled; shared between media and imagestream elements
    srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        # A HEAD request is only issued when neither the URL nor the 'ext'
        # attribute reveals the extension
        src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
            self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A single-variant playlist inherits the metadata of the SMIL entry
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Validate the resolved URL that is actually stored in the format,
        # not the raw (possibly relative or unstripped) src attribute
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats
2432
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """
    Collect subtitle tracks from the <textstream> elements of a SMIL document.

    @param smil            Root element of the SMIL document
    @param namespace       Namespace used to qualify the element lookup
    @param subtitles_lang  Language key used when an element carries none of
                           'systemLanguage', 'systemLanguageName' or 'lang'
    @returns               Dict mapping language to a list of
                           {'url': ..., 'ext': ...} dicts; each distinct src
                           is included once
    """
    seen_srcs = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
        lang = (
            textstream.get('systemLanguage')
            or textstream.get('systemLanguageName')
            or textstream.get('lang')
            or subtitles_lang)
        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
63757032 2448
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """
    Download an XSPF playlist and return the entries built by _parse_xspf.

    Returns [] when the download returns False.
    """
    downloaded = self._download_xml_handle(
        xspf_url, playlist_id, 'Downloading xpsf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if downloaded is False:
        return []

    xspf_doc, url_handle = downloaded
    # Use the URL reported by the response handle for resolving locations
    final_url = url_handle.geturl()

    return self._parse_xspf(
        xspf_doc, playlist_id,
        xspf_url=final_url, xspf_base_url=base_url(final_url))
8d6765cf 2462
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """
    Convert the tracks of a parsed XSPF document into a list of entry dicts.

    Every entry uses playlist_id as its 'id'; the title also falls back to
    playlist_id. Each <location> of a track becomes one format whose URL is
    joined onto xspf_base_url; locations that yield no URL are skipped.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def qualified(path):
        # Expand the 'xspf:' / 's1:' prefixes through NS_MAP
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(qualified('./xspf:trackList/xspf:track')):
        title = xpath_text(track, qualified('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, qualified('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, qualified('./xspf:image'), 'thumbnail')
        raw_duration = xpath_text(track, qualified('./xspf:duration'), 'duration')
        duration = float_or_none(raw_duration, 1000)

        formats = []
        for location in track.findall(qualified('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(qualified('s1:label')),
                    'width': int_or_none(location.get(qualified('s1:width'))),
                    'height': int_or_none(location.get(qualified('s1:height'))),
                })

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
2502
171e59ed
F
def _extract_mpd_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_mpd_formats_and_subtitles.

    Takes the same arguments; if any subtitles were found they are dropped
    after calling _report_ignoring_subs('DASH').
    """
    formats, subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2508
def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """
    Download an MPD manifest and parse it into (formats, subtitles).

    fatal is forced to False when the 'ignore_no_formats_error' param is set.
    ([], {}) is returned when the download returns False or the downloaded
    document is None.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    downloaded = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if downloaded is False:
        return [], {}
    manifest, url_handle = downloaded
    if manifest is None:
        return [], {}

    # We could have been redirected to a new url when we retrieved our mpd file.
    final_url = url_handle.geturl()
    return self._parse_mpd_formats_and_subtitles(
        manifest, mpd_id, base_url(final_url), final_url)
2d2fa82d 2533
171e59ed
F
def _parse_mpd_formats(self, *args, **kwargs):
    """
    Formats-only variant of _parse_mpd_formats_and_subtitles.

    Takes the same arguments; if any subtitles were found they are dropped
    after calling _report_ignoring_subs('DASH').
    """
    formats, subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2539
def _parse_mpd_formats_and_subtitles(
        self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.

    @param mpd_doc       Root element of the MPD document
    @param mpd_id        Optional prefix for the generated format ids
    @param mpd_base_url  Base used to resolve BaseURL values that are not
                         absolute http(s) URLs
    @param mpd_url       URL of the manifest; stored as 'manifest_url' and
                         used as 'url' of fragmented formats when non-empty
    @returns             (formats, subtitles): a list of format dicts and a
                         dict mapping language ('und' when unknown) to lists
                         of subtitle dicts. ([], {}) for a manifest of type
                         'dynamic' when the 'dynamic_mpd' param is disabled.
    @raises KeyError     When a Representation has no mimeType, neither on
                         itself nor on its AdaptationSet

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            return [], {}

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent's info so that values set at
        # Period / AdaptationSet level are inherited unless overridden here
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats, subtitles = [], {}
    # Counts how many formats share the same URL, per URL
    stream_numbers = collections.defaultdict(int)
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Concatenate BaseURL values from the innermost element outwards
                # until an absolute http(s) URL is obtained
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First of, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    t = ''
                    in_template = False
                    for c in tmpl:
                        t += c
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            t += c
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally, unescape '$$' into a literal '$'. str.replace
                    # returns a new string, so the result must be assigned back.
                    t = t.replace('$$', '$')
                    return t

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for num, s in enumerate(representation_ms_info['s']):
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for r in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for r in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    formats.append(f)
                elif content_type == 'text':
                    subtitles.setdefault(lang or 'und', []).append(f)

    return formats, subtitles
17b598d3 2877
fd76a142
F
def _extract_ism_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_ism_formats_and_subtitles.

    Takes the same arguments; if any subtitles were found they are dropped
    after calling _report_ignoring_subs('ISM').
    """
    formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('ISM')
    return formats
2883
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """
    Download an ISM manifest and parse it into (formats, subtitles).

    fatal is forced to False when the 'ignore_no_formats_error' param is set.
    ([], {}) is returned when the download returns False or the downloaded
    document is None.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    downloaded = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if downloaded is False:
        return [], {}
    manifest, url_handle = downloaded
    if manifest is None:
        return [], {}

    # Parse relative to the URL reported by the response handle
    return self._parse_ism_formats_and_subtitles(manifest, url_handle.geturl(), ism_id)
b2758123 2900
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.
    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx

    @param ism_doc  Parsed manifest root element
    @param ism_url  URL of the manifest; used to resolve fragment URLs and
                    stored as 'url'/'manifest_url' of every entry
    @param ism_id   Optional prefix for the generated format ids
    @returns        (formats, subtitles); both empty for live manifests
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    # AudioTag values used to derive the codec when FourCC is absent
    KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time of
                    # the NEXT fragment in the list. (Indexing the current
                    # <c> element itself, which has no children, always
                    # raised IndexError and fell back to the total duration.)
                    try:
                        next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                    except (IndexError, KeyError):
                        # Last fragment, or the next one carries no 't'
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    'language': stream_language,
                    'audio_channels': int_or_none(track.get('Channels')),
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
b2758123 3013
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from <video>/<audio> style tags found in webpage.

    Scans for (amp-/dl8-/dl8-live- prefixed) video and audio tags, reads
    their own source attribute plus nested <source> and <track> tags, and
    returns a list of dicts with 'formats', 'subtitles' and 'thumbnail'.
    Tags yielding neither formats nor subtitles are omitted. Relative URLs
    are resolved against base_url, which is also set as the Referer header
    of every format.
    """
    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Turn a `type` attribute ("mime/type; codecs=...") into a partial
        # format dict (codec fields + 'ext'); {} when absent or unparsable
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
        # non-fatally, anything else becomes a single direct-URL format
        type_info = type_info or {}
        full_url = absolute_url(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
                'ext': ext,
            }]
        return is_plain_url, formats

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags first; they get an empty string as their content
    media_tags = [(media_tag, media_tag_name, media_type, '')
                  for media_tag, media_tag_name, media_type
                  in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if is_plain_url:
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        s_attr.get(lbl)
                        for lbl in ('label', 'title')
                        if str_or_none(s_attr.get(lbl))
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = (int_or_none(s_attr.get('height'))
                              or int_or_none(s_attr.get('res')))
                    if not width or not height:
                        for lbl in labels:
                            resolution = parse_resolution(lbl)
                            if not resolution:
                                continue
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                    # First label that parses as a bitrate wins; None if none does
                    for lbl in labels:
                        tbr = parse_bitrate(lbl)
                        if tbr:
                            break
                    else:
                        tbr = None
                    f.update({
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                    })
                    # The direct-URL format's keys override the type-derived ones
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # A track without `kind` is treated like subtitles/captions
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        for f in media_info['formats']:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
3139
f6a1d69a
F
3140 def _extract_akamai_formats(self, *args, **kwargs):
3141 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3142 if subs:
b5ae35ee 3143 self._report_ignoring_subs('akamai')
f6a1d69a
F
3144 return fmts
3145
3146 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3147 signed = 'hdnea=' in manifest_url
3148 if not signed:
3149 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3150 manifest_url = re.sub(
3151 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3152 '', manifest_url).strip('?')
3153
c7c43a93 3154 formats = []
f6a1d69a 3155 subtitles = {}
70c5802b 3156
e71a4509 3157 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3158 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3159 hds_host = hosts.get('hds')
3160 if hds_host:
3161 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3162 if 'hdcore=' not in f4m_url:
3163 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3164 f4m_formats = self._extract_f4m_formats(
3165 f4m_url, video_id, f4m_id='hds', fatal=False)
3166 for entry in f4m_formats:
3167 entry.update({'extra_param_to_segment_url': hdcore_sign})
3168 formats.extend(f4m_formats)
70c5802b 3169
c4251b9a
RA
3170 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3171 hls_host = hosts.get('hls')
3172 if hls_host:
3173 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3174 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3175 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3176 m3u8_id='hls', fatal=False)
3177 formats.extend(m3u8_formats)
f6a1d69a 3178 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3179
3180 http_host = hosts.get('http')
29f7c58a 3181 if http_host and m3u8_formats and not signed:
3182 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3183 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3184 qualities_length = len(qualities)
29f7c58a 3185 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3186 i = 0
29f7c58a 3187 for f in m3u8_formats:
3188 if f['vcodec'] != 'none':
70c5802b 3189 for protocol in ('http', 'https'):
3190 http_f = f.copy()
3191 del http_f['manifest_url']
3192 http_url = re.sub(
86e5f3ed 3193 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3194 http_f.update({
3195 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3196 'url': http_url,
3197 'protocol': protocol,
3198 })
29f7c58a 3199 formats.append(http_f)
70c5802b 3200 i += 1
70c5802b 3201
f6a1d69a 3202 return formats, subtitles
c7c43a93 3203
6ad02195 3204 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3205 query = urllib.parse.urlparse(url).query
6ad02195 3206 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3207 mobj = re.search(
3208 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3209 url_base = mobj.group('url')
3210 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3211 formats = []
044eeb14
S
3212
3213 def manifest_url(manifest):
86e5f3ed 3214 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3215 if query:
3216 m_url += '?%s' % query
3217 return m_url
3218
6ad02195
RA
3219 if 'm3u8' not in skip_protocols:
3220 formats.extend(self._extract_m3u8_formats(
044eeb14 3221 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3222 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3223 if 'f4m' not in skip_protocols:
3224 formats.extend(self._extract_f4m_formats(
044eeb14 3225 manifest_url('manifest.f4m'),
6ad02195 3226 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3227 if 'dash' not in skip_protocols:
3228 formats.extend(self._extract_mpd_formats(
044eeb14 3229 manifest_url('manifest.mpd'),
0384932e 3230 video_id, mpd_id='dash', fatal=False))
6ad02195 3231 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3232 if 'smil' not in skip_protocols:
3233 rtmp_formats = self._extract_smil_formats(
044eeb14 3234 manifest_url('jwplayer.smil'),
6ad02195
RA
3235 video_id, fatal=False)
3236 for rtmp_format in rtmp_formats:
3237 rtsp_format = rtmp_format.copy()
3238 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3239 del rtsp_format['play_path']
3240 del rtsp_format['ext']
3241 rtsp_format.update({
3242 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3243 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3244 'protocol': 'rtsp',
3245 })
3246 formats.extend([rtmp_format, rtsp_format])
3247 else:
3248 for protocol in ('rtmp', 'rtsp'):
3249 if protocol not in skip_protocols:
3250 formats.append({
86e5f3ed 3251 'url': f'{protocol}:{url_base}',
6ad02195
RA
3252 'format_id': protocol,
3253 'protocol': protocol,
3254 })
3255 return formats
3256
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate the options passed to jwplayer(...).setup(...) in webpage.

    Returns the options parsed into a dict, or None when no setup call is
    found, parsing raises ExtractorError, or the parsed value is not a dict.
    """
    setup_call = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if not setup_call:
        return None

    try:
        options = self._parse_json(
            setup_call.group('options'),
            video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        return None

    if isinstance(options, dict):
        return options
    return None
a4a554a7
YCH
3271
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in webpage and parse it.

    Extra positional/keyword arguments are forwarded to
    _parse_jwplayer_data().
    """
    setup_options = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(setup_options, video_id, *args, **kwargs)
3277
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer setup data into an info dict or a playlist result.

    Returns an empty list when jwplayer_data is not a dict, the single entry
    when exactly one playlist item was parsed, and otherwise the value of
    playlist_result() over all entries. With require_title, a playlist item
    lacking 'title' raises KeyError; likewise 'mediaid' is required when no
    video_id is given. Note that playlist items lacking 'sources' are
    modified in place.
    """
    entries = []
    if not isinstance(jwplayer_data, dict):
        return entries

    playlist_items = jwplayer_data.get('playlist')
    # JWPlayer backward compatibility: single playlist item/flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if not isinstance(playlist_items, list):
        playlist_items = (playlist_items or jwplayer_data, )

    for video_data in playlist_items:
        if not isinstance(video_data, dict):
            continue
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                if not isinstance(track, dict):
                    continue
                track_kind = track.get('kind')
                if not track_kind or not isinstance(track_kind, str):
                    continue
                if track_kind.lower() not in ('captions', 'subtitles'):
                    continue
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                # Tracks without a label are filed under 'en'
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                })

        entry = {
            'id': this_video_id,
            'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
            'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
            'genre': clean_html(video_data.get('genre')),
            'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
            'release_year': int_or_none(video_data.get('releasedate')),
            'age_limit': int_or_none(video_data.get('age_restriction')),
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        # A lone YouTube URL is handed over as a url_transparent entry
        if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            entry['formats'] = formats
        entries.append(entry)
    if len(entries) == 1:
        return entries[0]
    else:
        return self.playlist_result(entries)
3352
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert a list of JWPlayer source dicts into a list of formats.

    Non-dict sources, sources without a usable 'file' URL and repeated URLs
    are skipped. HLS, DASH and SMIL sources are expanded non-fatally through
    the respective manifest extractors; audio sources and everything else
    become single formats. rtmp_params, when given, is merged into every
    RTMP format.
    """
    urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in urls:
            continue
        urls.add(source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
        elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        elif source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
        else:
            format_id = str_or_none(source.get('label'))
            height = int_or_none(source.get('height'))
            if height is None and format_id:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = parse_resolution(format_id).get('height')
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate'), scale=1000),
                'filesize': int_or_none(source.get('filesize')),
                'ext': ext,
                'format_id': format_id
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                # Split at the first mp4:/mp3:/flv: prefix into the
                # connection URL and the play path
                rtmp_url_parts = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, 1)
                if len(rtmp_url_parts) == 3:
                    rtmp_url, prefix, play_path = rtmp_url_parts
                    a_format.update({
                        'url': rtmp_url,
                        'play_path': prefix + play_path,
                    })
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
    return formats
3417
f4b1c7ad 3418 def _live_title(self, name):
39ca3b5c 3419 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3420 return name
f4b1c7ad 3421
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Parse v with int_or_none(v, **kwargs), reporting a failed parse.

    When parsing yields None, an ExtractorError is raised if fatal is set;
    otherwise a warning is reported and None is returned.
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed

    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3431
def _float(self, v, name, fatal=False, **kwargs):
    """Parse v with float_or_none(v, **kwargs), reporting a failed parse.

    When parsing yields None, an ExtractorError is raised if fatal is set;
    otherwise a warning is reported and None is returned.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed

    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3441
40e41780
TF
3442 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3443 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3444 cookie = http.cookiejar.Cookie(
4ed2d7b7 3445 0, name, value, port, port is not None, domain, True,
40e41780
TF
3446 domain.startswith('.'), path, True, secure, expire_time,
3447 discard, None, None, rest)
9809740b 3448 self.cookiejar.set_cookie(cookie)
42939b61 3449
def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    # Built from the Cookie header the downloader's jar produces for url
    cookie_header = self._downloader.cookiejar.get_cookie_header(url)
    return LenientSimpleCookie(cookie_header)
799207e8 3453
e3c1266f 3454 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3455 """
3456 Apply first Set-Cookie header instead of the last. Experimental.
3457
3458 Some sites (e.g. [1-3]) may serve two cookies under the same name
3459 in Set-Cookie header and expect the first (old) one to be set rather
3460 than second (new). However, as of RFC6265 the newer one cookie
3461 should be set into cookie store what actually happens.
3462 We will workaround this issue by resetting the cookie to
3463 the first one manually.
3464 1. https://new.vk.com/
3465 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3466 3. https://learning.oreilly.com/
3467 """
e3c1266f
S
3468 for header, cookies in url_handle.headers.items():
3469 if header.lower() != 'set-cookie':
3470 continue
cfb0511d 3471 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3472 cookie_value = re.search(
3473 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3474 if cookie_value:
3475 value, domain = cookie_value.groups()
3476 self._set_cookie(domain, cookie, value)
3477 break
3478
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield this class's own test cases, each tagged with the IE key.

    Uses `_TEST` when the class itself defines a truthy one, else its own
    `_TESTS`. 'only_matching' cases are skipped unless include_onlymatching
    is set. Test cases of a `__wrapped__` class, if any, follow.
    """
    # Do not look in super classes
    own_attrs = vars(cls)
    single_test = own_attrs.get('_TEST')
    if single_test:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        candidates = [single_test]
    else:
        candidates = own_attrs.get('_TESTS', [])

    for testcase in candidates:
        if include_onlymatching or not testcase.get('only_matching', False):
            testcase['name'] = cls.ie_key()
            yield testcase

    if getattr(cls, '__wrapped__', None):
        yield from cls.__wrapped__.get_testcases(include_onlymatching)
05900629 3495
@classmethod
def get_webpage_testcases(cls):
    """Yield this class's own `_WEBPAGE_TESTS`, each tagged with the IE key.

    Webpage test cases of a `__wrapped__` class, if any, follow.
    """
    for testcase in vars(cls).get('_WEBPAGE_TESTS', []):
        testcase['name'] = cls.ie_key()
        yield testcase

    if getattr(cls, '__wrapped__', None):
        yield from cls.__wrapped__.get_webpage_testcases()
f2e8dbcc 3504
@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases"""
    testcases = (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases())
    # Looks at each test's own info_dict and at that of its first playlist item
    age_limit_path = (..., (('playlist', 0), None), 'info_dict', 'age_limit')
    found_limits = traverse_obj(testcases, age_limit_path)
    return max(found_limits or [0])
3511
@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None

    def is_playlist_test(test):
        # A test counts as a playlist test if any key starts with 'playlist'
        return any(key.startswith('playlist') for key in test)

    if not any(map(is_playlist_test, tests)):
        return 'video'
    if all(map(is_playlist_test, tests)):
        return 'playlist'
    return 'any'
3523
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    if not cls.suitable(url):
        return None
    return_type = cls._RETURN_TYPE
    if return_type == 'video':
        return True
    if return_type == 'playlist':
        return False
    return None
171a31db 3529
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
05900629 3534
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    details = []

    if cls._NETRC_MACHINE:
        if markdown:
            details.append(f' [*{cls._NETRC_MACHINE}*](## "netrc machine")')
        else:
            details.append(f' [{cls._NETRC_MACHINE}]')

    if cls.IE_DESC is False:
        details.append(' [HIDDEN]')
    elif cls.IE_DESC:
        details.append(f' {cls.IE_DESC}')

    if cls.SEARCH_KEY:
        details.append(f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            details.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')

    if not cls.working():
        details.append(' (**Currently broken**)' if markdown else ' (Currently broken)')

    desc = ''.join(details)

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME

    if not desc:
        return name
    return f'{name}:{desc}'
3559
def extract_subtitles(self, *args, **kwargs):
    """Return _get_subtitles(*args, **kwargs) if subtitles are wanted, else {}.

    Subtitles are wanted when either the 'writesubtitles' or the
    'listsubtitles' param is set.
    """
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3565
3566 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3567 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3568
0cf643b2
M
class CommentsDisabled(Exception):
    """To be raised from _get_comments when the video has comments disabled"""
3571
def extract_comments(self, *args, **kwargs):
    """Return a callable that collects the comments, or None if not wanted.

    Nothing is returned unless the 'getcomments' param is set. Otherwise
    _get_comments(*args, **kwargs) is called right away and the returned
    function, when called, drains it and returns a dict with 'comments'
    and 'comment_count'.
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        comments = []
        # Only a clean exhaustion of the generator clears this flag
        interrupted = True
        try:
            while True:
                comments.append(next(generator))
        except StopIteration:
            interrupted = False
        except KeyboardInterrupt:
            # Keep whatever was collected so far
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            # Re-raise unless 'ignoreerrors' is exactly True
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        comment_count = len(comments)
        self.to_screen(f'Extracted {comment_count} comments')
        return {
            'comments': comments,
            # The count is not reported when extraction stopped early
            'comment_count': None if interrupted else comment_count
        }
    return extractor
3600
3601 def _get_comments(self, *args, **kwargs):
3602 raise NotImplementedError('This method must be implemented by subclasses')
3603
912e0b7e
YCH
3604 @staticmethod
3605 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3606 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3607 will be dropped. """
86e5f3ed 3608 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3609 ret = list(subtitle_list1)
a44ca5a4 3610 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3611 return ret
3612
3613 @classmethod
46890374 3614 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3615 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3616 if target is None:
3617 target = {}
3618 for d in dicts:
3619 for lang, subs in d.items():
3620 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3621 return target
912e0b7e 3622
def extract_automatic_captions(self, *args, **kwargs):
    """Return _get_automatic_captions(*args, **kwargs) if wanted, else {}.

    Automatic captions are wanted when either the 'writeautomaticsub' or
    the 'listsubtitles' param is set.
    """
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3628
3629 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3630 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3631
2762dbb1 3632 @functools.cached_property
24146491 3633 def _cookies_passed(self):
3634 """Whether cookies have been passed to YoutubeDL"""
3635 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3636
def mark_watched(self, *args, **kwargs):
    """Call _mark_watched(*args, **kwargs) if enabled and credentials exist.

    Does nothing unless the 'mark_watched' param is set. The video is then
    marked when login is supported and a username is available, or when
    cookies have been passed.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3642
3643 def _mark_watched(self, *args, **kwargs):
3644 raise NotImplementedError('This method must be implemented by subclasses')
3645
38cce791
YCH
def geo_verification_headers(self):
    """Return a headers dict holding the 'geo_verification_proxy' param under
    'Ytdl-request-proxy', or an empty dict when that param is unset/empty."""
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3652
8f97a15d 3653 @staticmethod
3654 def _generic_id(url):
14f25df2 3655 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3656
62b8dac4 3657 def _generic_title(self, url='', webpage='', *, default=None):
3658 return (self._og_search_title(webpage, default=None)
3659 or self._html_extract_title(webpage, default=None)
3660 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3661 or default)
98763ee3 3662
22ccd542 3663 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3664 if not duration:
3665 return
3666 chapter_list = [{
3667 'start_time': start_function(chapter),
3668 'title': title_function(chapter),
3669 } for chapter in chapter_list or []]
84ffeb7d 3670 if strict:
3671 warn = self.report_warning
3672 else:
3673 warn = self.write_debug
22ccd542 3674 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3675
3676 chapters = [{'start_time': 0}]
3677 for idx, chapter in enumerate(chapter_list):
3678 if chapter['start_time'] is None:
84ffeb7d 3679 warn(f'Incomplete chapter {idx}')
22ccd542 3680 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3681 chapters.append(chapter)
3682 elif chapter not in chapters:
84ffeb7d 3683 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3684 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3685 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
22ccd542 3686 return chapters[1:]
3687
3688 def _extract_chapters_from_description(self, description, duration):
3689 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3690 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3691 return self._extract_chapters_helper(
3692 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3693 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3694 duration=duration, strict=False) or self._extract_chapters_helper(
3695 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3696 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3697 duration=duration, strict=False)
3698
c224251a 3699 @staticmethod
b0089e89 3700 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3701 all_known = all(map(
3702 lambda x: x is not None,
3703 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3704 return (
3705 'private' if is_private
3706 else 'premium_only' if needs_premium
3707 else 'subscriber_only' if needs_subscription
3708 else 'needs_auth' if needs_auth
3709 else 'unlisted' if is_unlisted
3710 else 'public' if all_known
3711 else None)
3712
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Extractor key (a string), or an object providing ie_key(),
                        whose arguments are looked up. Defaults to this extractor
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
5d3a0e79 3725
f40ee5e9 3726 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3727 if not playlist_id or not video_id:
3728 return not video_id
3729
3730 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3731 if no_playlist is not None:
3732 return not no_playlist
3733
3734 video_id = '' if video_id is True else f' {video_id}'
3735 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3736 if self.get_param('noplaylist'):
3737 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3738 return False
3739 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3740 return True
3741
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Hand err to RetryManager.report_retry() using this extractor's output methods.

    When "fatal" is false, report_warning() is passed as the error callback
    instead of None. "_count" falls back to int(fatal) when falsy.
    """
    sleep_functions = self.get_param('retry_sleep_functions', {})
    on_error = None if fatal else self.report_warning
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen, warn=self.report_warning, error=on_error,
        sleep_func=sleep_functions.get('extractor'))
be5c1ae8 3747
def RetryManager(self, **kwargs):
    """Create a RetryManager using the 'extractor_retries' param (default 3)
    and _error_or_warning() as its error callback."""
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3750
def _extract_generic_embeds(self, url, *args, info_dict=None, note='Extracting generic embeds', **kwargs):
    """Extract embeds from url using the Generic extractor.

    @param info_dict  Optional dict; its 'display_id' (or else 'id') is used
                      to prefix the note printed to screen
    @param note       The message printed to screen
    @returns          Whatever the Generic extractor's _extract_embeds() returns.
                      This extractor's key is smuggled into the url as 'block_ies'

    Remaining positional and keyword arguments are passed on to _extract_embeds().
    """
    # A None sentinel replaces the former mutable `{}` default (shared between calls)
    display_id = traverse_obj(info_dict or {}, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    return generic_ie._extract_embeds(
        smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3756
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of _extract_from_webpage(), after passing each through
    ydl.add_default_extra_info().

    The class itself is used when _extract_from_webpage is a bound method on
    the class; otherwise an instance is requested from ydl.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())

    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
8f97a15d 3765
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result() for each distinct embed url returned by _extract_embed_urls().

    The result is tied to this class, unless its _VALID_URL is False.
    """
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        target_ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, target_ie)
8f97a15d 3771
3772 @classmethod
3773 def _extract_embed_urls(cls, url, webpage):
3774 """@returns all the embed urls on the webpage"""
3775 if '_EMBED_URL_RE' not in cls.__dict__:
3776 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3777 for idx, regex in enumerate(cls._EMBED_REGEX):
3778 assert regex.count('(?P<url>') == 1, \
3779 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3780 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3781
3782 for regex in cls._EMBED_URL_RE:
3783 for mobj in regex.finditer(webpage):
3784 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3785 if cls._VALID_URL is False or cls.suitable(embed_url):
3786 yield embed_url
3787
class StopExtraction(Exception):
    """Plain Exception subclass that carries no state of its own."""
3790
bfd973ec 3791 @classmethod
3792 def _extract_url(cls, webpage): # TODO: Remove
3793 """Only for compatibility with some older extractors"""
3794 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3795
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Register a subclass declared with a "plugin_name" as an override of its parent.

    Such a class records the next class in its MRO as __wrapped__, takes over
    its ie_key, gets the IE_NAME "<parent IE_NAME>+<plugin_name>" and replaces
    the innermost wrapped class in that class's module. It is also appended
    to _PLUGIN_OVERRIDES for that innermost class.
    """
    if plugin_name:
        mro = inspect.getmro(cls)
        parent = cls.__wrapped__ = mro[mro.index(cls) + 1]
        cls.PLUGIN_NAME = plugin_name
        cls.ie_key = parent.ie_key
        cls.IE_NAME = f'{parent.IE_NAME}+{plugin_name}'
        # Follow the chain of overrides down to the class that was wrapped first
        while getattr(parent, '__wrapped__', None):
            parent = parent.__wrapped__
        setattr(sys.modules[parent.__module__], parent.__name__, cls)
        _PLUGIN_OVERRIDES[parent].append(cls)

    return super().__init_subclass__(**kwargs)
3809
8dbe9899 3810
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Fetch 1 result for an empty prefix, _MAX_RESULTS for "all",
        and otherwise the given number (capped at _MAX_RESULTS)."""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice() needs None, rather than infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
fe7866d0 3854
3855
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _VALID_URL matches anything and whose extraction
    always raises UnsupportedError for the given url."""

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        raise UnsupportedError(url)
e756f45b
M
3863
3864
# Maps an overridden extractor class to the list of plugin classes overriding it
# (filled in by InfoExtractor.__init_subclass__)
_PLUGIN_OVERRIDES = collections.defaultdict(list)