]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[cleanup] Fix infodict returned fields (#8906)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
2314b4d8 8import inspect
cc16383f 9import itertools
3d3538e4 10import json
f8271158 11import math
4094b6e3 12import netrc
d6983cb4 13import os
773f291d 14import random
6929b41a 15import re
db3ad8a6 16import subprocess
d6983cb4 17import sys
4094b6e3 18import time
8f97a15d 19import types
14f25df2 20import urllib.parse
ac668111 21import urllib.request
f8271158 22import xml.etree.ElementTree
d6983cb4 23
6929b41a 24from ..compat import functools # isort: split
227bf1a3 25from ..compat import (
26 compat_etree_fromstring,
27 compat_expanduser,
28 compat_os_name,
29 urllib_req_to_req,
30)
8817a80d 31from ..cookies import LenientSimpleCookie
f8271158 32from ..downloader.f4m import get_base_url, remove_encrypted_media
bc344cd4 33from ..downloader.hls import HlsFD
3d2623a8 34from ..networking import HEADRequest, Request
35from ..networking.exceptions import (
36 HTTPError,
37 IncompleteRead,
38 network_exceptions,
39)
8c25f81b 40from ..utils import (
8f97a15d 41 IDENTITY,
f8271158 42 JSON_LD_RE,
43 NO_DEFAULT,
44 ExtractorError,
d0d74b71 45 FormatSorter,
f8271158 46 GeoRestrictedError,
47 GeoUtils,
b7c47b74 48 LenientJSONDecoder,
db3ad8a6 49 Popen,
f8271158 50 RegexNotFoundError,
be5c1ae8 51 RetryManager,
f8271158 52 UnsupportedError,
05900629 53 age_restricted,
02dc0a36 54 base_url,
08f2a92c 55 bug_reports_message,
82d02080 56 classproperty,
d6983cb4 57 clean_html,
d0d74b71 58 deprecation_warning,
70f0f5a8 59 determine_ext,
d493f15c 60 dict_get,
42676437 61 encode_data_uri,
9b9c5355 62 error_to_compat_str,
46b18f23 63 extract_attributes,
90137ca4 64 filter_dict,
97f4aecf 65 fix_xml_ampersands,
b14f3a4c 66 float_or_none,
b868936c 67 format_field,
31bb8d3f 68 int_or_none,
34921b43 69 join_nonempty,
a4a554a7 70 js_to_json,
46b18f23 71 mimetype2ext,
ad54c913 72 netrc_from_content,
46b18f23 73 orderedSet,
d493f15c 74 parse_bitrate,
46b18f23
JH
75 parse_codecs,
76 parse_duration,
4ca2a3cf 77 parse_iso8601,
46b18f23 78 parse_m3u8_attributes,
d493f15c 79 parse_resolution,
46b18f23 80 sanitize_filename,
8f97a15d 81 sanitize_url,
ade1fa70 82 smuggle_url,
d493f15c 83 str_or_none,
ce5b9040 84 str_to_int,
f856816b 85 strip_or_none,
5d3a0e79 86 traverse_obj,
71df9b7f 87 truncate_string,
47046464 88 try_call,
ffa89477 89 try_get,
f38de77f 90 unescapeHTML,
647eab45 91 unified_strdate,
6b3a3098 92 unified_timestamp,
a107193e 93 url_basename,
bebef109 94 url_or_none,
7e68567e 95 urlhandle_detect_ext,
b868936c 96 urljoin,
6606817a 97 variadic,
a6571f10 98 xpath_element,
8d6765cf
S
99 xpath_text,
100 xpath_with_ns,
d6983cb4 101)
c342041f 102
d6983cb4 103
86e5f3ed 104class InfoExtractor:
d6983cb4
PH
105 """Information Extractor class.
106
107 Information extractors are the classes that, given a URL, extract
108 information about the video (or videos) the URL refers to. This
109 information includes the real video URL, the video title, author and
110 others. The information is stored in a dictionary which is then
5d380852 111 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
112 information possibly downloading the video to the file system, among
113 other possible outcomes.
114
cf0649f8 115 The type field determines the type of the result.
fed5d032
PH
116 By far the most common value (and the default if _type is missing) is
117 "video", which indicates a single video.
118
119 For a video, the dictionaries must include the following fields:
d6983cb4
PH
120
121 id: Video identifier.
d4736fdb 122 title: Video title, unescaped. Set to an empty string if video has
123 no title as opposed to "None" which signifies that the
124 extractor failed to obtain a title
d67b0b15 125
f49d89ee 126 Additionally, it must contain either a formats entry or a url one:
d67b0b15 127
f49d89ee
PH
128 formats: A list of dictionaries for each format available, ordered
129 from worst to best quality.
130
131 Potential fields:
c790e93a
S
132 * url The mandatory URL representing the media:
133 for plain file media - HTTP URL of this file,
134 for RTMP - RTMP URL,
135 for HLS - URL of the M3U8 media playlist,
136 for HDS - URL of the F4M manifest,
79d2077e
S
137 for DASH
138 - HTTP URL to plain file media (in case of
139 unfragmented media)
140 - URL of the MPD manifest or base URL
141 representing the media if MPD manifest
8ed7a233 142 is parsed from a string (in case of
79d2077e 143 fragmented media)
c790e93a 144 for MSS - URL of the ISM manifest.
f34804b2 145 * request_data Data to send in POST request to the URL
86f4d14f
S
146 * manifest_url
147 The URL of the manifest file in case of
c790e93a
S
148 fragmented media:
149 for HLS - URL of the M3U8 master playlist,
150 for HDS - URL of the F4M manifest,
151 for DASH - URL of the MPD manifest,
152 for MSS - URL of the ISM manifest.
a44ca5a4 153 * manifest_stream_number (For internal use only)
154 The index of the stream in the manifest file
10952eb2 155 * ext Will be calculated from URL if missing
d67b0b15
PH
156 * format A human-readable description of the format
157 ("mp4 container with h264/opus").
158 Calculated from the format_id, width, height,
159 and format_note fields if missing.
160 * format_id A short description of the format
5d4f3985
PH
161 ("mp4_h264_opus" or "19").
162 Technically optional, but strongly recommended.
d67b0b15
PH
163 * format_note Additional info about the format
164 ("3D" or "DASH video")
165 * width Width of the video, if known
166 * height Height of the video, if known
105bfd90 167 * aspect_ratio Aspect ratio of the video, if known
168 Automatically calculated from width and height
f49d89ee 169 * resolution Textual description of width and height
105bfd90 170 Automatically calculated from width and height
176f1866 171 * dynamic_range The dynamic range of the video. One of:
172 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 173 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
174 * abr Average audio bitrate in KBit/s
175 * acodec Name of the audio codec in use
dd27fd17 176 * asr Audio sampling rate in Hertz
b8ed0f15 177 * audio_channels Number of audio channels
d67b0b15 178 * vbr Average video bitrate in KBit/s
fbb21cf5 179 * fps Frame rate
d67b0b15 180 * vcodec Name of the video codec in use
1394ce65 181 * container Name of the container format
d67b0b15 182 * filesize The number of bytes, if known in advance
9732d77e 183 * filesize_approx An estimate for the number of bytes
d67b0b15 184 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 185 * protocol The protocol that will be used for the actual
adbc4ec4
THD
186 download, lower-case. One of "http", "https" or
187 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
188 * fragment_base_url
189 Base URL for fragments. Each fragment's path
190 value (if present) will be relative to
191 this URL.
192 * fragments A list of fragments of a fragmented media.
193 Each fragment entry must contain either an url
194 or a path. If an url is present it should be
195 considered by a client. Otherwise both path and
196 fragment_base_url must be present. Here is
197 the list of all potential fields:
198 * "url" - fragment's URL
199 * "path" - fragment's path relative to
200 fragment_base_url
a0d5077c
S
201 * "duration" (optional, int or float)
202 * "filesize" (optional, int)
adbc4ec4
THD
203 * is_from_start Is a live format that can be downloaded
204 from the start. Boolean
f49d89ee 205 * preference Order number of this format. If this field is
08d13955 206 present and not None, the formats get sorted
38d63d84 207 by this field, regardless of all other values.
f49d89ee
PH
208 -1 for default (order by other properties),
209 -2 or smaller for less than default.
e65566a9
PH
210 < -1000 to hide the format (if there is
211 another one which is strictly better)
32f90364
PH
212 * language Language code, e.g. "de" or "en-US".
213 * language_preference Is this in the language mentioned in
214 the URL?
aff2f4f4
PH
215 10 if it's what the URL is about,
216 -1 for default (don't know),
217 -10 otherwise, other values reserved for now.
5d73273f
PH
218 * quality Order number of the video quality of this
219 format, irrespective of the file format.
220 -1 for default (order by other properties),
221 -2 or smaller for less than default.
c64ed2a3
PH
222 * source_preference Order number for this video source
223 (quality takes higher priority)
224 -1 for default (order by other properties),
225 -2 or smaller for less than default.
d769be6c
PH
226 * http_headers A dictionary of additional HTTP headers
227 to add to the request.
6271f1ca 228 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
229 video's pixels are not square.
230 width : height ratio as float.
231 * no_resume The server does not support resuming the
232 (HTTP or RTMP) download. Boolean.
bc344cd4 233 * has_drm True if the format has DRM and cannot be downloaded.
234 'maybe' if the format may have DRM and has to be tested before download.
7e68567e 235 * extra_param_to_segment_url A query string to append to each
236 fragment's URL, or to update each existing query string
237 with. Only applied by the native HLS/DASH downloaders.
238 * hls_aes A dictionary of HLS AES-128 decryption information
239 used by the native HLS downloader to override the
240 values in the media playlist when an '#EXT-X-KEY' tag
241 is present in the playlist:
242 * uri The URI from which the key will be downloaded
243 * key The key (as hex) used to decrypt fragments.
244 If `key` is given, any key URI will be ignored
245 * iv The IV (as hex) used to decrypt fragments
0a5a191a 246 * downloader_options A dictionary of downloader options
247 (For internal use only)
248 * http_chunk_size Chunk size for HTTP downloads
249 * ffmpeg_args Extra arguments for ffmpeg downloader
4ce57d3b
A
250 * is_dash_periods Whether the format is a result of merging
251 multiple DASH periods.
3b1fe47d 252 RTMP formats can also have the additional fields: page_url,
253 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
254 rtmp_protocol, rtmp_real_time
3dee7826 255
c0ba0f48 256 url: Final video URL.
d6983cb4 257 ext: Video filename extension.
d67b0b15
PH
258 format: The video format, defaults to ext (used for --get-format)
259 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 260
d6983cb4
PH
261 The following fields are optional:
262
08d30158 263 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 264 alt_title: A secondary title of the video.
f4f9f6d0 265 display_id: An alternative identifier for the video, not necessarily
0afef30b
PH
266 unique, but available before title. Typically, id is
267 something like "4234987", title "Dancing naked mole rats",
268 and display_id "dancing-naked-mole-rats"
d5519808 269 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 270 * "id" (optional, string) - Thumbnail format ID
d5519808 271 * "url"
cfb56d1a 272 * "preference" (optional, int) - quality of the image
d5519808
PH
273 * "width" (optional, int)
274 * "height" (optional, int)
5e1c39ac 275 * "resolution" (optional, string "{width}x{height}",
d5519808 276 deprecated)
2de624fd 277 * "filesize" (optional, int)
297e9952 278 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 279 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 280 description: Full video description.
d6983cb4 281 uploader: Full name of the video uploader.
2bc0c46f 282 license: License name the video is licensed under.
104a7b5a 283 creators: List of creators of the video.
10db0d2f 284 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 285 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 286 If not explicitly set, calculated from timestamp
287 release_timestamp: UNIX timestamp of the moment the video was released.
288 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 289 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 290 If not explicitly set, calculated from release_timestamp
1732eccc 291 release_year: Year (YYYY) as integer when the video or album was released.
292 To be used if no exact release date is known.
293 If not explicitly set, calculated from release_date.
f0d785d3 294 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 295 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 296 If not explicitly set, calculated from modified_timestamp
d6983cb4 297 uploader_id: Nickname or id of the video uploader.
7bcd2830 298 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 299 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 300 Note that channel fields may or may not repeat uploader
6f1f59f3
S
301 fields. This depends on a particular extractor.
302 channel_id: Id of the channel.
303 channel_url: Full URL to a channel webpage.
6c73052c 304 channel_follower_count: Number of followers of the channel.
8213ce28 305 channel_is_verified: Whether the channel is verified on the platform.
da9ec3b9 306 location: Physical location where the video was filmed.
a504ced0 307 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
308 {tag: subformats}. "tag" is usually a language code, and
309 "subformats" is a list sorted from lower to higher
310 preference, each element is a dictionary with the "ext"
311 entry and one of:
a504ced0 312 * "data": The subtitles file contents
10952eb2 313 * "url": A URL pointing to the subtitles file
2412044c 314 It can optionally also have:
315 * "name": Name or description of the subtitles
08d30158 316 * "http_headers": A dictionary of additional HTTP headers
297e9952 317 to add to the request.
4bba3716 318 "ext" will be calculated from URL if missing
e167860c 319 automatic_captions: Like 'subtitles'; contains automatically generated
320 captions instead of normal subtitles
62d231c0 321 duration: Length of the video in seconds, as an integer or float.
f3d29461 322 view_count: How many users have watched the video on the platform.
867c66ff 323 concurrent_view_count: How many users are currently watching the video on the platform.
19e3dfc9
PH
324 like_count: Number of positive ratings of the video
325 dislike_count: Number of negative ratings of the video
02835c6b 326 repost_count: Number of reposts of the video
2d30521a 327 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 328 comment_count: Number of comments on the video
dd622d7c
PH
329 comments: A list of comments, each with one or more of the following
330 properties (all but one of text or html optional):
331 * "author" - human-readable name of the comment author
332 * "author_id" - user ID of the comment author
a1c5d2ca 333 * "author_thumbnail" - The thumbnail of the comment author
c35448b7 334 * "author_url" - The url to the comment author's page
335 * "author_is_verified" - Whether the author is verified
336 on the platform
337 * "author_is_uploader" - Whether the comment is made by
338 the video uploader
dd622d7c
PH
339 * "id" - Comment ID
340 * "html" - Comment as HTML
341 * "text" - Plain text of the comment
342 * "timestamp" - UNIX timestamp of comment
343 * "parent" - ID of the comment this one is replying to.
344 Set to "root" to indicate that this is a
345 comment to the original video.
a1c5d2ca
M
346 * "like_count" - Number of positive ratings of the comment
347 * "dislike_count" - Number of negative ratings of the comment
348 * "is_favorited" - Whether the comment is marked as
349 favorite by the video uploader
c35448b7 350 * "is_pinned" - Whether the comment is pinned to
351 the top of the comments
8dbe9899 352 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 353 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
354 should allow to get the same result again. (It will be set
355 by YoutubeDL if it's missing)
ad3bc6ac
PH
356 categories: A list of categories that the video falls in, for example
357 ["Sports", "Berlin"]
864f24bd 358 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 359 cast: A list of the video cast
7267bd53
PH
360 is_live: True, False, or None (=unknown). Whether this video is a
361 live stream that goes on instead of a fixed-length video.
f76ede8e 362 was_live: True, False, or None (=unknown). Whether this video was
363 originally a live stream.
0647d925 364 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
e325a21a 365 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 366 If absent, automatically set from is_live, was_live
7c80519c 367 start_time: Time in seconds where the reproduction should start, as
10952eb2 368 specified in the URL.
297a564b 369 end_time: Time in seconds where the reproduction should end, as
10952eb2 370 specified in the URL.
55949fed 371 chapters: A list of dictionaries, with the following entries:
372 * "start_time" - The start time of the chapter in seconds
373 * "end_time" - The end time of the chapter in seconds
374 * "title" (optional, string)
5caf30db
A
375 heatmap: A list of dictionaries, with the following entries:
376 * "start_time" - The start time of the data point in seconds
377 * "end_time" - The end time of the data point in seconds
378 * "value" - The normalized value of the data point (float between 0 and 1)
6cfda058 379 playable_in_embed: Whether this video is allowed to play in embedded
380 players on other sites. Can be True (=always allowed),
381 False (=never allowed), None (=unknown), or a string
62b58c09 382 specifying the criteria for embedability; e.g. 'whitelist'
c224251a
M
383 availability: Under what condition the video is available. One of
384 'private', 'premium_only', 'subscriber_only', 'needs_auth',
385 'unlisted' or 'public'. Use 'InfoExtractor._availability'
386 to set it
e370f9ec 387 media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
1e8fe57e 388 _old_archive_ids: A list of old archive ids needed for backward compatibility
784320c9 389 _format_sort_fields: A list of fields to use for sorting formats
277d6ff5 390 __post_extractor: A function to be called just before the metadata is
391 written to either disk, logger or console. The function
392 must return a dict which will be added to the info_dict.
393 This is useful for additional information that is
394 time-consuming to extract. Note that the fields thus
395 extracted will not be available to output template and
396 match_filter. So, only "comments" and "comment_count" are
397 currently allowed to be extracted via this method.
d6983cb4 398
7109903e
S
399 The following fields should only be used when the video belongs to some logical
400 chapter or section:
401
402 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
403 chapter_number: Number of the chapter the video belongs to, as an integer.
404 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
405
406 The following fields should only be used when the video is an episode of some
8d76bdf1 407 series, programme or podcast:
7109903e
S
408
409 series: Title of the series or programme the video episode belongs to.
9ac24e23 410 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 411 season: Title of the season the video episode belongs to.
27bfd4e5
S
412 season_number: Number of the season the video episode belongs to, as an integer.
413 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
414 episode: Title of the video episode. Unlike mandatory video title field,
415 this field should denote the exact title of the video episode
416 without any kind of decoration.
27bfd4e5
S
417 episode_number: Number of the video episode within a season, as an integer.
418 episode_id: Id of the video episode, as a unicode string.
7109903e 419
7a93ab5f
S
420 The following fields should only be used when the media is a track or a part of
421 a music album:
422
423 track: Title of the track.
424 track_number: Number of the track within an album or a disc, as an integer.
425 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
426 as a unicode string.
104a7b5a
L
427 artists: List of artists of the track.
428 composers: List of composers of the piece.
429 genres: List of genres of the track.
7a93ab5f
S
430 album: Title of the album the track belongs to.
431 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
104a7b5a
L
432 album_artists: List of all artists appeared on the album.
433 E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
434 Useful for splits and compilations.
7a93ab5f
S
435 disc_number: Number of the disc or other physical medium the track belongs to,
436 as an integer.
7a93ab5f 437
3975b4d2 438 The following fields should only be set for clips that should be cut from the original video:
439
440 section_start: Start time of the section in seconds
441 section_end: End time of the section in seconds
442
45e8a04e 443 The following fields should only be set for storyboards:
444 rows: Number of rows in each storyboard fragment, as an integer
445 columns: Number of columns in each storyboard fragment, as an integer
446
104a7b5a
L
447 The following fields are deprecated and should not be set by new code:
448 composer: Use "composers" instead.
449 Composer(s) of the piece, comma-separated.
450 artist: Use "artists" instead.
451 Artist(s) of the track, comma-separated.
452 genre: Use "genres" instead.
453 Genre(s) of the track, comma-separated.
454 album_artist: Use "album_artists" instead.
455 All artists appeared on the album, comma-separated.
456 creator: Use "creators" instead.
457 The creator of the video.
458
deefc05b 459 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 460
d838b1bd
PH
461 Unless mentioned otherwise, None is equivalent to absence of information.
462
fed5d032
PH
463
464 _type "playlist" indicates multiple videos.
b82f815f
PH
465 There must be a key "entries", which is a list, an iterable, or a PagedList
466 object, each element of which is a valid dictionary by this specification.
fed5d032 467
962ffcf8 468 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 469 attributes with the same semantics as videos (see above).
fed5d032 470
f0d785d3 471 It can also have the following optional fields:
472
473 playlist_count: The total number of videos in a playlist. If not given,
474 YoutubeDL tries to calculate it from "entries"
475
fed5d032
PH
476
477 _type "multi_video" indicates that there are multiple videos that
478 form a single show, for example multiple acts of an opera or TV episode.
479 It must have an entries key like a playlist and contain all the keys
480 required for a video at the same time.
481
482
483 _type "url" indicates that the video must be extracted from another
484 location, possibly by a different extractor. Its only required key is:
485 "url" - the next URL to extract.
f58766ce
PH
486 The key "ie_key" can be set to the class name (minus the trailing "IE",
487 e.g. "Youtube") if the extractor class is known in advance.
488 Additionally, the dictionary may have any properties of the resolved entity
489 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
490 known ahead of time.
491
492
493 _type "url_transparent" entities have the same specification as "url", but
494 indicate that the given additional information is more precise than the one
495 associated with the resolved URL.
496 This is useful when a site employs a video service that hosts the video and
497 its technical metadata, but that video service does not embed a useful
498 title, description etc.
499
500
8f97a15d 501 Subclasses of this should also be added to the list of extractors and
5fd83674 502 should define _VALID_URL as a regexp or a Sequence of regexps, and
503 re-define the _real_extract() and (optionally) _real_initialize() methods.
d6983cb4 504
e6f21b3d 505 Subclasses may also override suitable() if necessary, but ensure the function
506 signature is preserved and that this function imports everything it needs
52efa4b3 507 (except other extractors), so that lazy_extractors works correctly.
508
8f97a15d 509 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
510 the HTML of Generic webpages. It may also override _extract_embed_urls
511 or _extract_from_webpage as necessary. While these are normally classmethods,
512 _extract_from_webpage is allowed to be an instance method.
513
514 _extract_from_webpage may raise self.StopExtraction() to stop further
515 processing of the webpage and obtain exclusive rights to it. This is useful
62b58c09
L
516 when the extractor cannot reliably be matched using just the URL,
517 e.g. invidious/peertube instances
8f97a15d 518
519 Embed-only extractors can be defined by setting _VALID_URL = False.
520
52efa4b3 521 To support username + password (or netrc) login, the extractor must define a
522 _NETRC_MACHINE and re-define _perform_login(username, password) and
523 (optionally) _initialize_pre_login() methods. The _perform_login method will
524 be called between _initialize_pre_login and _real_initialize if credentials
525 are passed by the user. In cases where it is necessary to have the login
526 process as part of the extraction rather than initialization, _perform_login
527 can be left undefined.
e6f21b3d 528
4248dad9 529 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
530 geo restriction bypass mechanisms for a particular extractor.
531 Though it won't disable explicit geo restriction bypass based on
504f20dd 532 country code provided with geo_bypass_country.
4248dad9
S
533
534 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
535 countries for this extractor. One of these countries will be used by
536 geo restriction bypass mechanism right away in order to bypass
504f20dd 537 geo restriction, of course, if the mechanism is not disabled.
773f291d 538
5f95927a
S
539 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
540 IP blocks in CIDR notation for this extractor. One of these IP blocks
541 will be used by geo restriction bypass mechanism similarly
504f20dd 542 to _GEO_COUNTRIES.
3ccdde8c 543
fe7866d0 544 The _ENABLED attribute should be set to False for IEs that
545 are disabled by default and must be explicitly enabled.
546
e6f21b3d 547 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
548 in order to warn the users and skip the tests.
549 """
550
# Set to True by initialize() once the login step and _real_initialize() have run
_ready = False
# The downloader (a YoutubeDL instance, per the __init__ docstring);
# presumably assigned by set_downloader(), which is defined outside this view
_downloader = None
# Fake IP picked by _initialize_geo_bypass() for the X-Forwarded-For header
_x_forwarded_for_ip = None
# Set to False to disable the geo restriction bypass mechanisms for an extractor
_GEO_BYPASS = True
# Optional list of presumably geo unrestricted countries
_GEO_COUNTRIES = None
# Optional list of presumably geo unrestricted IP blocks in CIDR notation
_GEO_IP_BLOCKS = None
# Set to False for broken IEs in order to warn the users and skip the tests
_WORKING = True
# Set to False for IEs that are disabled by default and must be explicitly enabled
_ENABLED = True
# Must be defined to support username + password (or netrc) login;
# supports_login() simply tests its truthiness
_NETRC_MACHINE = None
# initialize() suppresses its "login not supported" warning when this is False
IE_DESC = None
# NOTE(review): not referenced in the visible part of the file
SEARCH_KEY = None
# A regexp or a Sequence of regexps; False marks an embed-only extractor
_VALID_URL = None
# Regexes searched for in the HTML of Generic webpages.
# NOTE(review): being a class-level list, this object is shared by every
# subclass that does not override it - rebind it, do not mutate it in place
_EMBED_REGEX = []
d6983cb4 564
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Build the message telling the user how to authenticate.

    method: None (empty hint), 'any', 'password' or 'cookies'. If left as
            NO_DEFAULT, 'any' is used when the extractor supports login
            and 'cookies' otherwise.
    netrc:  netrc machine name to mention instead of _NETRC_MACHINE.

    Any other value of method raises KeyError.
    """
    machine = netrc or self._NETRC_MACHINE
    password_hint = f'--username and --password, --netrc-cmd, or --netrc ({machine}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }

    # Only fall back to the capability-based default when the caller gave no method
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]
9d5d4d64 575
def __init__(self, downloader=None):
    """Create the extractor, optionally bound to a downloader.

    The downloader is a YoutubeDL instance. When it is not passed here,
    it must be set using "set_downloader()" before "extract()" is called.
    """
    # Per-instance state; these shadow the class-level defaults
    self._printed_messages = set()
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)
584
@classmethod
def _match_valid_url(cls, url):
    """Match url against _VALID_URL.

    Returns the match object of the first pattern that matches, or None
    if nothing matches or the extractor is embed-only (_VALID_URL is False).
    The compiled patterns are cached on the class as _VALID_URL_RE.
    """
    if cls._VALID_URL is False:
        return None

    # Deliberately inspect the class' own __dict__ rather than using
    # has/getattr: the question is whether the regexps were cached for
    # *this* class, and getattr would also find a superclass' cache
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = tuple(
            re.compile(pattern) for pattern in variadic(cls._VALID_URL))

    for regex in cls._VALID_URL_RE:
        mobj = regex.match(url)
        if mobj:
            return mobj
    return None
5ad28e7f 595
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True or False)."""
    # NB: Anything this function needs must be imported by the function
    # itself (other extractors excepted), or lazy_extractors will break
    mobj = cls._match_valid_url(url)
    return mobj is not None
d6983cb4 602
ed9266db
PH
@classmethod
def _match_id(cls, url):
    """Return the "id" group of the _VALID_URL match for url.

    Raises AttributeError when the URL does not match (there is no match
    object to query) and IndexError when the pattern has no "id" group.
    """
    mobj = cls._match_valid_url(url)
    return mobj.group('id')
ed9266db 606
@classmethod
def get_temp_id(cls, url):
    """Best-effort variant of _match_id().

    Returns whatever _match_id() returns, or None if it raises IndexError
    or AttributeError (e.g. the URL does not match, or the pattern has no
    "id" group).
    """
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        temp_id = None
    return temp_id
613
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of _WORKING for this extractor class."""
    is_working = cls._WORKING
    return is_working
618
@classmethod
def supports_login(cls):
    """Return True if the extractor defines a (truthy) _NETRC_MACHINE."""
    return True if cls._NETRC_MACHINE else False
622
d6983cb4
PH
def initialize(self):
    """Prepare the instance for extraction (geo bypass, authentication, etc).

    The printed-messages set is reset and the geo bypass is initialized on
    every call; the login step and _real_initialize() run only until the
    instance has been marked as ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)
    if self._ready:
        return

    self._initialize_pre_login()
    if not self.supports_login():
        # A username was given but cannot be used here; point the user to
        # cookies unless IE_DESC or _NETRC_MACHINE is False
        if self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
            self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    else:
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    self._real_initialize()
    self._ready = True
640
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    HTTP requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    """
    # A fake IP has already been picked; never replace it
    if self._x_forwarded_for_ip:
        return

    # Geo bypass mechanism is explicitly disabled by user
    if not self.get_param('geo_bypass', True):
        return

    if not geo_bypass_context:
        geo_bypass_context = {}

    # Backward compatibility: previously _initialize_geo_bypass
    # expected a list of countries, some 3rd party code may still use
    # it this way
    if isinstance(geo_bypass_context, (list, tuple)):
        geo_bypass_context = {
            'countries': geo_bypass_context,
        }

    # The whole point of geo bypass mechanism is to fake IP
    # as X-Forwarded-For HTTP header based on some IP block or
    # country code.

    # Path 1: bypassing based on IP block in CIDR notation

    # Explicit IP block specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    ip_block = self.get_param('geo_bypass_ip_block', None)

    # Otherwise use random IP block from geo bypass context but only
    # if extractor is known as geo bypassable
    if not ip_block:
        ip_blocks = geo_bypass_context.get('ip_blocks')
        if self._GEO_BYPASS and ip_blocks:
            ip_block = random.choice(ip_blocks)

    if ip_block:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
        self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
        return

    # Path 2: bypassing based on country code

    # Explicit country code specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    country = self.get_param('geo_bypass_country', None)

    # Otherwise use random country code from geo bypass context but
    # only if extractor is known as geo bypassable
    if not country:
        countries = geo_bypass_context.get('countries')
        if self._GEO_BYPASS and countries:
            country = random.choice(countries)

    if country:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
        # Log through the extractor's own write_debug, same as Path 1,
        # so the message carries the extractor name prefix
        self.write_debug(
            f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
723
def extract(self, url):
    """Extract information for url and return the result of _real_extract.

    Returns None when _real_extract returns None. On a GeoRestrictedError
    the extraction is retried once if a fake X-Forwarded-For IP could be
    selected. ExtractorError instances get their video_id, ie and traceback
    filled in before being re-raised; IncompleteRead, KeyError and
    StopIteration are wrapped into ExtractorError.
    """
    try:
        for _ in range(2):
            try:
                self.initialize()
                self.to_screen('Extracting URL: %s' % (
                    url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for lang in ('live_chat', 'comments', 'danmaku'):
                        subtitles.pop(lang, None)
                return ie_result
            except GeoRestrictedError as e:
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        e.video_id = e.video_id or self.get_temp_id(url)
        # No trailing comma here: it would turn the value into a 1-tuple
        e.ie = e.ie or self.IE_NAME
        e.traceback = e.traceback or sys.exc_info()[2]
        raise
    except IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 757
def __maybe_fake_ip_and_retry(self, countries):
    """Pick a fake IP from one of countries; return True if a retry is worthwhile."""
    bypass_allowed = (
        not self.get_param('geo_bypass_country', None)
        and self._GEO_BYPASS
        and self.get_param('geo_bypass', True)
        and not self._x_forwarded_for_ip
        and countries)
    if not bypass_allowed:
        return False

    country_code = random.choice(countries)
    self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    if not self._x_forwarded_for_ip:
        return False

    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (self._x_forwarded_for_ip, country_code.upper()))
    return True
772
def set_downloader(self, downloader):
    """Attach the given YoutubeDL instance to this IE as its downloader."""
    setattr(self, '_downloader', downloader)
776
@property
def cache(self):
    """The cache attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cache
780
@property
def cookiejar(self):
    """The cookiejar attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cookiejar
784
52efa4b3 785 def _initialize_pre_login(self):
962ffcf8 786 """ Initialization before login. Redefine in subclasses."""
52efa4b3 787 pass
788
789 def _perform_login(self, username, password):
790 """ Login with username and password. Redefine in subclasses."""
791 pass
792
d6983cb4
PH
793 def _real_initialize(self):
794 """Real initialization process. Redefine in subclasses."""
795 pass
796
797 def _real_extract(self, url):
798 """Real extraction process. Redefine in subclasses."""
08d30158 799 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 800
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor.

    It is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
56c73665 805
@classproperty
def IE_NAME(cls):
    """Default extractor name: the class name with its last two characters removed."""
    class_name = cls.__name__
    return class_name[:-2]
d6983cb4 809
d391b7e2
S
@staticmethod
def __can_accept_status_code(err, expected_status):
    """Tell whether the status of the HTTPError err is accepted by expected_status.

    expected_status may be None (nothing accepted), a callable that must
    return exactly True for the status, or one or more status codes.
    """
    assert isinstance(err, HTTPError)
    if expected_status is None:
        return False
    if callable(expected_status):
        return expected_status(err.status) is True
    return err.status in variadic(expected_status)
d391b7e2 819
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Turn url_or_request into a Request and update it with data, headers and query.

    A urllib.request.Request is converted (with a deprecation warning), an
    existing Request is updated in place, anything else is passed to the
    Request constructor.
    """
    if isinstance(url_or_request, urllib.request.Request):
        self._downloader.deprecation_warning(
            'Passing a urllib.request.Request to _create_request() is deprecated. '
            'Use yt_dlp.networking.common.Request instead.')
        request = urllib_req_to_req(url_or_request)
    elif isinstance(url_or_request, Request):
        request = url_or_request
    else:
        request = Request(url_or_request)

    request.update(data=data, headers=headers, query=query)
    return request
f95b9dee 831
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
    """
    Return the response handle.

    See _download_webpage docstring for arguments specification.
    """
    # Every request except the very first one honours sleep_interval_requests
    if self._downloader._first_webpage_request:
        self._downloader._first_webpage_request = False
    else:
        pause = self.get_param('sleep_interval_requests') or 0
        if pause > 0:
            self.to_screen('Sleeping %s seconds ...' % pause)
            time.sleep(pause)

    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(str(note) if video_id is None else f'{video_id}: {note}')

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        # Work on a copy so that the caller's dict is left untouched
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    try:
        return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
    except network_exceptions as err:
        if isinstance(err, HTTPError) and self.__can_accept_status_code(err, expected_status):
            return err.response

        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = f'{errnote}: {error_to_compat_str(err)}'
        if not fatal:
            self.report_warning(errmsg)
            return False
        raise ExtractorError(errmsg, cause=err)
d6983cb4 881
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                             encoding=None, data=None, headers={}, query={}, expected_status=None):
    """
    Return a tuple (page content as string, URL handle), or False when the
    request failed and fatal is false.

    Arguments:
    url_or_request -- plain text URL as a string or a request object
    video_id -- Video/playlist/item identifier (string)

    Keyword arguments:
    note -- note printed before downloading (string)
    errnote -- note printed in case of an error (string)
    fatal -- flag denoting whether error should be considered fatal,
        i.e. whether it should cause ExtractorError to be raised,
        otherwise a warning will be reported and extraction continued
    encoding -- encoding for a page content decoding, guessed automatically
        when not explicitly specified
    data -- POST data (bytes)
    headers -- HTTP headers (dict)
    query -- URL query (dict)
    expected_status -- allows to accept failed HTTP requests (non 2xx
        status code) by explicitly specifying a set of accepted status
        codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
        Note that this argument does not affect success status codes (2xx)
        which are always accepted.
    """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, str):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(
        url_or_request, video_id, note, errnote, fatal,
        data=data, headers=headers, query=query, expected_status=expected_status)
    if handle is False:
        assert not fatal
        return False

    page = self._webpage_read_content(
        handle, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (page, handle)
926
c9a77969
YCH
927 @staticmethod
928 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
929 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
930 if m:
931 encoding = m.group(1)
932 else:
0d75ae2c 933 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
934 webpage_bytes[:1024])
935 if m:
936 encoding = m.group(1).decode('ascii')
b60016e8
PH
937 elif webpage_bytes.startswith(b'\xff\xfe'):
938 encoding = 'utf-16'
f143d86a
PH
939 else:
940 encoding = 'utf-8'
c9a77969
YCH
941
942 return encoding
943
4457823d
S
def __check_blocked(self, content):
    """Raise an expected ExtractorError if content is a known blocking page."""
    head = content[:512]

    websense_blocked = (
        '<title>Access to this site is blocked</title>' in content
        and 'Websense' in head)
    if websense_blocked:
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        info_url = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if info_url:
            msg += ' Visit %s for more details' % info_url
        raise ExtractorError(msg, expected=True)

    if '<title>The URL you requested has been blocked</title>' in head:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        details = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if details:
            msg += ' (Message: "%s")' % details.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)

    rkn_blocked = (
        '<title>TTK :: Доступ к ресурсу ограничен</title>' in content
        and 'blocklist.rkn.gov.ru' in content)
    if rkn_blocked:
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
971
def _request_dump_filename(self, url, video_id):
    """Build the file name used to dump or load the page for url and video_id.

    Names longer than the trim_file_name param (240 if unset) are cut and
    suffixed with an MD5 digest of the full name.
    """
    stem = f'{video_id}_{url}'
    max_length = self.get_param('trim_file_name') or 240
    if len(stem) > max_length:
        digest = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:max_length - len(digest)] + digest
    filename = sanitize_filename(f'{stem}.dump', restricted=True)

    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    if compat_os_name != 'nt':
        return filename
    absfilepath = os.path.abspath(filename)
    if len(absfilepath) > 259:
        filename = fR'\\?\{absfilepath}'
    return filename
986
def __decode_webpage(self, webpage_bytes, encoding, headers):
    """Decode webpage_bytes, guessing the encoding when none is given.

    Undecodable bytes are replaced; an unknown codec name falls back to utf-8.
    """
    charset = encoding or self._guess_encoding_from_content(
        headers.get('Content-Type', ''), webpage_bytes)
    try:
        return webpage_bytes.decode(charset, 'replace')
    except LookupError:
        return webpage_bytes.decode('utf-8', 'replace')
994
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the body of urlh, optionally dump it, and return it decoded.

    prefix, when given, is prepended to the bytes read. The decoded text is
    checked against known blocking pages before being returned.
    """
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.url)
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))

    if self.get_param('write_pages'):
        filename = self._request_dump_filename(urlh.url, video_id)
        self.to_screen(f'Saving request to {filename}')
        with open(filename, 'wb') as dump_file:
            dump_file.write(raw)

    content = self.__decode_webpage(raw, encoding, urlh.headers)
    self.__check_blocked(content)

    return content
d6983cb4 1013
def __print_error(self, errnote, fatal, video_id, err):
    """Raise ExtractorError when fatal, otherwise warn if errnote is truthy."""
    if fatal:
        raise ExtractorError(f'{video_id}: {errnote}', cause=err)
    if errnote:
        self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 1019
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """Parse xml_string (after the optional transform_source) into an element.

    Parse errors are reported through __print_error, so None is returned
    when parsing fails and fatal is false.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as parse_error:
        message = 'Failed to parse XML' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, parse_error)
267ed0c5 1027
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """Parse json_string with LenientJSONDecoder.

    ValueError is reported through __print_error, so None is returned when
    parsing fails and fatal is false.
    """
    try:
        return json.loads(
            json_string, cls=LenientJSONDecoder, strict=False,
            transform_source=transform_source, **parser_kwargs)
    except ValueError as parse_error:
        message = 'Failed to parse JSON' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, parse_error)
3d3538e4 1034
def _parse_socket_response_as_json(self, data, *args, **kwargs):
    """Parse the span from the first '{' to the last '}' of data as JSON."""
    start = data.find('{')
    end = data.rfind('}') + 1
    return self._parse_json(data[start:end], *args, **kwargs)
adddc50c 1037
def __create_download_methods(name, parser, note, errnote, return_value):
    # Factory evaluated in the class body: builds a (_download_<name>_handle,
    # _download_<name>) pair of methods. `parser` is the name of the parsing
    # method to apply to the downloaded content, or None to return it as is.
    # `note` and `errnote` become the default messages of the generated methods.

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        if parser is None:
            return content
        # errnote is only forwarded to the parser when it is exactly False
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        # False means the download failed non-fatally; pass it through
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With the load_pages param set, try a previously dumped page first
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                # Fall through to a real download when the dump cannot be read
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    def impersonate(func, name, return_value):
        # Rename the generated function and give it a docstring
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
        @param transform_source Apply this transformation before parsing
        @returns {return_value}

        See _download_webpage_handle docstring for other arguments specification
        '''

    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content

# Generated download helpers for XML, JSON and socket-style JSON responses
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# Only the content-returning variant is kept for plain webpages (no parser)
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1109
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not given)

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries and timeout are legacy arguments; deprecating them has been
    # considered, but no deprecation warning is emitted for them yet.

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            # Never hand the NO_DEFAULT sentinel to _sleep; use the
            # intended default interval when no timeout was given
            self._sleep(5 if timeout is NO_DEFAULT else timeout, video_id)
adddc50c 1141
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """Report a warning prefixed with '[ie_name]' and, if given, the video id.

    With only_once set, a message that was already printed is skipped.
    """
    idstr = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {idstr}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1150
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.to_screen(prefixed, *args, **kwargs)
a06916d9 1154
def write_debug(self, msg, *args, **kwargs):
    """Write a debug message, prefixing it with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.write_debug(prefixed, *args, **kwargs)
a06916d9 1157
def get_param(self, name, default=None, *args, **kwargs):
    """Look up name in the downloader's params; return default without a downloader."""
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
d6983cb4 1162
def report_drm(self, video_id, partial=NO_DEFAULT):
    """Report that the video is DRM protected through raise_no_formats.

    Passing partial is deprecated and only triggers a deprecation warning.
    """
    partial_given = partial is not NO_DEFAULT
    if partial_given:
        self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1167
d6983cb4
PH
def report_extraction(self, id_or_name):
    """Announce that information is being extracted for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
1171
def report_download_webpage(self, video_id):
    """Announce that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
1175
def report_age_confirmation(self):
    """Announce an attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 1179
fc79158d
JMF
def report_login(self):
    """Announce an attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 1183
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """Raise an expected ExtractorError telling the user that login is needed.

    When metadata_available is set and either ignore_no_formats_error or
    wait_for_video is enabled, only a warning is reported instead.
    """
    downgrade_to_warning = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if downgrade_to_warning:
        self.report_warning(msg)
        return

    hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + hint, expected=True)
43e7d3c9 1193
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """Raise GeoRestrictedError for the given countries.

    When metadata_available is set and either ignore_no_formats_error or
    wait_for_video is enabled, only a warning is reported instead.
    """
    downgrade_to_warning = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if not downgrade_to_warning:
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
1202
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Raise an error for a video without formats, or just warn about it.

    A warning is used when expected is set and either ignore_no_formats_error
    or wait_for_video is enabled. msg may itself be an ExtractorError, which
    is then raised as is.
    """
    downgrade_to_warning = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if downgrade_to_warning:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1211
5f6a1245 1212 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
    """Returns a URL that points to a page that should be processed"""
    result = dict(kwargs)
    if ie is not None:
        # ie may be an extractor key or an object providing ie_key()
        result['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
    if video_id is not None:
        result['id'] = video_id
    if video_title is not None:
        result['title'] = video_title
    result['_type'] = 'url_transparent' if url_transparent else 'url'
    result['url'] = url
    return result
1227
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """Build a playlist whose entries are url results for the given matches.

    getter is applied to every match and the results are passed through
    orderedSet before being turned into entries.
    """
    urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(m, ie, **(video_kwargs or {})) for m in urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
46b18f23 1234
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
    """Returns a playlist"""
    result = dict(kwargs)
    # id and title are only set when truthy; description whenever it is not None
    if playlist_id:
        result['id'] = playlist_id
    if playlist_title:
        result['title'] = playlist_title
    if playlist_description is not None:
        result['description'] = playlist_description
    result['_type'] = 'multi_video' if multi_video else 'playlist'
    result['entries'] = entries
    return result
d6983cb4 1249
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    group may be None (first group that is not None), a list or tuple of
    groups (a tuple of their values is returned) or a single group.
    """
    # Start without a match so that a None string and an empty list of
    # patterns are both handled as "not found"
    mobj = None
    if string is None:
        pass
    elif isinstance(pattern, (str, re.Pattern)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
1284
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """Searches string for the JSON object specified by start_pattern

    Passing default makes the search non-fatal and silent; without it an
    empty dict is returned on non-fatal failure.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    json_string = self._search_regex(
        rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
b7c47b74 1311
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html
    (every element of it when the result is a tuple).
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not isinstance(found, tuple):
        return clean_html(found)
    return tuple(map(clean_html, found))
d6983cb4 1320
def _get_netrc_login_info(self, netrc_machine=None):
    """Return (login, password) for netrc_machine, or (None, None).

    The netrc_cmd param takes precedence over the usenetrc param; when
    neither is set (None, None) is returned. Raises OSError when the command
    exits with a non-zero code and netrc.NetrcParseError when there are no
    authenticators for the machine.
    """
    netrc_machine = netrc_machine or self._NETRC_MACHINE

    cmd = self.get_param('netrc_cmd')
    if cmd:
        # NOTE(review): the machine name is substituted into a command that
        # runs with shell=True - confirm it can never hold untrusted input
        cmd = cmd.replace('{}', netrc_machine)
        self.to_screen(f'Executing command: {cmd}')
        stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
        if ret != 0:
            raise OSError(f'Command returned error code {ret}')
        info = netrc_from_content(stdout).authenticators(netrc_machine)
    elif self.get_param('usenetrc', False):
        netrc_path = compat_expanduser(self.get_param('netrc_location') or '~')
        if os.path.isdir(netrc_path):
            netrc_path = os.path.join(netrc_path, '.netrc')
        info = netrc.netrc(netrc_path).authenticators(netrc_machine)
    else:
        return None, None

    if not info:
        raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
    return info[0], info[2]
2118fdd1 1344
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
    """
    Get the login info as (username, password)
    First look for the manually specified credentials using username_option
    and password_option as keys in params dictionary. If no such credentials
    are available try the netrc_cmd if it is defined or look in the
    netrc file using the netrc_machine or _NETRC_MACHINE value.
    If there's no info available, return (None, None)
    """
    username = self.get_param(username_option)
    if username is not None:
        return username, self.get_param(password_option)

    try:
        return self._get_netrc_login_info(netrc_machine)
    except (OSError, netrc.NetrcParseError) as err:
        self.report_warning(f'Failed to parse .netrc: {err}')
        return None, None
fc79158d 1365
def _get_tfa_info(self, note='two-factor verification code'):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    The twofactor param is used when set; otherwise the code is asked for
    interactively through getpass.
    """
    code = self.get_param('twofactor')
    if code is None:
        code = getpass.getpass('Type %s and press [Return]: ' % note)
    return code
83317f69 1379
46720279
JMF
1380 # Helper functions for extracting OpenGraph info
1381 @staticmethod
ab2d5247 1382 def _og_regexes(prop):
45b2ee6f 1383 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
fbfde1c3
F
1384 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1385 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1386 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1387 return [
78fb87b2
JMF
1388 template % (property_re, content_re),
1389 template % (content_re, property_re),
ab2d5247 1390 ]
46720279 1391
864f24bd
S
1392 @staticmethod
1393 def _meta_regex(prop):
1394 return r'''(?isx)<meta
8b9848ac 1395 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1396 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1397
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for one or more OpenGraph properties.

    prop is a single property name or several of them; the regexes of
    all of them are tried through _search_regex (with re.DOTALL). name
    defaults to 'OpenGraph <first property>'. The match is returned
    HTML-unescaped, or None when _search_regex yields None.
    """
    candidates = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % candidates[0]
    patterns = [regex for candidate in candidates for regex in self._og_regexes(candidate)]
    raw_value = self._search_regex(patterns, html, name, flags=re.DOTALL, **kargs)
    return None if raw_value is None else unescapeHTML(raw_value)
46720279
JMF
1409
1410 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1411 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1412
1413 def _og_search_description(self, html, **kargs):
1414 return self._og_search_property('description', html, fatal=False, **kargs)
1415
04f3fd2c 1416 def _og_search_title(self, html, *, fatal=False, **kargs):
1417 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1418
8ffa13e0 1419 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1420 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1421 if secure:
1422 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1423 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1424
78338f71
JMF
1425 def _og_search_url(self, html, **kargs):
1426 return self._og_search_property('url', html, **kargs)
1427
04f3fd2c 1428 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1429 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1430
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Search html for the content of a <meta> tag.

    name is a single meta name or several of them; one _meta_regex
    pattern is tried per name. display_name defaults to the first name.
    """
    meta_names = variadic(name)
    if display_name is None:
        display_name = meta_names[0]
    patterns = list(map(self._meta_regex, meta_names))
    return self._html_search_regex(
        patterns, html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1438
1439 def _dc_search_uploader(self, html):
1440 return self._html_search_meta('dc.creator', html, 'uploader')
1441
@staticmethod
def _rta_search(html):
    """
    Derive an age limit from RTA-style markers found in html.

    Returns 18 when the RTA rating <meta> tag is present. Otherwise the
    highest limit among the matching fallback markers is returned (a
    marker without a captured age counts as 18), or 0 if none matches.
    """
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    RTA_META_RE = (r'(?ix)<meta\s+name="rating"\s+'
                   r' content="RTA-5042-1996-1400-1577-RTA"')
    if re.search(RTA_META_RE, html):
        return 18

    # And then there are the jokers who advertise that they use RTA, but actually don't.
    AGE_LIMIT_MARKERS = [
        r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
        r'>[^<]*you acknowledge you are at least (\d+) years old',
        r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
    ]

    marker_matches = (re.search(marker, html) for marker in AGE_LIMIT_MARKERS)
    detected_limits = [
        int(traverse_obj(found, 1, default=18))
        for found in marker_matches if found]
    return max([0, *detected_limits])
8dbe9899 1463
59040888
PH
1464 def _media_rating_search(self, html):
1465 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1466 rating = self._html_search_meta('rating', html)
1467
1468 if not rating:
1469 return None
1470
1471 RATING_TABLE = {
1472 'safe for kids': 0,
1473 'general': 8,
1474 '14 years': 14,
1475 'mature': 17,
1476 'restricted': 19,
1477 }
d800609c 1478 return RATING_TABLE.get(rating.lower())
59040888 1479
69319969 1480 def _family_friendly_search(self, html):
6ca7732d 1481 # See http://schema.org/VideoObject
ac8491fc
S
1482 family_friendly = self._html_search_meta(
1483 'isFamilyFriendly', html, default=None)
69319969
NJ
1484
1485 if not family_friendly:
1486 return None
1487
1488 RATING_TABLE = {
1489 '1': 0,
1490 'true': 0,
1491 '0': 18,
1492 'false': 18,
1493 }
d800609c 1494 return RATING_TABLE.get(family_friendly.lower())
69319969 1495
0c708f11
JMF
1496 def _twitter_search_player(self, html):
1497 return self._html_search_meta('twitter:player', html,
9e1a5b84 1498 'twitter card player')
0c708f11 1499
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """
    Yield all json ld objects in the html

    Every JSON_LD_RE match is parsed with _parse_json; only dict results
    (or the dict members of a parsed list) are yielded. Passing a default
    makes parsing non-fatal.
    """
    if default is not NO_DEFAULT:
        fatal = False
    for script_match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(script_match.group('json_ld'), video_id, fatal=fatal)
        yield from (entry for entry in variadic(parsed) if isinstance(entry, dict))
1509
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """
    Search for a video in any json ld in the html

    Returns the non-empty info produced by _json_ld. Otherwise returns
    default when one was given, raises RegexNotFoundError when fatal, or
    reports a warning and returns {}.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    json_ld_objects = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(json_ld_objects, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
4ca2a3cf 1526
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build an info dict from JSON-LD data.

    json_ld may be a JSON string (parsed with _parse_json), a single
    object or a list of objects. Only dict entries are examined, and
    top-level entries must carry an '@context' key; an entry consisting
    solely of '@context' and '@graph' is replaced by the members of its
    graph. When expected_type is given, entries of another '@type' are
    skipped and the list being scanned is abandoned after the first
    processed entry.

    Returns the collected fields passed through filter_dict, or {} when
    there is nothing to process.
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}

    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # '@type' may be a single value or a collection of values
        declared_types = variadic(traverse_obj(e, '@type'))
        return any(x in declared_types for x in expected_types)

    def extract_interaction_type(e):
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Fills the '<kind>_count' fields of info; the first value found for a kind wins
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # 'hasPart' entries are not guaranteed to be objects (a plain string
        # would previously raise AttributeError), so only dicts are considered
        clips = [
            part for part in variadic(e.get('hasPart') or [])
            if isinstance(part, dict) and part.get('@type') == 'Clip']
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in clips]
        # Missing boundaries are borrowed from the neighbouring chapters
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # The last chapter falls back to the duration set by extract_video_object
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        extract_interaction_statistic(e)
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            if at_top_level and '@context' not in e:
                continue
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                if expected_type is None:
                    continue
                else:
                    break
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)
4ca2a3cf 1696
135dfa2c 1697 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1698 return self._parse_json(
1699 self._search_regex(
1700 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1701 webpage, 'next.js data', fatal=fatal, **kw),
1702 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1703
8072ef2b 1704 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1705 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
66f4c04e 1706 rectx = re.escape(context_name)
377e85a1 1707 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
66f4c04e 1708 js, arg_keys, arg_vals = self._search_regex(
8072ef2b 1709 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
f7fc8d39 1710 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1711 default=NO_DEFAULT if fatal else (None, None, None))
1712 if js is None:
1713 return {}
66f4c04e 1714
b23167e7
L
1715 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1716 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
66f4c04e 1717
8072ef2b 1718 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1719 return traverse_obj(ret, traverse) or {}
66f4c04e 1720
@staticmethod
def _hidden_inputs(html):
    """
    Collect the name -> value pairs of the hidden/submit <input> tags in html.

    HTML comments that do not span a line break are removed first. The
    key is the tag's 'name' attribute, falling back to 'id'; tags without
    a key or without a 'value' attribute are ignored.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        # This guard used to test the matched tag (never empty) rather than
        # the parsed attributes, which made it dead code
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
27713812 1736
cf61d96d
S
1737 def _form_hidden_inputs(self, form_id, html):
1738 form = self._search_regex(
73eb13df 1739 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1740 html, '%s form' % form_id, group='form')
1741 return self._hidden_inputs(form)
1742
@classproperty(cache=True)
def FormatSort(cls):
    """Deprecated accessor: emits a deprecation warning and returns a FormatSorter subclass"""
    class FormatSort(FormatSorter):
        def __init__(ie, *sorter_args, **sorter_kwargs):
            # The instance's _downloader is passed as the first FormatSorter argument
            super().__init__(ie._downloader, *sorter_args, **sorter_kwargs)

    message = (
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')
    deprecation_warning(message)
    return FormatSort
eb8a4433 1753
1754 def _sort_formats(self, formats, field_preference=[]):
9f14daf2 1755 if not field_preference:
1756 self._downloader.deprecation_warning(
1757 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1758 return
1759 self._downloader.deprecation_warning(
1760 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1761 'Return _format_sort_fields in the info_dict instead')
1762 if formats:
784320c9 1763 formats[0]['__sort_fields'] = field_preference
59040888 1764
96a53167
S
1765 def _check_formats(self, formats, video_id):
1766 if formats:
1767 formats[:] = filter(
1768 lambda f: self._is_valid_url(
1769 f['url'], video_id,
1770 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1771 formats)
1772
f5bdb444
S
1773 @staticmethod
1774 def _remove_duplicate_formats(formats):
1775 format_urls = set()
1776 unique_formats = []
1777 for f in formats:
1778 if f['url'] not in format_urls:
1779 format_urls.add(f['url'])
1780 unique_formats.append(f)
1781 formats[:] = unique_formats
1782
45024183 1783 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1784 url = self._proto_relative_url(url, scheme='http:')
1785 # For now assume non HTTP(S) URLs always valid
1786 if not (url.startswith('http://') or url.startswith('https://')):
1787 return True
96a53167 1788 try:
45024183 1789 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1790 return True
8bdd16b4 1791 except ExtractorError as e:
25e911a9 1792 self.to_screen(
8bdd16b4 1793 '%s: %s URL is invalid, skipping: %s'
1794 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1795 return False
96a53167 1796
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1803
def _proto_relative_url(self, url, scheme=None):
    """
    Pass url through sanitize_url with the given scheme.

    scheme must end with ':' (e.g. 'http:'); when it is not given,
    http_scheme() is used.
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    bare_scheme = scheme[:-1]
    return sanitize_url(url, scheme=bare_scheme)
57c7411f 1808
4094b6e3
PH
1809 def _sleep(self, timeout, video_id, msg_template=None):
1810 if msg_template is None:
f1a9d64e 1811 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1812 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1813 self.to_screen(msg)
1814 time.sleep(timeout)
1815
f983b875 1816 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1817 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1818 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
0b5546c7 1819 if self.get_param('ignore_no_formats_error'):
1820 fatal = False
1821
a076c1f9 1822 res = self._download_xml_handle(
f036a632 1823 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1824 'Unable to download f4m manifest',
1825 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1826 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1827 transform_source=transform_source,
7360c06f 1828 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 1829 if res is False:
8d29e47f 1830 return []
31bb8d3f 1831
a076c1f9 1832 manifest, urlh = res
3d2623a8 1833 manifest_url = urlh.url
a076c1f9 1834
0fdbb332 1835 return self._parse_f4m_formats(
f983b875 1836 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1837 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1838
f983b875 1839 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
0fdbb332 1840 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1841 fatal=True, m3u8_id=None):
f9934b96 1842 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
d9eb580a
S
1843 return []
1844
7a5c1cfe 1845 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
fb72ec58 1846 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1847 if akamai_pv is not None and ';' in akamai_pv.text:
1848 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1849 if playerVerificationChallenge.strip() != '':
1850 return []
1851
31bb8d3f 1852 formats = []
7a47d07c 1853 manifest_version = '1.0'
b2527359 1854 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1855 if not media_nodes:
7a47d07c 1856 manifest_version = '2.0'
34e48bed 1857 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762 1858 # Remove unsupported DRM protected media from final formats
067aa17e 1859 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
b22ca762
S
1860 media_nodes = remove_encrypted_media(media_nodes)
1861 if not media_nodes:
1862 return formats
48107c19
S
1863
1864 manifest_base_url = get_base_url(manifest)
0a5685b2 1865
a6571f10 1866 bootstrap_info = xpath_element(
0a5685b2
YCH
1867 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1868 'bootstrap info', default=None)
1869
edd6074c
RA
1870 vcodec = None
1871 mime_type = xpath_text(
1872 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1873 'base URL', default=None)
1874 if mime_type and mime_type.startswith('audio/'):
1875 vcodec = 'none'
1876
b2527359 1877 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1878 tbr = int_or_none(media_el.attrib.get('bitrate'))
1879 width = int_or_none(media_el.attrib.get('width'))
1880 height = int_or_none(media_el.attrib.get('height'))
34921b43 1881 format_id = join_nonempty(f4m_id, tbr or i)
448bb5f3
YCH
1882 # If <bootstrapInfo> is present, the specified f4m is a
1883 # stream-level manifest, and only set-level manifests may refer to
1884 # external resources. See section 11.4 and section 4 of F4M spec
1885 if bootstrap_info is None:
1886 media_url = None
1887 # @href is introduced in 2.0, see section 11.6 of F4M spec
1888 if manifest_version == '2.0':
1889 media_url = media_el.attrib.get('href')
1890 if media_url is None:
1891 media_url = media_el.attrib.get('url')
31c746e5
S
1892 if not media_url:
1893 continue
cc357c4d
S
1894 manifest_url = (
1895 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 1896 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
1897 # If media_url is itself a f4m manifest do the recursive extraction
1898 # since bitrates in parent manifest (this one) and media_url manifest
1899 # may differ leading to inability to resolve the format by requested
1900 # bitrate in f4m downloader
240b6045
YCH
1901 ext = determine_ext(manifest_url)
1902 if ext == 'f4m':
77b8b4e6 1903 f4m_formats = self._extract_f4m_formats(
f983b875 1904 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
77b8b4e6
S
1905 transform_source=transform_source, fatal=fatal)
1906 # Sometimes stream-level manifest contains single media entry that
1907 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1908 # At the same time parent's media entry in set-level manifest may
1909 # contain it. We will copy it from parent in such cases.
1910 if len(f4m_formats) == 1:
1911 f = f4m_formats[0]
1912 f.update({
1913 'tbr': f.get('tbr') or tbr,
1914 'width': f.get('width') or width,
1915 'height': f.get('height') or height,
1916 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 1917 'vcodec': vcodec,
77b8b4e6
S
1918 })
1919 formats.extend(f4m_formats)
70f0f5a8 1920 continue
240b6045
YCH
1921 elif ext == 'm3u8':
1922 formats.extend(self._extract_m3u8_formats(
1923 manifest_url, video_id, 'mp4', preference=preference,
f983b875 1924 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
240b6045 1925 continue
31bb8d3f 1926 formats.append({
77b8b4e6 1927 'format_id': format_id,
31bb8d3f 1928 'url': manifest_url,
30d0b549 1929 'manifest_url': manifest_url,
a6571f10 1930 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 1931 'protocol': 'f4m',
b2527359 1932 'tbr': tbr,
77b8b4e6
S
1933 'width': width,
1934 'height': height,
edd6074c 1935 'vcodec': vcodec,
60ca389c 1936 'preference': preference,
f983b875 1937 'quality': quality,
31bb8d3f 1938 })
31bb8d3f
JMF
1939 return formats
1940
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """
    Build the format dict for the m3u8 URL itself ('Quality selection URL').

    Its preference is lowered by 100 relative to the given one (or set to
    -100 when no preference is given).
    """
    meta_format_id = join_nonempty(m3u8_id, 'meta')
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100
    return {
        'format_id': meta_format_id,
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
1952
def _report_ignoring_subs(self, name):
    """Warn (only once) that subtitle tracks found in the named manifest type are being ignored"""
    warning = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(warning, only_once=True)
1958
a0c3b2d5
F
1959 def _extract_m3u8_formats(self, *args, **kwargs):
1960 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1961 if subs:
b5ae35ee 1962 self._report_ignoring_subs('HLS')
a0c3b2d5
F
1963 return fmts
1964
1965 def _extract_m3u8_formats_and_subtitles(
177877c5 1966 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1967 preference=None, quality=None, m3u8_id=None, note=None,
1968 errnote=None, fatal=True, live=False, data=None, headers={},
1969 query={}):
1970
0b5546c7 1971 if self.get_param('ignore_no_formats_error'):
1972 fatal = False
1973
71df9b7f 1974 if not m3u8_url:
1975 if errnote is not False:
1976 errnote = errnote or 'Failed to obtain m3u8 URL'
1977 if fatal:
1978 raise ExtractorError(errnote, video_id=video_id)
1979 self.report_warning(f'{errnote}{bug_reports_message()}')
1980 return [], {}
1981
dbd82a1d 1982 res = self._download_webpage_handle(
81515ad9 1983 m3u8_url, video_id,
37a3bb66 1984 note='Downloading m3u8 information' if note is None else note,
1985 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1986 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1987
dbd82a1d 1988 if res is False:
a0c3b2d5 1989 return [], {}
cb252080 1990
dbd82a1d 1991 m3u8_doc, urlh = res
3d2623a8 1992 m3u8_url = urlh.url
9cdffeeb 1993
a0c3b2d5 1994 return self._parse_m3u8_formats_and_subtitles(
cb252080 1995 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1996 preference=preference, quality=quality, m3u8_id=m3u8_id,
1997 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1998 headers=headers, query=query, video_id=video_id)
cb252080 1999
a0c3b2d5 2000 def _parse_m3u8_formats_and_subtitles(
42676437 2001 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2002 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2003 errnote=None, fatal=True, data=None, headers={}, query={},
2004 video_id=None):
60755938 2005 formats, subtitles = [], {}
bc344cd4 2006 has_drm = HlsFD._has_drm(m3u8_doc)
a0c3b2d5 2007
60755938 2008 def format_url(url):
14f25df2 2009 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
60755938 2010
2011 if self.get_param('hls_split_discontinuity', False):
2012 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2013 if not m3u8_doc:
2014 if not manifest_url:
2015 return []
2016 m3u8_doc = self._download_webpage(
2017 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2018 note=False, errnote='Failed to download m3u8 playlist information')
2019 if m3u8_doc is False:
2020 return []
2021 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 2022
60755938 2023 else:
2024 def _extract_m3u8_playlist_indices(*args, **kwargs):
2025 return [None]
310c2ed2 2026
cb252080
S
2027 # References:
2028 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2029 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2030 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2031
2032 # We should try extracting formats only from master playlists [1, 4.3.4],
2033 # i.e. playlists that describe available qualities. On the other hand
2034 # media playlists [1, 4.3.3] should be returned as is since they contain
2035 # just the media without qualities renditions.
9cdffeeb 2036 # Fortunately, master playlist can be easily distinguished from media
cb252080 2037 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
a0566bbf 2038 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2039 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2040 # media playlist and MUST NOT appear in master playlist thus we can
2041 # clearly detect media playlist with this criterion.
2042
9cdffeeb 2043 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2044 formats = [{
34921b43 2045 'format_id': join_nonempty(m3u8_id, idx),
60755938 2046 'format_index': idx,
42676437 2047 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2048 'ext': ext,
2049 'protocol': entry_protocol,
2050 'preference': preference,
2051 'quality': quality,
88acdbc2 2052 'has_drm': has_drm,
60755938 2053 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2054
a0c3b2d5 2055 return formats, subtitles
cb252080
S
2056
2057 groups = {}
2058 last_stream_inf = {}
2059
2060 def extract_media(x_media_line):
2061 media = parse_m3u8_attributes(x_media_line)
2062 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2063 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2064 if not (media_type and group_id and name):
2065 return
2066 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2067 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2068 if media_type == 'SUBTITLES':
3907333c 2069 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2070 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2071 # However, lack of URI has been spotted in the wild.
2072 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2073 if not media.get('URI'):
2074 return
a0c3b2d5
F
2075 url = format_url(media['URI'])
2076 sub_info = {
2077 'url': url,
2078 'ext': determine_ext(url),
2079 }
4a2f19ab
F
2080 if sub_info['ext'] == 'm3u8':
2081 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2082 # files may contain is WebVTT:
2083 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2084 sub_info['ext'] = 'vtt'
2085 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2086 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2087 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2088 if media_type not in ('VIDEO', 'AUDIO'):
2089 return
2090 media_url = media.get('URI')
2091 if media_url:
310c2ed2 2092 manifest_url = format_url(media_url)
60755938 2093 formats.extend({
34921b43 2094 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2095 'format_note': name,
2096 'format_index': idx,
2097 'url': manifest_url,
2098 'manifest_url': m3u8_url,
2099 'language': media.get('LANGUAGE'),
2100 'ext': ext,
2101 'protocol': entry_protocol,
2102 'preference': preference,
2103 'quality': quality,
43a3eaf9 2104 'has_drm': has_drm,
60755938 2105 'vcodec': 'none' if media_type == 'AUDIO' else None,
2106 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2107
2108 def build_stream_name():
2109 # Despite specification does not mention NAME attribute for
3019cb0c
S
2110 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2111 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2112 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2113 stream_name = last_stream_inf.get('NAME')
2114 if stream_name:
2115 return stream_name
2116 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2117 # from corresponding rendition group
2118 stream_group_id = last_stream_inf.get('VIDEO')
2119 if not stream_group_id:
2120 return
2121 stream_group = groups.get(stream_group_id)
2122 if not stream_group:
2123 return stream_group_id
2124 rendition = stream_group[0]
2125 return rendition.get('NAME') or stream_group_id
2126
379306ef 2127 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2128 # chance to detect video only formats when EXT-X-STREAM-INF tags
2129 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2130 for line in m3u8_doc.splitlines():
2131 if line.startswith('#EXT-X-MEDIA:'):
2132 extract_media(line)
2133
704df56d
PH
2134 for line in m3u8_doc.splitlines():
2135 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2136 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2137 elif line.startswith('#') or not line.strip():
2138 continue
2139 else:
9c99bef7 2140 tbr = float_or_none(
3089bc74
S
2141 last_stream_inf.get('AVERAGE-BANDWIDTH')
2142 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2143 manifest_url = format_url(line.strip())
5ef62fc4 2144
60755938 2145 for idx in _extract_m3u8_playlist_indices(manifest_url):
2146 format_id = [m3u8_id, None, idx]
310c2ed2 2147 # Bandwidth of live streams may differ over time thus making
2148 # format_id unpredictable. So it's better to keep provided
2149 # format_id intact.
2150 if not live:
60755938 2151 stream_name = build_stream_name()
34921b43 2152 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2153 f = {
34921b43 2154 'format_id': join_nonempty(*format_id),
60755938 2155 'format_index': idx,
310c2ed2 2156 'url': manifest_url,
2157 'manifest_url': m3u8_url,
2158 'tbr': tbr,
2159 'ext': ext,
2160 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2161 'protocol': entry_protocol,
2162 'preference': preference,
2163 'quality': quality,
43a3eaf9 2164 'has_drm': has_drm,
310c2ed2 2165 }
2166 resolution = last_stream_inf.get('RESOLUTION')
2167 if resolution:
2168 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2169 if mobj:
2170 f['width'] = int(mobj.group('width'))
2171 f['height'] = int(mobj.group('height'))
2172 # Unified Streaming Platform
2173 mobj = re.search(
2174 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2175 if mobj:
2176 abr, vbr = mobj.groups()
2177 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2178 f.update({
2179 'vbr': vbr,
2180 'abr': abr,
2181 })
2182 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2183 f.update(codecs)
2184 audio_group_id = last_stream_inf.get('AUDIO')
2185 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2186 # references a rendition group MUST have a CODECS attribute.
62b58c09 2187 # However, this is not always respected. E.g. [2]
310c2ed2 2188 # contains EXT-X-STREAM-INF tag which references AUDIO
2189 # rendition group but does not have CODECS and despite
2190 # referencing an audio group it represents a complete
2191 # (with audio and video) format. So, for such cases we will
2192 # ignore references to rendition groups and treat them
2193 # as complete formats.
2194 if audio_group_id and codecs and f.get('vcodec') != 'none':
2195 audio_group = groups.get(audio_group_id)
2196 if audio_group and audio_group[0].get('URI'):
2197 # TODO: update acodec for audio only formats with
2198 # the same GROUP-ID
2199 f['acodec'] = 'none'
fc21af50 2200 if not f.get('ext'):
2201 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2202 formats.append(f)
2203
2204 # for DailyMotion
2205 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2206 if progressive_uri:
2207 http_f = f.copy()
2208 del http_f['manifest_url']
2209 http_f.update({
2210 'format_id': f['format_id'].replace('hls-', 'http-'),
2211 'protocol': 'http',
2212 'url': progressive_uri,
2213 })
2214 formats.append(http_f)
5ef62fc4 2215
cb252080 2216 last_stream_inf = {}
a0c3b2d5 2217 return formats, subtitles
704df56d 2218
3cf4b91d
C
2219 def _extract_m3u8_vod_duration(
2220 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2221
2222 m3u8_vod = self._download_webpage(
2223 m3u8_vod_url, video_id,
2224 note='Downloading m3u8 VOD manifest' if note is None else note,
2225 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2226 fatal=False, data=data, headers=headers, query=query)
2227
2228 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2229
2230 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
5ab3534d 2231 if '#EXT-X-ENDLIST' not in m3u8_vod:
3cf4b91d
C
2232 return None
2233
2234 return int(sum(
2235 float(line[len('#EXTINF:'):].split(',')[0])
2236 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2237
5ab3534d 2238 def _extract_mpd_vod_duration(
2239 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2240
2241 mpd_doc = self._download_xml(
2242 mpd_url, video_id,
2243 note='Downloading MPD VOD manifest' if note is None else note,
2244 errnote='Failed to download VOD manifest' if errnote is None else errnote,
d4f14a72 2245 fatal=False, data=data, headers=headers, query=query)
2246 if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2247 return None
5ab3534d 2248 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2249
a107193e
S
2250 @staticmethod
2251 def _xpath_ns(path, namespace=None):
2252 if not namespace:
2253 return path
2254 out = []
2255 for c in path.split('/'):
2256 if not c or c == '.':
2257 out.append(c)
2258 else:
2259 out.append('{%s}%s' % (namespace, c))
2260 return '/'.join(out)
2261
da1c94ee 2262 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
0b5546c7 2263 if self.get_param('ignore_no_formats_error'):
2264 fatal = False
2265
a076c1f9
E
2266 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2267 if res is False:
995029a1 2268 assert not fatal
774a46c5 2269 return [], {}
a076c1f9 2270 smil, urlh = res
a107193e 2271
550e6541 2272 return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2273 namespace=self._parse_smil_namespace(smil))
da1c94ee
F
2274
2275 def _extract_smil_formats(self, *args, **kwargs):
2276 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2277 if subs:
b5ae35ee 2278 self._report_ignoring_subs('SMIL')
da1c94ee 2279 return fmts
a107193e
S
2280
2281 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
a076c1f9
E
2282 res = self._download_smil(smil_url, video_id, fatal=fatal)
2283 if res is False:
a107193e 2284 return {}
a076c1f9
E
2285
2286 smil, urlh = res
3d2623a8 2287 smil_url = urlh.url
a076c1f9 2288
a107193e
S
2289 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2290
09f572fb 2291 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a076c1f9 2292 return self._download_xml_handle(
a107193e 2293 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2294 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2295
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    Formats and subtitles come from `_parse_smil_formats_and_subtitles`.
    Title, description and upload date are taken from the first matching
    `<head><meta>` entries, and thumbnails from every `<image>` element
    that has a `src`.

    NOTE: the returned `id` is the basename of `smil_url` without its
    extension, not the `video_id` passed in (which is only forwarded to
    the format parser).
    """
    namespace = self._parse_smil_namespace(smil)

    formats, subtitles = self._parse_smil_formats_and_subtitles(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        # The first non-empty value wins for each field
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            upload_date = upload_date or unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2334
17712eeb
S
2335 def _parse_smil_namespace(self, smil):
2336 return self._search_regex(
2337 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2338
550e6541 2339 def _parse_smil_formats(self, *args, **kwargs):
2340 fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2341 if subs:
2342 self._report_ignoring_subs('SMIL')
2343 return fmts
2344
def _parse_smil_formats_and_subtitles(
        self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Collect formats and subtitles from a parsed SMIL document.

    Walks every `<video>`, `<audio>` and `<media>` element (each distinct
    `src` once) and emits formats depending on the kind of source:
    RTMP, HLS (m3u8), HDS (f4m), DASH (mpd), Smooth Streaming (.ism) or
    plain HTTP. `<imagestream>` elements are added as storyboard formats
    and `<textstream>` elements as subtitles.

    @param smil                 Root element of the SMIL document
    @param smil_url             URL of the document; used as the base for
                                relative sources unless a `<head><meta>`
                                supplies `base`/`httpBase`
    @param video_id             Passed on to the download helpers
    @param namespace            XML namespace applied to all element lookups
    @param f4m_params           Query parameters appended to f4m URLs; a
                                built-in default is used when falsy
    @param transform_rtmp_url   Optional callable `(streamer, src)` returning
                                the replacement `(url, play_path)` for RTMP
    @returns                    Tuple `(formats, subtitles)`
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats, subtitles = [], {}
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    srcs = set()
    media = itertools.chain.from_iterable(
        smil.findall(self._xpath_ns(arg, namespace))
        for arg in ['.//video', './/audio', './/media'])
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        # Fall back to a HEAD request only if neither the URL nor the
        # element's `ext` attribute reveals the extension
        src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
            self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            self._merge_subtitles(m3u8_subs, target=subtitles)
            # A single-variant playlist carries no metadata of its own,
            # so take it from the SMIL element instead
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                src_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(mpd_formats)
            self._merge_subtitles(mpd_subs, target=subtitles)
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
                src_url, video_id, ism_id='mss', fatal=False)
            formats.extend(ism_formats)
            self._merge_subtitles(ism_subs, target=subtitles)
        # Validate the resolved URL (the one stored in the format), not the
        # raw `src` attribute, which may be relative to `base`
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
    self._merge_subtitles(smil_subs, target=subtitles)

    return formats, subtitles
e89a2aab 2470
ce00af87 2471 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2472 urls = []
a107193e
S
2473 subtitles = {}
2474 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2475 src = textstream.get('src')
d413095f 2476 if not src or src in urls:
a107193e 2477 continue
d413095f 2478 urls.append(src)
df634be2 2479 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2480 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2481 subtitles.setdefault(lang, []).append({
2482 'url': src,
2483 'ext': ext,
2484 })
2485 return subtitles
63757032 2486
47a5cb77 2487 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
a076c1f9 2488 res = self._download_xml_handle(
47a5cb77 2489 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5 2490 'Unable to download xspf manifest', fatal=fatal)
a076c1f9 2491 if res is False:
942acef5 2492 return []
a076c1f9
E
2493
2494 xspf, urlh = res
3d2623a8 2495 xspf_url = urlh.url
a076c1f9 2496
47a5cb77
S
2497 return self._parse_xspf(
2498 xspf, playlist_id, xspf_url=xspf_url,
2499 xspf_base_url=base_url(xspf_url))
8d6765cf 2500
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn every `<track>` of an XSPF document into an entry dict.

    Track locations are resolved against `xspf_base_url`; locations that do
    not resolve to a URL are skipped. Every entry uses `playlist_id` as its
    `id`, and as its title when the track has none.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        # The value is scaled down by 1000 before being stored
        duration = float_or_none(
            xpath_text(track, ns('./xspf:duration'), 'duration'), 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
2540
171e59ed
F
2541 def _extract_mpd_formats(self, *args, **kwargs):
2542 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2543 if subs:
b5ae35ee 2544 self._report_ignoring_subs('DASH')
171e59ed
F
2545 return fmts
2546
4ce57d3b
A
2547 def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2548 periods = self._extract_mpd_periods(*args, **kwargs)
2549 return self._merge_mpd_periods(periods)
2550
2551 def _extract_mpd_periods(
171e59ed
F
2552 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2553 fatal=True, data=None, headers={}, query={}):
0b5546c7 2554
2555 if self.get_param('ignore_no_formats_error'):
2556 fatal = False
2557
47a5cb77 2558 res = self._download_xml_handle(
1bac3455 2559 mpd_url, video_id,
37a3bb66 2560 note='Downloading MPD manifest' if note is None else note,
2561 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2562 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2563 if res is False:
4ce57d3b 2564 return []
47a5cb77 2565 mpd_doc, urlh = res
c25720ef 2566 if mpd_doc is None:
4ce57d3b 2567 return []
779da8e3
E
2568
2569 # We could have been redirected to a new url when we retrieved our mpd file.
3d2623a8 2570 mpd_url = urlh.url
779da8e3 2571 mpd_base_url = base_url(mpd_url)
1bac3455 2572
4ce57d3b 2573 return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2574
171e59ed
F
2575 def _parse_mpd_formats(self, *args, **kwargs):
2576 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2577 if subs:
b5ae35ee 2578 self._report_ignoring_subs('DASH')
171e59ed
F
2579 return fmts
2580
4ce57d3b
A
2581 def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2582 periods = self._parse_mpd_periods(*args, **kwargs)
2583 return self._merge_mpd_periods(periods)
2584
2585 def _merge_mpd_periods(self, periods):
2586 """
2587 Combine all formats and subtitles from an MPD manifest into a single list,
2588 by concatenate streams with similar formats.
2589 """
2590 formats, subtitles = {}, {}
2591 for period in periods:
2592 for f in period['formats']:
2593 assert 'is_dash_periods' not in f, 'format already processed'
2594 f['is_dash_periods'] = True
2595 format_key = tuple(v for k, v in f.items() if k not in (
2596 ('format_id', 'fragments', 'manifest_stream_number')))
2597 if format_key not in formats:
2598 formats[format_key] = f
2599 elif 'fragments' in f:
2600 formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2601
2602 if subtitles and period['subtitles']:
2603 self.report_warning(bug_reports_message(
2604 'Found subtitles in multiple periods in the DASH manifest; '
2605 'if part of the subtitles are missing,'
2606 ), only_once=True)
2607
2608 for sub_lang, sub_info in period['subtitles'].items():
2609 subtitles.setdefault(sub_lang, []).extend(sub_info)
2610
2611 return list(formats.values()), subtitles
2612
2613 def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2614 """
2615 Parse formats from MPD manifest.
2616 References:
2617 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2618 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2619 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2620 """
a06916d9 2621 if not self.get_param('dynamic_mpd', True):
78895bd3 2622 if mpd_doc.get('type') == 'dynamic':
171e59ed 2623 return [], {}
2d2fa82d 2624
91cb6b50 2625 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2626
2627 def _add_ns(path):
2628 return self._xpath_ns(path, namespace)
2629
675d0016 2630 def is_drm_protected(element):
2631 return element.find(_add_ns('ContentProtection')) is not None
2632
1bac3455 2633 def extract_multisegment_info(element, ms_parent_info):
2634 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2635
2636 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2637 # common attributes and elements. We will only extract relevant
2638 # for us.
2639 def extract_common(source):
2640 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2641 if segment_timeline is not None:
2642 s_e = segment_timeline.findall(_add_ns('S'))
2643 if s_e:
2644 ms_info['total_number'] = 0
2645 ms_info['s'] = []
2646 for s in s_e:
2647 r = int(s.get('r', 0))
2648 ms_info['total_number'] += 1 + r
2649 ms_info['s'].append({
2650 't': int(s.get('t', 0)),
2651 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2652 'd': int(s.attrib['d']),
2653 'r': r,
2654 })
2655 start_number = source.get('startNumber')
2656 if start_number:
2657 ms_info['start_number'] = int(start_number)
2658 timescale = source.get('timescale')
2659 if timescale:
2660 ms_info['timescale'] = int(timescale)
2661 segment_duration = source.get('duration')
2662 if segment_duration:
48504785 2663 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2664
2665 def extract_Initialization(source):
2666 initialization = source.find(_add_ns('Initialization'))
2667 if initialization is not None:
2668 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2669
f14be228 2670 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2671 if segment_list is not None:
b4c1d6e8
S
2672 extract_common(segment_list)
2673 extract_Initialization(segment_list)
f14be228 2674 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2675 if segment_urls_e:
2676 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2677 else:
f14be228 2678 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2679 if segment_template is not None:
b4c1d6e8 2680 extract_common(segment_template)
e228616c
S
2681 media = segment_template.get('media')
2682 if media:
2683 ms_info['media'] = media
1bac3455 2684 initialization = segment_template.get('initialization')
2685 if initialization:
e228616c 2686 ms_info['initialization'] = initialization
1bac3455 2687 else:
b4c1d6e8 2688 extract_Initialization(segment_template)
1bac3455 2689 return ms_info
b323e170 2690
1bac3455 2691 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
234416e4 2692 stream_numbers = collections.defaultdict(int)
4ce57d3b
A
2693 for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2694 period_entry = {
2695 'id': period.get('id', f'period-{period_idx}'),
2696 'formats': [],
2697 'subtitles': collections.defaultdict(list),
2698 }
1bac3455 2699 period_duration = parse_duration(period.get('duration')) or mpd_duration
2700 period_ms_info = extract_multisegment_info(period, {
2701 'start_number': 1,
2702 'timescale': 1,
2703 })
f14be228 2704 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2705 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2706 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2707 representation_attrib = adaptation_set.attrib.copy()
2708 representation_attrib.update(representation.attrib)
f0948348 2709 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2710 mime_type = representation_attrib['mimeType']
171e59ed
F
2711 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2712
21633673 2713 codec_str = representation_attrib.get('codecs', '')
2714 # Some kind of binary subtitle found in some youtube livestreams
2715 if mime_type == 'application/x-rawcc':
2716 codecs = {'scodec': codec_str}
2717 else:
2718 codecs = parse_codecs(codec_str)
be2fc5b2 2719 if content_type not in ('video', 'audio', 'text'):
2720 if mime_type == 'image/jpeg':
a8731fcc 2721 content_type = mime_type
21633673 2722 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2723 content_type = 'video'
21633673 2724 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2725 content_type = 'audio'
3fe75fdc 2726 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2727 content_type = 'text'
6993f78d 2728 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2729 content_type = 'text'
cdb19aa4 2730 else:
be2fc5b2 2731 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2732 continue
2733
2734 base_url = ''
2735 for element in (representation, adaptation_set, period, mpd_doc):
2736 base_url_e = element.find(_add_ns('BaseURL'))
47046464 2737 if try_call(lambda: base_url_e.text) is not None:
be2fc5b2 2738 base_url = base_url_e.text + base_url
2739 if re.match(r'^https?://', base_url):
2740 break
f9cc0161 2741 if mpd_base_url and base_url.startswith('/'):
14f25df2 2742 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
f9cc0161
D
2743 elif mpd_base_url and not re.match(r'^https?://', base_url):
2744 if not mpd_base_url.endswith('/'):
be2fc5b2 2745 mpd_base_url += '/'
2746 base_url = mpd_base_url + base_url
2747 representation_id = representation_attrib.get('id')
2748 lang = representation_attrib.get('lang')
2749 url_el = representation.find(_add_ns('BaseURL'))
2750 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2751 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2752 if representation_id is not None:
2753 format_id = representation_id
2754 else:
2755 format_id = content_type
2756 if mpd_id:
2757 format_id = mpd_id + '-' + format_id
2758 if content_type in ('video', 'audio'):
2759 f = {
2760 'format_id': format_id,
2761 'manifest_url': mpd_url,
2762 'ext': mimetype2ext(mime_type),
2763 'width': int_or_none(representation_attrib.get('width')),
2764 'height': int_or_none(representation_attrib.get('height')),
2765 'tbr': float_or_none(bandwidth, 1000),
2766 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2767 'fps': int_or_none(representation_attrib.get('frameRate')),
2768 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2769 'format_note': 'DASH %s' % content_type,
2770 'filesize': filesize,
2771 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2772 **codecs
be2fc5b2 2773 }
be2fc5b2 2774 elif content_type == 'text':
2775 f = {
2776 'ext': mimetype2ext(mime_type),
2777 'manifest_url': mpd_url,
2778 'filesize': filesize,
2779 }
2780 elif content_type == 'image/jpeg':
2781 # See test case in VikiIE
2782 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2783 f = {
2784 'format_id': format_id,
2785 'ext': 'mhtml',
2786 'manifest_url': mpd_url,
2787 'format_note': 'DASH storyboards (jpeg)',
2788 'acodec': 'none',
2789 'vcodec': 'none',
2790 }
88acdbc2 2791 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2792 f['has_drm'] = True
be2fc5b2 2793 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2794
2795 def prepare_template(template_name, identifiers):
2796 tmpl = representation_ms_info[template_name]
0cb0fdbb 2797 if representation_id is not None:
2798 tmpl = tmpl.replace('$RepresentationID$', representation_id)
be2fc5b2 2799 # First of, % characters outside $...$ templates
2800 # must be escaped by doubling for proper processing
2801 # by % operator string formatting used further (see
2802 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2803 t = ''
2804 in_template = False
2805 for c in tmpl:
2806 t += c
2807 if c == '$':
2808 in_template = not in_template
2809 elif c == '%' and not in_template:
eca1f0d1 2810 t += c
be2fc5b2 2811 # Next, $...$ templates are translated to their
2812 # %(...) counterparts to be used with % operator
be2fc5b2 2813 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2814 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2815 t.replace('$$', '$')
2816 return t
2817
2818 # @initialization is a regular template like @media one
2819 # so it should be handled just the same way (see
2820 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2821 if 'initialization' in representation_ms_info:
2822 initialization_template = prepare_template(
2823 'initialization',
2824 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2825 # $Time$ shall not be included for @initialization thus
2826 # only $Bandwidth$ remains
2827 ('Bandwidth', ))
2828 representation_ms_info['initialization_url'] = initialization_template % {
2829 'Bandwidth': bandwidth,
2830 }
2831
2832 def location_key(location):
2833 return 'url' if re.match(r'^https?://', location) else 'path'
2834
2835 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2836
2837 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2838 media_location_key = location_key(media_template)
2839
2840 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2841 # can't be used at the same time
2842 if '%(Number' in media_template and 's' not in representation_ms_info:
2843 segment_duration = None
2844 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2845 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2846 representation_ms_info['total_number'] = int(math.ceil(
2847 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2848 representation_ms_info['fragments'] = [{
2849 media_location_key: media_template % {
2850 'Number': segment_number,
2851 'Bandwidth': bandwidth,
2852 },
2853 'duration': segment_duration,
2854 } for segment_number in range(
2855 representation_ms_info['start_number'],
2856 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2857 else:
2858 # $Number*$ or $Time$ in media template with S list available
2859 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2860 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2861 representation_ms_info['fragments'] = []
2862 segment_time = 0
2863 segment_d = None
2864 segment_number = representation_ms_info['start_number']
2865
2866 def add_segment_url():
2867 segment_url = media_template % {
2868 'Time': segment_time,
2869 'Bandwidth': bandwidth,
2870 'Number': segment_number,
2871 }
2872 representation_ms_info['fragments'].append({
2873 media_location_key: segment_url,
2874 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2875 })
2876
2877 for num, s in enumerate(representation_ms_info['s']):
2878 segment_time = s.get('t') or segment_time
2879 segment_d = s['d']
2880 add_segment_url()
2881 segment_number += 1
2882 for r in range(s.get('r', 0)):
2883 segment_time += segment_d
f0948348 2884 add_segment_url()
b4c1d6e8 2885 segment_number += 1
be2fc5b2 2886 segment_time += segment_d
2887 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
62b58c09
L
2888 # No media template,
2889 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
be2fc5b2 2890 # or any YouTube dashsegments video
2891 fragments = []
2892 segment_index = 0
2893 timescale = representation_ms_info['timescale']
2894 for s in representation_ms_info['s']:
2895 duration = float_or_none(s['d'], timescale)
2896 for r in range(s.get('r', 0) + 1):
2897 segment_uri = representation_ms_info['segment_urls'][segment_index]
2898 fragments.append({
2899 location_key(segment_uri): segment_uri,
2900 'duration': duration,
2901 })
2902 segment_index += 1
2903 representation_ms_info['fragments'] = fragments
2904 elif 'segment_urls' in representation_ms_info:
2905 # Segment URLs with no SegmentTimeline
62b58c09 2906 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
be2fc5b2 2907 # https://github.com/ytdl-org/youtube-dl/pull/14844
2908 fragments = []
2909 segment_duration = float_or_none(
2910 representation_ms_info['segment_duration'],
2911 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2912 for segment_url in representation_ms_info['segment_urls']:
2913 fragment = {
2914 location_key(segment_url): segment_url,
2915 }
2916 if segment_duration:
2917 fragment['duration'] = segment_duration
2918 fragments.append(fragment)
2919 representation_ms_info['fragments'] = fragments
2920 # If there is a fragments key available then we correctly recognized fragmented media.
2921 # Otherwise we will assume unfragmented media with direct access. Technically, such
2922 # assumption is not necessarily correct since we may simply have no support for
2923 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2924 if 'fragments' in representation_ms_info:
2925 f.update({
2926 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2927 'url': mpd_url or base_url,
2928 'fragment_base_url': base_url,
2929 'fragments': [],
2930 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2931 })
2932 if 'initialization_url' in representation_ms_info:
2933 initialization_url = representation_ms_info['initialization_url']
2934 if not f.get('url'):
2935 f['url'] = initialization_url
2936 f['fragments'].append({location_key(initialization_url): initialization_url})
2937 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 2938 if not period_duration:
2939 period_duration = try_get(
2940 representation_ms_info,
2941 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 2942 else:
be2fc5b2 2943 # Assuming direct URL to unfragmented media.
2944 f['url'] = base_url
234416e4 2945 if content_type in ('video', 'audio', 'image/jpeg'):
2946 f['manifest_stream_number'] = stream_numbers[f['url']]
2947 stream_numbers[f['url']] += 1
4ce57d3b 2948 period_entry['formats'].append(f)
be2fc5b2 2949 elif content_type == 'text':
4ce57d3b
A
2950 period_entry['subtitles'][lang or 'und'].append(f)
2951 yield period_entry
17b598d3 2952
fd76a142
F
2953 def _extract_ism_formats(self, *args, **kwargs):
2954 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2955 if subs:
b5ae35ee 2956 self._report_ignoring_subs('ISM')
fd76a142
F
2957 return fmts
2958
2959 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
0b5546c7 2960 if self.get_param('ignore_no_formats_error'):
2961 fatal = False
2962
47a5cb77 2963 res = self._download_xml_handle(
b2758123 2964 ism_url, video_id,
37a3bb66 2965 note='Downloading ISM manifest' if note is None else note,
2966 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2967 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2968 if res is False:
fd76a142 2969 return [], {}
47a5cb77 2970 ism_doc, urlh = res
13b08034 2971 if ism_doc is None:
fd76a142 2972 return [], {}
b2758123 2973
3d2623a8 2974 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
b2758123 2975
fd76a142 2976 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2977 """
2978 Parse formats from ISM manifest.
2979 References:
2980 1. [MS-SSTR]: Smooth Streaming Protocol,
2981 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2982 """
06869367 2983 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 2984 return [], {}
b2758123 2985
b2758123
RA
2986 duration = int(ism_doc.attrib['Duration'])
2987 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2988
2989 formats = []
fd76a142 2990 subtitles = {}
b2758123
RA
2991 for stream in ism_doc.findall('StreamIndex'):
2992 stream_type = stream.get('Type')
fd76a142 2993 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
2994 continue
2995 url_pattern = stream.attrib['Url']
2996 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2997 stream_name = stream.get('Name')
fd76a142 2998 stream_language = stream.get('Language', 'und')
b2758123 2999 for track in stream.findall('QualityLevel'):
81b6102d 3000 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3001 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
b2758123 3002 # TODO: add support for WVC1 and WMAP
81b6102d 3003 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
b2758123
RA
3004 self.report_warning('%s is not a supported codec' % fourcc)
3005 continue
3006 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
3007 # [1] does not mention Width and Height attributes. However,
3008 # they're often present while MaxWidth and MaxHeight are
3009 # missing, so should be used as fallbacks
3010 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3011 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
3012 sampling_rate = int_or_none(track.get('SamplingRate'))
3013
3014 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
14f25df2 3015 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
b2758123
RA
3016
3017 fragments = []
3018 fragment_ctx = {
3019 'time': 0,
3020 }
3021 stream_fragments = stream.findall('c')
3022 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3023 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3024 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3025 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3026 if not fragment_ctx['duration']:
3027 try:
3028 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3029 except IndexError:
3030 next_fragment_time = duration
1616f9b4 3031 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
3032 for _ in range(fragment_repeat):
3033 fragments.append({
14f25df2 3034 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
3035 'duration': fragment_ctx['duration'] / stream_timescale,
3036 })
3037 fragment_ctx['time'] += fragment_ctx['duration']
3038
fd76a142
F
3039 if stream_type == 'text':
3040 subtitles.setdefault(stream_language, []).append({
3041 'ext': 'ismt',
3042 'protocol': 'ism',
3043 'url': ism_url,
3044 'manifest_url': ism_url,
3045 'fragments': fragments,
3046 '_download_params': {
3047 'stream_type': stream_type,
3048 'duration': duration,
3049 'timescale': stream_timescale,
3050 'fourcc': fourcc,
3051 'language': stream_language,
3052 'codec_private_data': track.get('CodecPrivateData'),
3053 }
3054 })
3055 elif stream_type in ('video', 'audio'):
3056 formats.append({
34921b43 3057 'format_id': join_nonempty(ism_id, stream_name, tbr),
fd76a142
F
3058 'url': ism_url,
3059 'manifest_url': ism_url,
3060 'ext': 'ismv' if stream_type == 'video' else 'isma',
3061 'width': width,
3062 'height': height,
3063 'tbr': tbr,
3064 'asr': sampling_rate,
3065 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3066 'acodec': 'none' if stream_type == 'video' else fourcc,
3067 'protocol': 'ism',
3068 'fragments': fragments,
88acdbc2 3069 'has_drm': ism_doc.find('Protection') is not None,
f68434cc 3070 'language': stream_language,
3071 'audio_channels': int_or_none(track.get('Channels')),
fd76a142
F
3072 '_download_params': {
3073 'stream_type': stream_type,
3074 'duration': duration,
3075 'timescale': stream_timescale,
3076 'width': width or 0,
3077 'height': height or 0,
3078 'fourcc': fourcc,
3079 'language': stream_language,
3080 'codec_private_data': track.get('CodecPrivateData'),
3081 'sampling_rate': sampling_rate,
3082 'channels': int_or_none(track.get('Channels', 2)),
3083 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3084 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3085 },
3086 })
3087 return formats, subtitles
b2758123 3088
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        """Collect media entries from <video>/<audio>-like tags found in webpage.

        Each returned entry is a dict with 'formats' and 'subtitles' (plus
        'thumbnail' when the tag itself carries a source). Entries that have
        neither formats nor subtitles are dropped. base_url is used both to
        resolve relative URLs and as the Referer header of every format.
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Turn a "type" attribute (mimetype with optional codecs=...)
            # into a partial format dict; {} when absent or unparsable
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) are
            # expanded through the matching extractor helper, anything else
            # becomes a single direct-URL format
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags have no inner content, so an empty string is
        # appended to give them the same 4-tuple shape as the paired tags below
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
                media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fill in missing dimensions from any label that
                            # parses as a resolution
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # Use the first label that parses as a bitrate; the
                        # for-else leaves tbr as None when none does
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        # The direct-URL format is applied last, so its
                        # url/vcodec/ext override the values gathered above
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # A track without a kind is handled like subtitles/captions
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3214
f6a1d69a
F
3215 def _extract_akamai_formats(self, *args, **kwargs):
3216 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3217 if subs:
b5ae35ee 3218 self._report_ignoring_subs('akamai')
f6a1d69a
F
3219 return fmts
3220
3221 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3222 signed = 'hdnea=' in manifest_url
3223 if not signed:
3224 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3225 manifest_url = re.sub(
3226 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3227 '', manifest_url).strip('?')
3228
c7c43a93 3229 formats = []
f6a1d69a 3230 subtitles = {}
70c5802b 3231
e71a4509 3232 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3233 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3234 hds_host = hosts.get('hds')
3235 if hds_host:
3236 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3237 if 'hdcore=' not in f4m_url:
3238 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3239 f4m_formats = self._extract_f4m_formats(
3240 f4m_url, video_id, f4m_id='hds', fatal=False)
3241 for entry in f4m_formats:
3242 entry.update({'extra_param_to_segment_url': hdcore_sign})
3243 formats.extend(f4m_formats)
70c5802b 3244
c4251b9a
RA
3245 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3246 hls_host = hosts.get('hls')
3247 if hls_host:
3248 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3249 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3250 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3251 m3u8_id='hls', fatal=False)
3252 formats.extend(m3u8_formats)
f6a1d69a 3253 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3254
3255 http_host = hosts.get('http')
29f7c58a 3256 if http_host and m3u8_formats and not signed:
3257 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3258 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3259 qualities_length = len(qualities)
29f7c58a 3260 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3261 i = 0
29f7c58a 3262 for f in m3u8_formats:
3263 if f['vcodec'] != 'none':
70c5802b 3264 for protocol in ('http', 'https'):
3265 http_f = f.copy()
3266 del http_f['manifest_url']
3267 http_url = re.sub(
86e5f3ed 3268 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3269 http_f.update({
3270 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3271 'url': http_url,
3272 'protocol': protocol,
3273 })
29f7c58a 3274 formats.append(http_f)
70c5802b 3275 i += 1
70c5802b 3276
f6a1d69a 3277 return formats, subtitles
c7c43a93 3278
6ad02195 3279 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3280 query = urllib.parse.urlparse(url).query
6ad02195 3281 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3282 mobj = re.search(
3283 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3284 url_base = mobj.group('url')
3285 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3286 formats = []
044eeb14
S
3287
3288 def manifest_url(manifest):
86e5f3ed 3289 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3290 if query:
3291 m_url += '?%s' % query
3292 return m_url
3293
6ad02195
RA
3294 if 'm3u8' not in skip_protocols:
3295 formats.extend(self._extract_m3u8_formats(
044eeb14 3296 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3297 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3298 if 'f4m' not in skip_protocols:
3299 formats.extend(self._extract_f4m_formats(
044eeb14 3300 manifest_url('manifest.f4m'),
6ad02195 3301 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3302 if 'dash' not in skip_protocols:
3303 formats.extend(self._extract_mpd_formats(
044eeb14 3304 manifest_url('manifest.mpd'),
0384932e 3305 video_id, mpd_id='dash', fatal=False))
6ad02195 3306 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3307 if 'smil' not in skip_protocols:
3308 rtmp_formats = self._extract_smil_formats(
044eeb14 3309 manifest_url('jwplayer.smil'),
6ad02195
RA
3310 video_id, fatal=False)
3311 for rtmp_format in rtmp_formats:
3312 rtsp_format = rtmp_format.copy()
3313 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3314 del rtsp_format['play_path']
3315 del rtsp_format['ext']
3316 rtsp_format.update({
3317 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3318 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3319 'protocol': 'rtsp',
3320 })
3321 formats.extend([rtmp_format, rtsp_format])
3322 else:
3323 for protocol in ('rtmp', 'rtsp'):
3324 if protocol not in skip_protocols:
3325 formats.append({
86e5f3ed 3326 'url': f'{protocol}:{url_base}',
6ad02195
RA
3327 'format_id': protocol,
3328 'protocol': protocol,
3329 })
3330 return formats
3331
c73e330e 3332 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3333 mobj = re.search(
32a84bcf 3334 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
a4a554a7
YCH
3335 webpage)
3336 if mobj:
c73e330e
RU
3337 try:
3338 jwplayer_data = self._parse_json(mobj.group('options'),
3339 video_id=video_id,
3340 transform_source=transform_source)
3341 except ExtractorError:
3342 pass
3343 else:
3344 if isinstance(jwplayer_data, dict):
3345 return jwplayer_data
a4a554a7
YCH
3346
3347 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3348 jwplayer_data = self._find_jwplayer_data(
3349 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3350 return self._parse_jwplayer_data(
3351 jwplayer_data, video_id, *args, **kwargs)
3352
3353 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3354 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
a4a554a7 3355 entries = []
32a84bcf
SS
3356 if not isinstance(jwplayer_data, dict):
3357 return entries
a4a554a7 3358
32a84bcf
SS
3359 playlist_items = jwplayer_data.get('playlist')
3360 # JWPlayer backward compatibility: single playlist item/flattened playlists
a4a554a7 3361 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
32a84bcf
SS
3362 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3363 if not isinstance(playlist_items, list):
3364 playlist_items = (playlist_items or jwplayer_data, )
a4a554a7 3365
32a84bcf
SS
3366 for video_data in playlist_items:
3367 if not isinstance(video_data, dict):
3368 continue
a4a554a7
YCH
3369 # JWPlayer backward compatibility: flattened sources
3370 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3371 if 'sources' not in video_data:
3372 video_data['sources'] = [video_data]
3373
3374 this_video_id = video_id or video_data['mediaid']
3375
1a2192cb
S
3376 formats = self._parse_jwplayer_formats(
3377 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3378 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
3379
3380 subtitles = {}
3381 tracks = video_data.get('tracks')
3382 if tracks and isinstance(tracks, list):
3383 for track in tracks:
96a2daa1
S
3384 if not isinstance(track, dict):
3385 continue
f4b74272 3386 track_kind = track.get('kind')
14f25df2 3387 if not track_kind or not isinstance(track_kind, str):
f4b74272
S
3388 continue
3389 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
3390 continue
3391 track_url = urljoin(base_url, track.get('file'))
3392 if not track_url:
3393 continue
3394 subtitles.setdefault(track.get('label') or 'en', []).append({
3395 'url': self._proto_relative_url(track_url)
3396 })
3397
50d808f5 3398 entry = {
a4a554a7 3399 'id': this_video_id,
50d808f5 3400 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
f81dd65b 3401 'description': clean_html(video_data.get('description')),
6945b9e7 3402 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
a4a554a7
YCH
3403 'timestamp': int_or_none(video_data.get('pubdate')),
3404 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3405 'subtitles': subtitles,
32a84bcf
SS
3406 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3407 'genre': clean_html(video_data.get('genre')),
3408 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3409 'season_number': int_or_none(video_data.get('season')),
3410 'episode_number': int_or_none(video_data.get('episode')),
3411 'release_year': int_or_none(video_data.get('releasedate')),
3412 'age_limit': int_or_none(video_data.get('age_restriction')),
50d808f5
RA
3413 }
3414 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3415 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3416 entry.update({
3417 '_type': 'url_transparent',
3418 'url': formats[0]['url'],
3419 })
3420 else:
50d808f5
RA
3421 entry['formats'] = formats
3422 entries.append(entry)
a4a554a7
YCH
3423 if len(entries) == 1:
3424 return entries[0]
3425 else:
3426 return self.playlist_result(entries)
3427
ed0cf9b3
S
3428 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3429 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
32a84bcf 3430 urls = set()
ed0cf9b3 3431 formats = []
1a2192cb 3432 for source in jwplayer_sources_data:
0a268c6e
S
3433 if not isinstance(source, dict):
3434 continue
6945b9e7
RA
3435 source_url = urljoin(
3436 base_url, self._proto_relative_url(source.get('file')))
3437 if not source_url or source_url in urls:
bf1b87cd 3438 continue
32a84bcf 3439 urls.add(source_url)
ed0cf9b3
S
3440 source_type = source.get('type') or ''
3441 ext = mimetype2ext(source_type) or determine_ext(source_url)
32a84bcf 3442 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
ed0cf9b3 3443 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
3444 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3445 m3u8_id=m3u8_id, fatal=False))
32a84bcf 3446 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
ed0cf9b3
S
3447 formats.extend(self._extract_mpd_formats(
3448 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
3449 elif ext == 'smil':
3450 formats.extend(self._extract_smil_formats(
3451 source_url, video_id, fatal=False))
ed0cf9b3 3452 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
3453 elif source_type.startswith('audio') or ext in (
3454 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
3455 formats.append({
3456 'url': source_url,
3457 'vcodec': 'none',
3458 'ext': ext,
3459 })
3460 else:
32a84bcf 3461 format_id = str_or_none(source.get('label'))
ed0cf9b3 3462 height = int_or_none(source.get('height'))
32a84bcf 3463 if height is None and format_id:
ed0cf9b3 3464 # Often no height is provided but there is a label in
0236cd0d 3465 # format like "1080p", "720p SD", or 1080.
32a84bcf 3466 height = parse_resolution(format_id).get('height')
ed0cf9b3
S
3467 a_format = {
3468 'url': source_url,
3469 'width': int_or_none(source.get('width')),
3470 'height': height,
d3a3d7f0 3471 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3472 'filesize': int_or_none(source.get('filesize')),
ed0cf9b3 3473 'ext': ext,
32a84bcf 3474 'format_id': format_id
ed0cf9b3
S
3475 }
3476 if source_url.startswith('rtmp'):
3477 a_format['ext'] = 'flv'
ed0cf9b3
S
3478 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3479 # of jwplayer.flash.swf
3480 rtmp_url_parts = re.split(
3481 r'((?:mp4|mp3|flv):)', source_url, 1)
3482 if len(rtmp_url_parts) == 3:
3483 rtmp_url, prefix, play_path = rtmp_url_parts
3484 a_format.update({
3485 'url': rtmp_url,
3486 'play_path': prefix + play_path,
3487 })
3488 if rtmp_params:
3489 a_format.update(rtmp_params)
3490 formats.append(a_format)
3491 return formats
3492
f4b1c7ad 3493 def _live_title(self, name):
39ca3b5c 3494 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3495 return name
f4b1c7ad 3496
b14f3a4c
PH
3497 def _int(self, v, name, fatal=False, **kwargs):
3498 res = int_or_none(v, **kwargs)
b14f3a4c 3499 if res is None:
86e5f3ed 3500 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3501 if fatal:
3502 raise ExtractorError(msg)
3503 else:
6a39ee13 3504 self.report_warning(msg)
b14f3a4c
PH
3505 return res
3506
3507 def _float(self, v, name, fatal=False, **kwargs):
3508 res = float_or_none(v, **kwargs)
3509 if res is None:
86e5f3ed 3510 msg = f'Failed to extract {name}: Could not parse value {v!r}'
b14f3a4c
PH
3511 if fatal:
3512 raise ExtractorError(msg)
3513 else:
6a39ee13 3514 self.report_warning(msg)
b14f3a4c
PH
3515 return res
3516
40e41780
TF
3517 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3518 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3519 cookie = http.cookiejar.Cookie(
4ed2d7b7 3520 0, name, value, port, port is not None, domain, True,
40e41780
TF
3521 domain.startswith('.'), path, True, secure, expire_time,
3522 discard, None, None, rest)
9809740b 3523 self.cookiejar.set_cookie(cookie)
42939b61 3524
799207e8 3525 def _get_cookies(self, url):
ac668111 3526 """ Return a http.cookies.SimpleCookie with the cookies for the url """
b87e01c1 3527 return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
799207e8 3528
e3c1266f 3529 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3530 """
3531 Apply first Set-Cookie header instead of the last. Experimental.
3532
3533 Some sites (e.g. [1-3]) may serve two cookies under the same name
3534 in Set-Cookie header and expect the first (old) one to be set rather
3535 than second (new). However, as of RFC6265 the newer one cookie
3536 should be set into cookie store what actually happens.
3537 We will workaround this issue by resetting the cookie to
3538 the first one manually.
3539 1. https://new.vk.com/
3540 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3541 3. https://learning.oreilly.com/
3542 """
e3c1266f
S
3543 for header, cookies in url_handle.headers.items():
3544 if header.lower() != 'set-cookie':
3545 continue
cfb0511d 3546 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3547 cookie_value = re.search(
3548 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3549 if cookie_value:
3550 value, domain = cookie_value.groups()
3551 self._set_cookie(domain, cookie, value)
3552 break
3553
82d02080 3554 @classmethod
3555 def get_testcases(cls, include_onlymatching=False):
6368e2e6 3556 # Do not look in super classes
3557 t = vars(cls).get('_TEST')
05900629 3558 if t:
82d02080 3559 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
05900629
PH
3560 tests = [t]
3561 else:
6368e2e6 3562 tests = vars(cls).get('_TESTS', [])
05900629
PH
3563 for t in tests:
3564 if not include_onlymatching and t.get('only_matching', False):
3565 continue
82d02080 3566 t['name'] = cls.ie_key()
05900629 3567 yield t
e756f45b
M
3568 if getattr(cls, '__wrapped__', None):
3569 yield from cls.__wrapped__.get_testcases(include_onlymatching)
05900629 3570
f2e8dbcc 3571 @classmethod
3572 def get_webpage_testcases(cls):
6368e2e6 3573 tests = vars(cls).get('_WEBPAGE_TESTS', [])
f2e8dbcc 3574 for t in tests:
3575 t['name'] = cls.ie_key()
e756f45b
M
3576 yield t
3577 if getattr(cls, '__wrapped__', None):
3578 yield from cls.__wrapped__.get_webpage_testcases()
f2e8dbcc 3579
6368e2e6 3580 @classproperty(cache=True)
24146491 3581 def age_limit(cls):
3582 """Get age limit from the testcases"""
3583 return max(traverse_obj(
f2e8dbcc 3584 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
24146491 3585 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3586
171a31db 3587 @classproperty(cache=True)
3588 def _RETURN_TYPE(cls):
3589 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3590 tests = tuple(cls.get_testcases(include_onlymatching=False))
3591 if not tests:
3592 return None
3593 elif not any(k.startswith('playlist') for test in tests for k in test):
3594 return 'video'
3595 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3596 return 'playlist'
3597 return 'any'
3598
3599 @classmethod
3600 def is_single_video(cls, url):
3601 """Returns whether the URL is of a single video, None if unknown"""
baa922b5 3602 if cls.suitable(url):
3603 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
171a31db 3604
82d02080 3605 @classmethod
3606 def is_suitable(cls, age_limit):
24146491 3607 """Test whether the extractor is generally suitable for the given age limit"""
3608 return not age_restricted(cls.age_limit, age_limit)
05900629 3609
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """
    Description of the extractor

    @param markdown         Whether to format the result as a markdown list item
    @param search_examples  Optional sequence of example queries; one is picked
                            at random when the extractor has a SEARCH_KEY
    @returns                The extractor name, followed by ":<details>" if there are any
    """
    parts = []

    netrc_machine = cls._NETRC_MACHINE
    if netrc_machine:
        parts.append(
            f' [*{netrc_machine}*](## "netrc machine")' if markdown
            else f' [{netrc_machine}]')

    ie_desc = cls.IE_DESC
    if ie_desc is False:
        parts.append(' [HIDDEN]')
    elif ie_desc:
        parts.append(f' {ie_desc}')

    search_key = cls.SEARCH_KEY
    if search_key:
        separator = ';' if ie_desc else ''
        parts.append(f'{separator} "{search_key}:" prefix')
        if search_examples:
            # Pick the count first, then the example (same order as the random draws always had)
            count = random.choice(('', '5', '10', 'all'))
            example = random.choice(search_examples)
            parts.append(f' (e.g. "{search_key}{count}:{example}")')

    if not cls.working():
        parts.append(' (**Currently broken**)' if markdown else ' (Currently broken)')

    desc = ''.join(parts)
    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME
    return f'{name}:{desc}' if desc else name
3634
def extract_subtitles(self, *args, **kwargs):
    """
    Return the result of _get_subtitles if the "writesubtitles" or
    "listsubtitles" param is set, and an empty dict otherwise
    """
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3640
3641 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3642 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3643
0cf643b2
M
class CommentsDisabled(Exception):
    """
    Raise in _get_comments if comments are disabled for the video

    extract_comments catches this and reports both "comments" and
    "comment_count" as None
    """
3646
def extract_comments(self, *args, **kwargs):
    """
    Build a deferred comment extractor

    @returns None if the "getcomments" param is not set. Otherwise a
             zero-argument callable that drains the generator returned by
             _get_comments and returns a dict with "comments" and "comment_count"
    """
    if not self.get_param('getcomments'):
        return None
    comment_iter = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        # Only cleared when the generator is exhausted normally
        interrupted = True
        try:
            while True:
                collected.append(next(comment_iter))
        except StopIteration:
            interrupted = False
        except KeyboardInterrupt:
            # Keep whatever was collected so far
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        total = len(collected)
        self.to_screen(f'Extracted {total} comments')
        return {
            'comments': collected,
            # The count is unknown if extraction did not run to completion
            'comment_count': None if interrupted else total,
        }
    return extractor
3675
3676 def _get_comments(self, *args, **kwargs):
3677 raise NotImplementedError('This method must be implemented by subclasses')
3678
912e0b7e
YCH
3679 @staticmethod
3680 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3681 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3682 will be dropped. """
86e5f3ed 3683 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3684 ret = list(subtitle_list1)
a44ca5a4 3685 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3686 return ret
3687
3688 @classmethod
46890374 3689 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3690 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3691 if target is None:
3692 target = {}
3693 for d in dicts:
3694 for lang, subs in d.items():
3695 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3696 return target
912e0b7e 3697
def extract_automatic_captions(self, *args, **kwargs):
    """
    Return the result of _get_automatic_captions if the "writeautomaticsub" or
    "listsubtitles" param is set, and an empty dict otherwise
    """
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3703
3704 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3705 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3706
2762dbb1 3707 @functools.cached_property
24146491 3708 def _cookies_passed(self):
3709 """Whether cookies have been passed to YoutubeDL"""
3710 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3711
def mark_watched(self, *args, **kwargs):
    """
    Call _mark_watched if the "mark_watched" param is set and either a login
    username is available or cookies have been passed
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3717
3718 def _mark_watched(self, *args, **kwargs):
3719 raise NotImplementedError('This method must be implemented by subclasses')
3720
38cce791
YCH
def geo_verification_headers(self):
    """
    @returns A new dict holding the "Ytdl-request-proxy" header if the
             "geo_verification_proxy" param is set, otherwise an empty dict
    """
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3727
8f97a15d 3728 @staticmethod
3729 def _generic_id(url):
14f25df2 3730 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3731
62b8dac4 3732 def _generic_title(self, url='', webpage='', *, default=None):
3733 return (self._og_search_title(webpage, default=None)
3734 or self._html_extract_title(webpage, default=None)
3735 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3736 or default)
98763ee3 3737
22ccd542 3738 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3739 if not duration:
3740 return
3741 chapter_list = [{
3742 'start_time': start_function(chapter),
3743 'title': title_function(chapter),
3744 } for chapter in chapter_list or []]
84ffeb7d 3745 if strict:
3746 warn = self.report_warning
3747 else:
3748 warn = self.write_debug
22ccd542 3749 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3750
3751 chapters = [{'start_time': 0}]
3752 for idx, chapter in enumerate(chapter_list):
3753 if chapter['start_time'] is None:
84ffeb7d 3754 warn(f'Incomplete chapter {idx}')
22ccd542 3755 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3756 chapters.append(chapter)
3757 elif chapter not in chapters:
84ffeb7d 3758 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3759 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3760 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
22ccd542 3761 return chapters[1:]
3762
3763 def _extract_chapters_from_description(self, description, duration):
3764 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3765 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3766 return self._extract_chapters_helper(
3767 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3768 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3769 duration=duration, strict=False) or self._extract_chapters_helper(
3770 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3771 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3772 duration=duration, strict=False)
3773
c224251a 3774 @staticmethod
b0089e89 3775 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3776 all_known = all(map(
3777 lambda x: x is not None,
3778 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3779 return (
3780 'private' if is_private
3781 else 'premium_only' if needs_premium
3782 else 'subscriber_only' if needs_subscription
3783 else 'needs_auth' if needs_auth
3784 else 'unlisted' if is_unlisted
3785 else 'public' if all_known
3786 else None)
3787
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Either the extractor key as a string, or an object whose
                        ie_key() provides it (default: this extractor)
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
5d3a0e79 3800
f40ee5e9 3801 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3802 if not playlist_id or not video_id:
3803 return not video_id
3804
3805 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3806 if no_playlist is not None:
3807 return not no_playlist
3808
3809 video_id = '' if video_id is True else f' {video_id}'
3810 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3811 if self.get_param('noplaylist'):
3812 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3813 return False
3814 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3815 return True
3816
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """
    Forward "err" to RetryManager.report_retry using this extractor's output methods

    When "fatal" is false, report_warning is also passed as the error handler;
    when it is true, no error handler is passed (error=None)
    """
    attempt = _count or int(fatal)
    error_handler = None if fatal else self.report_warning
    RetryManager.report_retry(
        err, attempt, _retries,
        info=self.to_screen, warn=self.report_warning, error=error_handler,
        sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
be5c1ae8 3822
def RetryManager(self, **kwargs):
    """
    Create a retry manager configured from the "extractor_retries" param
    (default 3) that reports through _error_or_warning
    """
    # Inside a method body this name resolves to the module-level RetryManager,
    # not to this method
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3825
def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """
    Run the "Generic" extractor's embed extraction on url

    The url is smuggled with a "block_ies" entry naming this extractor.
    info_dict is only read here, to prefix the screen note with the
    display_id (or id).
    """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    extract_embeds = generic_ie._extract_embeds
    blocked_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return extract_embeds(blocked_url, *args, **kwargs)
3831
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """
    Yield the results of _extract_from_webpage after passing each of them
    through ydl.add_default_extra_info
    """
    # Use the class itself if _extract_from_webpage is a bound method on it;
    # otherwise ask ydl for an extractor instance
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
8f97a15d 3840
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for each distinct embed URL found by _extract_embed_urls"""
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        # No extractor is named in the result when _VALID_URL is False
        result_ie = cls if cls._VALID_URL is not False else None
        yield cls.url_result(embed_url, result_ie)
8f97a15d 3846
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile the patterns once per class; looking in cls.__dict__ means a
    # value cached on a parent class is not reused
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, pattern in enumerate(cls._EMBED_REGEX):
            assert pattern.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{pattern}'
        cls._EMBED_URL_RE = tuple(re.compile(pattern) for pattern in cls._EMBED_REGEX)

    for compiled in cls._EMBED_URL_RE:
        for mobj in compiled.finditer(webpage):
            # Resolve the (HTML-unescaped) match against the page URL
            embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
            if cls._VALID_URL is not False and not cls.suitable(embed_url):
                continue
            yield embed_url
3862
# NOTE(review): not raised or caught in the visible part of this file;
# presumably used to abort an extraction early - confirm against callers
class StopExtraction(Exception):
    pass
3865
bfd973ec 3866 @classmethod
3867 def _extract_url(cls, webpage): # TODO: Remove
3868 """Only for compatibility with some older extractors"""
3869 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3870
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    # A subclass declared with `plugin_name=...` is registered as an override
    # of the class that follows it in its MRO
    if plugin_name:
        mro = inspect.getmro(cls)
        # The class right after `cls` in the MRO is the one being overridden
        super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
        cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
        cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
        # Follow the chain of `__wrapped__` links down to the innermost class
        while getattr(super_class, '__wrapped__', None):
            super_class = super_class.__wrapped__
        # Rebind that class's name in its defining module to the plugin class
        setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
        _PLUGIN_OVERRIDES[super_class].append(cls)

    return super().__init_subclass__(**kwargs)
3884
8dbe9899 3885
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the prefix of the search URL and fetch that many results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            # No count given: return just the first result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper bound"
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
fe7866d0 3929
3930
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose pattern matches every URL and whose extraction always raises UnsupportedError"""

    # Regex that matches any string
    _VALID_URL = '.*'
    # NOTE(review): presumably keeps this extractor out of the default set - confirm where _ENABLED is read
    _ENABLED = False
    # IE_DESC = False makes description() emit the "[HIDDEN]" marker
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)
e756f45b
M
3938
3939
# Maps an overridden class to the list of plugin classes registered for it;
# entries are appended by __init_subclass__ when a subclass passes plugin_name
_PLUGIN_OVERRIDES = collections.defaultdict(list)