]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[cleanup] Fix misc bugs (#8968)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
2314b4d8 8import inspect
cc16383f 9import itertools
3d3538e4 10import json
f8271158 11import math
4094b6e3 12import netrc
d6983cb4 13import os
773f291d 14import random
6929b41a 15import re
db3ad8a6 16import subprocess
d6983cb4 17import sys
4094b6e3 18import time
8f97a15d 19import types
14f25df2 20import urllib.parse
ac668111 21import urllib.request
f8271158 22import xml.etree.ElementTree
d6983cb4 23
6929b41a 24from ..compat import functools # isort: split
227bf1a3 25from ..compat import (
26 compat_etree_fromstring,
27 compat_expanduser,
28 compat_os_name,
29 urllib_req_to_req,
30)
8817a80d 31from ..cookies import LenientSimpleCookie
f8271158 32from ..downloader.f4m import get_base_url, remove_encrypted_media
bc344cd4 33from ..downloader.hls import HlsFD
3d2623a8 34from ..networking import HEADRequest, Request
35from ..networking.exceptions import (
36 HTTPError,
37 IncompleteRead,
38 network_exceptions,
39)
8c25f81b 40from ..utils import (
8f97a15d 41 IDENTITY,
f8271158 42 JSON_LD_RE,
43 NO_DEFAULT,
44 ExtractorError,
d0d74b71 45 FormatSorter,
f8271158 46 GeoRestrictedError,
47 GeoUtils,
b7c47b74 48 LenientJSONDecoder,
db3ad8a6 49 Popen,
f8271158 50 RegexNotFoundError,
be5c1ae8 51 RetryManager,
f8271158 52 UnsupportedError,
05900629 53 age_restricted,
02dc0a36 54 base_url,
08f2a92c 55 bug_reports_message,
82d02080 56 classproperty,
d6983cb4 57 clean_html,
d0d74b71 58 deprecation_warning,
70f0f5a8 59 determine_ext,
d493f15c 60 dict_get,
42676437 61 encode_data_uri,
9b9c5355 62 error_to_compat_str,
46b18f23 63 extract_attributes,
90137ca4 64 filter_dict,
97f4aecf 65 fix_xml_ampersands,
b14f3a4c 66 float_or_none,
b868936c 67 format_field,
31bb8d3f 68 int_or_none,
34921b43 69 join_nonempty,
a4a554a7 70 js_to_json,
46b18f23 71 mimetype2ext,
ad54c913 72 netrc_from_content,
46b18f23 73 orderedSet,
d493f15c 74 parse_bitrate,
46b18f23
JH
75 parse_codecs,
76 parse_duration,
4ca2a3cf 77 parse_iso8601,
46b18f23 78 parse_m3u8_attributes,
d493f15c 79 parse_resolution,
46b18f23 80 sanitize_filename,
8f97a15d 81 sanitize_url,
ade1fa70 82 smuggle_url,
d493f15c 83 str_or_none,
ce5b9040 84 str_to_int,
f856816b 85 strip_or_none,
5d3a0e79 86 traverse_obj,
71df9b7f 87 truncate_string,
47046464 88 try_call,
ffa89477 89 try_get,
f38de77f 90 unescapeHTML,
647eab45 91 unified_strdate,
6b3a3098 92 unified_timestamp,
a107193e 93 url_basename,
bebef109 94 url_or_none,
7e68567e 95 urlhandle_detect_ext,
b868936c 96 urljoin,
6606817a 97 variadic,
a6571f10 98 xpath_element,
8d6765cf
S
99 xpath_text,
100 xpath_with_ns,
d6983cb4 101)
c342041f 102
d6983cb4 103
86e5f3ed 104class InfoExtractor:
d6983cb4
PH
105 """Information Extractor class.
106
107 Information extractors are the classes that, given a URL, extract
108 information about the video (or videos) the URL refers to. This
109 information includes the real video URL, the video title, author and
110 others. The information is stored in a dictionary which is then
5d380852 111 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
112 information possibly downloading the video to the file system, among
113 other possible outcomes.
114
cf0649f8 115 The type field determines the type of the result.
fed5d032
PH
116 By far the most common value (and the default if _type is missing) is
117 "video", which indicates a single video.
118
119 For a video, the dictionaries must include the following fields:
d6983cb4
PH
120
121 id: Video identifier.
d4736fdb 122 title: Video title, unescaped. Set to an empty string if video has
123 no title as opposed to "None" which signifies that the
124 extractor failed to obtain a title
d67b0b15 125
f49d89ee 126 Additionally, it must contain either a formats entry or a url one:
d67b0b15 127
f49d89ee
PH
128 formats: A list of dictionaries for each format available, ordered
129 from worst to best quality.
130
131 Potential fields:
c790e93a
S
132 * url The mandatory URL representing the media:
133 for plain file media - HTTP URL of this file,
134 for RTMP - RTMP URL,
135 for HLS - URL of the M3U8 media playlist,
136 for HDS - URL of the F4M manifest,
79d2077e
S
137 for DASH
138 - HTTP URL to plain file media (in case of
139 unfragmented media)
140 - URL of the MPD manifest or base URL
141 representing the media if MPD manifest
8ed7a233 142 is parsed from a string (in case of
79d2077e 143 fragmented media)
c790e93a 144 for MSS - URL of the ISM manifest.
f34804b2 145 * request_data Data to send in POST request to the URL
86f4d14f
S
146 * manifest_url
147 The URL of the manifest file in case of
c790e93a
S
148 fragmented media:
149 for HLS - URL of the M3U8 master playlist,
150 for HDS - URL of the F4M manifest,
151 for DASH - URL of the MPD manifest,
152 for MSS - URL of the ISM manifest.
a44ca5a4 153 * manifest_stream_number (For internal use only)
154 The index of the stream in the manifest file
10952eb2 155 * ext Will be calculated from URL if missing
d67b0b15
PH
156 * format A human-readable description of the format
157 ("mp4 container with h264/opus").
158 Calculated from the format_id, width, height,
159 and format_note fields if missing.
160 * format_id A short description of the format
5d4f3985
PH
161 ("mp4_h264_opus" or "19").
162 Technically optional, but strongly recommended.
d67b0b15
PH
163 * format_note Additional info about the format
164 ("3D" or "DASH video")
165 * width Width of the video, if known
166 * height Height of the video, if known
105bfd90 167 * aspect_ratio Aspect ratio of the video, if known
168 Automatically calculated from width and height
f49d89ee 169 * resolution Textual description of width and height
105bfd90 170 Automatically calculated from width and height
176f1866 171 * dynamic_range The dynamic range of the video. One of:
172 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 173 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
174 * abr Average audio bitrate in KBit/s
175 * acodec Name of the audio codec in use
dd27fd17 176 * asr Audio sampling rate in Hertz
b8ed0f15 177 * audio_channels Number of audio channels
d67b0b15 178 * vbr Average video bitrate in KBit/s
fbb21cf5 179 * fps Frame rate
d67b0b15 180 * vcodec Name of the video codec in use
1394ce65 181 * container Name of the container format
d67b0b15 182 * filesize The number of bytes, if known in advance
9732d77e 183 * filesize_approx An estimate for the number of bytes
d67b0b15 184 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 185 * protocol The protocol that will be used for the actual
adbc4ec4
THD
186 download, lower-case. One of "http", "https" or
187 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
188 * fragment_base_url
189 Base URL for fragments. Each fragment's path
190 value (if present) will be relative to
191 this URL.
192 * fragments A list of fragments of a fragmented media.
193 Each fragment entry must contain either an url
194 or a path. If an url is present it should be
195 considered by a client. Otherwise both path and
196 fragment_base_url must be present. Here is
197 the list of all potential fields:
198 * "url" - fragment's URL
199 * "path" - fragment's path relative to
200 fragment_base_url
a0d5077c
S
201 * "duration" (optional, int or float)
202 * "filesize" (optional, int)
adbc4ec4
THD
203 * is_from_start Is a live format that can be downloaded
204 from the start. Boolean
f49d89ee 205 * preference Order number of this format. If this field is
08d13955 206 present and not None, the formats get sorted
38d63d84 207 by this field, regardless of all other values.
f49d89ee
PH
208 -1 for default (order by other properties),
209 -2 or smaller for less than default.
e65566a9
PH
210 < -1000 to hide the format (if there is
211 another one which is strictly better)
32f90364
PH
212 * language Language code, e.g. "de" or "en-US".
213 * language_preference Is this in the language mentioned in
214 the URL?
aff2f4f4
PH
215 10 if it's what the URL is about,
216 -1 for default (don't know),
217 -10 otherwise, other values reserved for now.
5d73273f
PH
218 * quality Order number of the video quality of this
219 format, irrespective of the file format.
220 -1 for default (order by other properties),
221 -2 or smaller for less than default.
c64ed2a3
PH
222 * source_preference Order number for this video source
223 (quality takes higher priority)
224 -1 for default (order by other properties),
225 -2 or smaller for less than default.
d769be6c
PH
226 * http_headers A dictionary of additional HTTP headers
227 to add to the request.
6271f1ca 228 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
229 video's pixels are not square.
230 width : height ratio as float.
231 * no_resume The server does not support resuming the
232 (HTTP or RTMP) download. Boolean.
bc344cd4 233 * has_drm True if the format has DRM and cannot be downloaded.
234 'maybe' if the format may have DRM and has to be tested before download.
7e68567e 235 * extra_param_to_segment_url A query string to append to each
236 fragment's URL, or to update each existing query string
237 with. Only applied by the native HLS/DASH downloaders.
238 * hls_aes A dictionary of HLS AES-128 decryption information
239 used by the native HLS downloader to override the
240 values in the media playlist when an '#EXT-X-KEY' tag
241 is present in the playlist:
242 * uri The URI from which the key will be downloaded
243 * key The key (as hex) used to decrypt fragments.
244 If `key` is given, any key URI will be ignored
245 * iv The IV (as hex) used to decrypt fragments
0a5a191a 246 * downloader_options A dictionary of downloader options
247 (For internal use only)
248 * http_chunk_size Chunk size for HTTP downloads
249 * ffmpeg_args Extra arguments for ffmpeg downloader
4ce57d3b
A
250 * is_dash_periods Whether the format is a result of merging
251 multiple DASH periods.
3b1fe47d 252 RTMP formats can also have the additional fields: page_url,
253 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
254 rtmp_protocol, rtmp_real_time
3dee7826 255
c0ba0f48 256 url: Final video URL.
d6983cb4 257 ext: Video filename extension.
d67b0b15
PH
258 format: The video format, defaults to ext (used for --get-format)
259 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 260
d6983cb4
PH
261 The following fields are optional:
262
08d30158 263 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 264 alt_title: A secondary title of the video.
f4f9f6d0 265 display_id: An alternative identifier for the video, not necessarily
0afef30b
PH
266 unique, but available before title. Typically, id is
267 something like "4234987", title "Dancing naked mole rats",
268 and display_id "dancing-naked-mole-rats"
d5519808 269 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 270 * "id" (optional, string) - Thumbnail format ID
d5519808 271 * "url"
cfb56d1a 272 * "preference" (optional, int) - quality of the image
d5519808
PH
273 * "width" (optional, int)
274 * "height" (optional, int)
5e1c39ac 275 * "resolution" (optional, string "{width}x{height}",
d5519808 276 deprecated)
2de624fd 277 * "filesize" (optional, int)
297e9952 278 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 279 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 280 description: Full video description.
d6983cb4 281 uploader: Full name of the video uploader.
2bc0c46f 282 license: License name the video is licensed under.
104a7b5a 283 creators: List of creators of the video.
10db0d2f 284 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 285 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 286 If not explicitly set, calculated from timestamp
287 release_timestamp: UNIX timestamp of the moment the video was released.
288 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 289 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 290 If not explicitly set, calculated from release_timestamp
1732eccc 291 release_year: Year (YYYY) as integer when the video or album was released.
292 To be used if no exact release date is known.
293 If not explicitly set, calculated from release_date.
f0d785d3 294 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 295 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 296 If not explicitly set, calculated from modified_timestamp
d6983cb4 297 uploader_id: Nickname or id of the video uploader.
7bcd2830 298 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 299 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 300 Note that channel fields may or may not repeat uploader
6f1f59f3
S
301 fields. This depends on a particular extractor.
302 channel_id: Id of the channel.
303 channel_url: Full URL to a channel webpage.
6c73052c 304 channel_follower_count: Number of followers of the channel.
8213ce28 305 channel_is_verified: Whether the channel is verified on the platform.
da9ec3b9 306 location: Physical location where the video was filmed.
a504ced0 307 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
308 {tag: subformats}. "tag" is usually a language code, and
309 "subformats" is a list sorted from lower to higher
310 preference, each element is a dictionary with the "ext"
311 entry and one of:
a504ced0 312 * "data": The subtitles file contents
10952eb2 313 * "url": A URL pointing to the subtitles file
2412044c 314 It can optionally also have:
315 * "name": Name or description of the subtitles
08d30158 316 * "http_headers": A dictionary of additional HTTP headers
297e9952 317 to add to the request.
4bba3716 318 "ext" will be calculated from URL if missing
e167860c 319 automatic_captions: Like 'subtitles'; contains automatically generated
320 captions instead of normal subtitles
62d231c0 321 duration: Length of the video in seconds, as an integer or float.
f3d29461 322 view_count: How many users have watched the video on the platform.
867c66ff 323 concurrent_view_count: How many users are currently watching the video on the platform.
19e3dfc9
PH
324 like_count: Number of positive ratings of the video
325 dislike_count: Number of negative ratings of the video
02835c6b 326 repost_count: Number of reposts of the video
2d30521a 327 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 328 comment_count: Number of comments on the video
dd622d7c
PH
329 comments: A list of comments, each with one or more of the following
330 properties (all but one of text or html optional):
331 * "author" - human-readable name of the comment author
332 * "author_id" - user ID of the comment author
a1c5d2ca 333 * "author_thumbnail" - The thumbnail of the comment author
c35448b7 334 * "author_url" - The url to the comment author's page
335 * "author_is_verified" - Whether the author is verified
336 on the platform
337 * "author_is_uploader" - Whether the comment is made by
338 the video uploader
dd622d7c
PH
339 * "id" - Comment ID
340 * "html" - Comment as HTML
341 * "text" - Plain text of the comment
342 * "timestamp" - UNIX timestamp of comment
343 * "parent" - ID of the comment this one is replying to.
344 Set to "root" to indicate that this is a
345 comment to the original video.
a1c5d2ca
M
346 * "like_count" - Number of positive ratings of the comment
347 * "dislike_count" - Number of negative ratings of the comment
348 * "is_favorited" - Whether the comment is marked as
349 favorite by the video uploader
c35448b7 350 * "is_pinned" - Whether the comment is pinned to
351 the top of the comments
8dbe9899 352 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 353 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
354 should allow to get the same result again. (It will be set
355 by YoutubeDL if it's missing)
ad3bc6ac
PH
356 categories: A list of categories that the video falls in, for example
357 ["Sports", "Berlin"]
864f24bd 358 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 359 cast: A list of the video cast
7267bd53
PH
360 is_live: True, False, or None (=unknown). Whether this video is a
361 live stream that goes on instead of a fixed-length video.
f76ede8e 362 was_live: True, False, or None (=unknown). Whether this video was
363 originally a live stream.
0647d925 364 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
e325a21a 365 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 366 If absent, automatically set from is_live, was_live
7c80519c 367 start_time: Time in seconds where the reproduction should start, as
10952eb2 368 specified in the URL.
297a564b 369 end_time: Time in seconds where the reproduction should end, as
10952eb2 370 specified in the URL.
55949fed 371 chapters: A list of dictionaries, with the following entries:
372 * "start_time" - The start time of the chapter in seconds
373 * "end_time" - The end time of the chapter in seconds
374 * "title" (optional, string)
5caf30db
A
375 heatmap: A list of dictionaries, with the following entries:
376 * "start_time" - The start time of the data point in seconds
377 * "end_time" - The end time of the data point in seconds
378 * "value" - The normalized value of the data point (float between 0 and 1)
6cfda058 379 playable_in_embed: Whether this video is allowed to play in embedded
380 players on other sites. Can be True (=always allowed),
381 False (=never allowed), None (=unknown), or a string
62b58c09 382 specifying the criteria for embedability; e.g. 'whitelist'
c224251a
M
383 availability: Under what condition the video is available. One of
384 'private', 'premium_only', 'subscriber_only', 'needs_auth',
385 'unlisted' or 'public'. Use 'InfoExtractor._availability'
386 to set it
e370f9ec 387 media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
1e8fe57e 388 _old_archive_ids: A list of old archive ids needed for backward compatibility
784320c9 389 _format_sort_fields: A list of fields to use for sorting formats
277d6ff5 390 __post_extractor: A function to be called just before the metadata is
391 written to either disk, logger or console. The function
392 must return a dict which will be added to the info_dict.
393 This is useful for additional information that is
394 time-consuming to extract. Note that the fields thus
395 extracted will not be available to output template and
396 match_filter. So, only "comments" and "comment_count" are
397 currently allowed to be extracted via this method.
d6983cb4 398
7109903e
S
399 The following fields should only be used when the video belongs to some logical
400 chapter or section:
401
402 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
403 chapter_number: Number of the chapter the video belongs to, as an integer.
404 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
405
406 The following fields should only be used when the video is an episode of some
8d76bdf1 407 series, programme or podcast:
7109903e
S
408
409 series: Title of the series or programme the video episode belongs to.
9ac24e23 410 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 411 season: Title of the season the video episode belongs to.
27bfd4e5
S
412 season_number: Number of the season the video episode belongs to, as an integer.
413 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
414 episode: Title of the video episode. Unlike mandatory video title field,
415 this field should denote the exact title of the video episode
416 without any kind of decoration.
27bfd4e5
S
417 episode_number: Number of the video episode within a season, as an integer.
418 episode_id: Id of the video episode, as a unicode string.
7109903e 419
7a93ab5f
S
420 The following fields should only be used when the media is a track or a part of
421 a music album:
422
423 track: Title of the track.
424 track_number: Number of the track within an album or a disc, as an integer.
425 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
426 as a unicode string.
104a7b5a
L
427 artists: List of artists of the track.
428 composers: List of composers of the piece.
429 genres: List of genres of the track.
7a93ab5f
S
430 album: Title of the album the track belongs to.
431 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
104a7b5a
L
432 album_artists: List of all artists appeared on the album.
433 E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
434 Useful for splits and compilations.
7a93ab5f
S
435 disc_number: Number of the disc or other physical medium the track belongs to,
436 as an integer.
7a93ab5f 437
3975b4d2 438 The following fields should only be set for clips that should be cut from the original video:
439
440 section_start: Start time of the section in seconds
441 section_end: End time of the section in seconds
442
45e8a04e 443 The following fields should only be set for storyboards:
444 rows: Number of rows in each storyboard fragment, as an integer
445 columns: Number of columns in each storyboard fragment, as an integer
446
104a7b5a
L
447 The following fields are deprecated and should not be set by new code:
448 composer: Use "composers" instead.
449 Composer(s) of the piece, comma-separated.
450 artist: Use "artists" instead.
451 Artist(s) of the track, comma-separated.
452 genre: Use "genres" instead.
453 Genre(s) of the track, comma-separated.
454 album_artist: Use "album_artists" instead.
455 All artists appeared on the album, comma-separated.
456 creator: Use "creators" instead.
457 The creator of the video.
458
deefc05b 459 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 460
d838b1bd
PH
461 Unless mentioned otherwise, None is equivalent to absence of information.
462
fed5d032
PH
463
464 _type "playlist" indicates multiple videos.
b82f815f
PH
465 There must be a key "entries", which is a list, an iterable, or a PagedList
466 object, each element of which is a valid dictionary by this specification.
fed5d032 467
962ffcf8 468 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 469 attributes with the same semantics as videos (see above).
fed5d032 470
f0d785d3 471 It can also have the following optional fields:
472
473 playlist_count: The total number of videos in a playlist. If not given,
474 YoutubeDL tries to calculate it from "entries"
475
fed5d032
PH
476
477 _type "multi_video" indicates that there are multiple videos that
478 form a single show, for example multiple acts of an opera or TV episode.
479 It must have an entries key like a playlist and contain all the keys
480 required for a video at the same time.
481
482
483 _type "url" indicates that the video must be extracted from another
484 location, possibly by a different extractor. Its only required key is:
485 "url" - the next URL to extract.
f58766ce
PH
486 The key "ie_key" can be set to the class name (minus the trailing "IE",
487 e.g. "Youtube") if the extractor class is known in advance.
488 Additionally, the dictionary may have any properties of the resolved entity
489 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
490 known ahead of time.
491
492
493 _type "url_transparent" entities have the same specification as "url", but
494 indicate that the given additional information is more precise than the one
495 associated with the resolved URL.
496 This is useful when a site employs a video service that hosts the video and
497 its technical metadata, but that video service does not embed a useful
498 title, description etc.
499
500
8f97a15d 501 Subclasses of this should also be added to the list of extractors and
5fd83674 502 should define _VALID_URL as a regexp or a Sequence of regexps, and
503 re-define the _real_extract() and (optionally) _real_initialize() methods.
d6983cb4 504
e6f21b3d 505 Subclasses may also override suitable() if necessary, but ensure the function
506 signature is preserved and that this function imports everything it needs
52efa4b3 507 (except other extractors), so that lazy_extractors works correctly.
508
8f97a15d 509 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
510 the HTML of Generic webpages. It may also override _extract_embed_urls
511 or _extract_from_webpage as necessary. While these are normally classmethods,
512 _extract_from_webpage is allowed to be an instance method.
513
514 _extract_from_webpage may raise self.StopExtraction() to stop further
515 processing of the webpage and obtain exclusive rights to it. This is useful
62b58c09
L
516 when the extractor cannot reliably be matched using just the URL,
517 e.g. invidious/peertube instances
8f97a15d 518
519 Embed-only extractors can be defined by setting _VALID_URL = False.
520
52efa4b3 521 To support username + password (or netrc) login, the extractor must define a
522 _NETRC_MACHINE and re-define _perform_login(username, password) and
523 (optionally) _initialize_pre_login() methods. The _perform_login method will
524 be called between _initialize_pre_login and _real_initialize if credentials
525 are passed by the user. In cases where it is necessary to have the login
526 process as part of the extraction rather than initialization, _perform_login
527 can be left undefined.
e6f21b3d 528
4248dad9 529 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
530 geo restriction bypass mechanisms for a particular extractor.
531 Though it won't disable explicit geo restriction bypass based on
504f20dd 532 country code provided with geo_bypass_country.
4248dad9
S
533
534 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
535 countries for this extractor. One of these countries will be used by
536 geo restriction bypass mechanism right away in order to bypass
504f20dd 537 geo restriction, of course, if the mechanism is not disabled.
773f291d 538
5f95927a
S
539 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
540 IP blocks in CIDR notation for this extractor. One of these IP blocks
541 will be used by geo restriction bypass mechanism similarly
504f20dd 542 to _GEO_COUNTRIES.
3ccdde8c 543
fe7866d0 544 The _ENABLED attribute should be set to False for IEs that
545 are disabled by default and must be explicitly enabled.
546
e6f21b3d 547 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
548 in order to warn the users and skip the tests.
549 """
550
# Set to True at the end of initialize(); guards one-time initialization
_ready = False
# presumably the YoutubeDL instance attached via set_downloader() - TODO confirm
_downloader = None
# Fake IP sent as X-Forwarded-For once geo bypass has been initialized
_x_forwarded_for_ip = None
# Set to False to disable geo restriction bypass mechanisms for an extractor
_GEO_BYPASS = True
# List of presumably geo unrestricted countries for this extractor
_GEO_COUNTRIES = None
# List of presumably geo unrestricted IP blocks in CIDR notation
_GEO_IP_BLOCKS = None
# Set to False for broken IEs in order to warn the users and skip the tests
_WORKING = True
# Set to False for IEs that are disabled by default and must be explicitly enabled
_ENABLED = True
# Must be defined by extractors supporting username + password (or netrc) login
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None
# A regexp or a Sequence of regexps; False marks an embed-only extractor
_VALID_URL = None
# List of regexes searched for in the HTML of Generic webpages
_EMBED_REGEX = []
d6983cb4 564
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a user-facing hint on how to provide authentication.

    method selects the hint: None (empty string), 'any', 'password' or
    'cookies'. When it is not given, 'any' is used if supports_login()
    is true and 'cookies' otherwise. Any other value raises KeyError.

    netrc is the machine name shown next to --netrc; self._NETRC_MACHINE
    is used when it is falsy.
    """
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'

    password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }
    return hints[method]
9d5d4d64 575
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader (a YoutubeDL instance).

    If a downloader is not passed during initialization,
    it must be set using "set_downloader()" before "extract()" is called.
    """
    # Per-instance state; initialize() flips _ready once setup has run
    self._ready = False
    self._x_forwarded_for_ip = None
    self._printed_messages = set()
    self.set_downloader(downloader)
584
@classmethod
def _match_valid_url(cls, url):
    """Match url against this class's _VALID_URL pattern(s).

    Returns the match object of the first pattern that matches, or None
    when nothing matches or when _VALID_URL is False.
    The compiled patterns are cached on the class as _VALID_URL_RE.
    """
    if cls._VALID_URL is False:
        return None
    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
    for regex in cls._VALID_URL_RE:
        mobj = regex.match(url)
        if mobj:
            return mobj
    return None
5ad28e7f 595
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # This function must import everything it needs (except other extractors),
    # so that lazy_extractors works correctly
    return cls._match_valid_url(url) is not None
d6983cb4 602
ed9266db
PH
@classmethod
def _match_id(cls, url):
    """Return the 'id' group of the _VALID_URL match for url.

    Raises AttributeError when url does not match (the match is None) and
    IndexError when the matching pattern has no 'id' group.
    """
    return cls._match_valid_url(url).group('id')
ed9266db 606
@classmethod
def get_temp_id(cls, url):
    """Return the id matched from url, or None if it cannot be determined.

    Unlike _match_id(), a non-matching URL (AttributeError) or a pattern
    without an 'id' group (IndexError) yields None instead of raising.
    """
    try:
        return cls._match_id(url)
    except (IndexError, AttributeError):
        return None
613
d6983cb4
PH
@classmethod
def working(cls):
    """Getter method for _WORKING."""
    return cls._WORKING
618
@classmethod
def supports_login(cls):
    """Return True if this extractor defines a truthy _NETRC_MACHINE."""
    return bool(cls._NETRC_MACHINE)
622
d6983cb4
PH
def initialize(self):
    """Initializes an instance (authentication, etc).

    The printed-message cache is reset and geo bypass is initialized on
    every call. The remaining steps (_initialize_pre_login, the optional
    _perform_login and _real_initialize) run only until the instance has
    been marked ready.
    """
    self._printed_messages = set()
    self._initialize_geo_bypass({
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    })
    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # A username was supplied but this extractor cannot log in with it
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True
640
5f95927a 641 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
642 """
643 Initialize geo restriction bypass mechanism.
644
645 This method is used to initialize geo bypass mechanism based on faking
646 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 647 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
648 IP will be passed as X-Forwarded-For HTTP header in all subsequent
649 HTTP requests.
e39b5d4a
S
650
651 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
652 during the instance initialization with _GEO_COUNTRIES and
653 _GEO_IP_BLOCKS.
e39b5d4a 654
5f95927a 655 You may also manually call it from extractor's code if geo bypass
e39b5d4a 656 information is not available beforehand (e.g. obtained during
5f95927a
S
657 extraction) or due to some other reason. In this case you should pass
658 this information in geo bypass context passed as first argument. It may
659 contain following fields:
660
661 countries: List of geo unrestricted countries (similar
662 to _GEO_COUNTRIES)
663 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
664 (similar to _GEO_IP_BLOCKS)
665
e39b5d4a 666 """
773f291d 667 if not self._x_forwarded_for_ip:
5f95927a
S
668
669 # Geo bypass mechanism is explicitly disabled by user
a06916d9 670 if not self.get_param('geo_bypass', True):
5f95927a
S
671 return
672
673 if not geo_bypass_context:
674 geo_bypass_context = {}
675
676 # Backward compatibility: previously _initialize_geo_bypass
677 # expected a list of countries, some 3rd party code may still use
678 # it this way
679 if isinstance(geo_bypass_context, (list, tuple)):
680 geo_bypass_context = {
681 'countries': geo_bypass_context,
682 }
683
684 # The whole point of geo bypass mechanism is to fake IP
685 # as X-Forwarded-For HTTP header based on some IP block or
686 # country code.
687
688 # Path 1: bypassing based on IP block in CIDR notation
689
690 # Explicit IP block specified by user, use it right away
691 # regardless of whether extractor is geo bypassable or not
a06916d9 692 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
693
694 # Otherwise use random IP block from geo bypass context but only
695 # if extractor is known as geo bypassable
696 if not ip_block:
697 ip_blocks = geo_bypass_context.get('ip_blocks')
698 if self._GEO_BYPASS and ip_blocks:
699 ip_block = random.choice(ip_blocks)
700
701 if ip_block:
702 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 703 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
5f95927a
S
704 return
705
706 # Path 2: bypassing based on country code
707
708 # Explicit country code specified by user, use it right away
709 # regardless of whether extractor is geo bypassable or not
a06916d9 710 country = self.get_param('geo_bypass_country', None)
5f95927a
S
711
712 # Otherwise use random country code from geo bypass context but
713 # only if extractor is known as geo bypassable
714 if not country:
715 countries = geo_bypass_context.get('countries')
716 if self._GEO_BYPASS and countries:
717 country = random.choice(countries)
718
719 if country:
720 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 721 self._downloader.write_debug(
86e5f3ed 722 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
723
def extract(self, url):
    """
    Initialize the extractor, run _real_extract on url and return its result.

    A GeoRestrictedError triggers one retry if a fake X-Forwarded-For IP
    could be set up. ExtractorErrors get video id, extractor name and
    traceback filled in; IncompleteRead, KeyError and StopIteration are
    wrapped into ExtractorError. UnsupportedError is passed through as is.
    """
    try:
        # At most two attempts; the second one only after a geo restriction
        for _attempt in range(2):
            try:
                self.initialize()
                shown_url = url if self.get_param('verbose') else truncate_string(url, 100, 20)
                self.to_screen('Extracting URL: %s' % shown_url)
                info = self._real_extract(url)
                if info is None:
                    return None
                if self._x_forwarded_for_ip:
                    info['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = info.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for lang in ('live_chat', 'comments', 'danmaku'):
                        subtitles.pop(lang, None)
                return info
            except GeoRestrictedError as e:
                if not self.__maybe_fake_ip_and_retry(e.countries):
                    raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        e.video_id = e.video_id or self.get_temp_id(url)
        e.ie = e.ie or self.IE_NAME
        e.traceback = e.traceback or sys.exc_info()[2]
        raise
    except IncompleteRead as e:
        raise ExtractorError(
            'A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError(
            'An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 757
def __maybe_fake_ip_and_retry(self, countries):
    """
    Try to set up a fake X-Forwarded-For IP from one of the given countries.

    Returns True if a fake IP was set (extraction should be retried),
    False otherwise.
    """
    may_bypass = (
        not self.get_param('geo_bypass_country', None)
        and self._GEO_BYPASS
        and self.get_param('geo_bypass', True)
        and not self._x_forwarded_for_ip
        and countries)
    if not may_bypass:
        return False

    country_code = random.choice(countries)
    self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    if not self._x_forwarded_for_ip:
        return False

    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (self._x_forwarded_for_ip, country_code.upper()))
    return True
772
def set_downloader(self, downloader):
    """Store the given YoutubeDL instance as this IE's downloader."""
    self._downloader = downloader
776
@property
def cache(self):
    """The cache object of the attached downloader."""
    downloader = self._downloader
    return downloader.cache
780
@property
def cookiejar(self):
    """The cookiejar of the attached downloader."""
    downloader = self._downloader
    return downloader.cookiejar
784
52efa4b3 785 def _initialize_pre_login(self):
962ffcf8 786 """ Initialization before login. Redefine in subclasses."""
52efa4b3 787 pass
788
789 def _perform_login(self, username, password):
790 """ Login with username and password. Redefine in subclasses."""
791 pass
792
d6983cb4
PH
793 def _real_initialize(self):
794 """Real initialization process. Redefine in subclasses."""
795 pass
796
797 def _real_extract(self, url):
798 """Real extraction process. Redefine in subclasses."""
08d30158 799 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 800
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the key used to get this InfoExtractor with get_info_extractor."""
    # The key is the class name without its last two characters
    class_name = cls.__name__
    return class_name[:-2]
56c73665 805
@classproperty
def IE_NAME(cls):
    """Name of the extractor: the class name without its last two characters."""
    class_name = cls.__name__
    return class_name[:-2]
d6983cb4 809
d391b7e2
S
@staticmethod
def __can_accept_status_code(err, expected_status):
    """
    Tell whether the status of the HTTPError err is accepted by expected_status.

    expected_status may be None (nothing is accepted), a callable taking
    the status code (accepted only if it returns exactly True), or one or
    several status codes.
    """
    assert isinstance(err, HTTPError)
    if expected_status is None:
        return False
    if callable(expected_status):
        return expected_status(err.status) is True
    return err.status in variadic(expected_status)
d391b7e2 819
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """
    Return a Request for url_or_request updated with data, headers and query.

    Accepts a URL, a Request (updated and returned as is) or a
    urllib.request.Request (deprecated; converted with a warning).
    """
    if isinstance(url_or_request, urllib.request.Request):
        self._downloader.deprecation_warning(
            'Passing a urllib.request.Request to _create_request() is deprecated. '
            'Use yt_dlp.networking.common.Request instead.')
        request = urllib_req_to_req(url_or_request)
    elif isinstance(url_or_request, Request):
        request = url_or_request
    else:
        request = Request(url_or_request)

    request.update(data=data, headers=headers, query=query)
    return request
f95b9dee 831
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
    """
    Return the response handle, or False on a non-fatal failure.

    See _download_webpage_handle docstring for arguments specification.
    """
    downloader = self._downloader
    if downloader._first_webpage_request:
        # The very first request is never delayed
        downloader._first_webpage_request = False
    else:
        delay = self.get_param('sleep_interval_requests') or 0
        if delay > 0:
            self.to_screen('Sleeping %s seconds ...' % delay)
            time.sleep(delay)

    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(str(note) if video_id is None else f'{video_id}: {note}')

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        # Work on a copy so the caller's dict is left untouched
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    try:
        return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
    except network_exceptions as err:
        if isinstance(err, HTTPError) and self.__can_accept_status_code(err, expected_status):
            return err.response

        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = f'{errnote}: {error_to_compat_str(err)}'
        if not fatal:
            self.report_warning(errmsg)
            return False
        raise ExtractorError(errmsg, cause=err)
d6983cb4 881
1890fc63 882 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
883 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2
S
884 """
885 Return a tuple (page content as string, URL handle).
886
617f658b 887 Arguments:
888 url_or_request -- plain text URL as a string or
ac668111 889 a urllib.request.Request object
617f658b 890 video_id -- Video/playlist/item identifier (string)
891
892 Keyword arguments:
893 note -- note printed before downloading (string)
894 errnote -- note printed in case of an error (string)
895 fatal -- flag denoting whether error should be considered fatal,
896 i.e. whether it should cause ExtractionError to be raised,
897 otherwise a warning will be reported and extraction continued
898 encoding -- encoding for a page content decoding, guessed automatically
899 when not explicitly specified
900 data -- POST data (bytes)
901 headers -- HTTP headers (dict)
902 query -- URL query (dict)
903 expected_status -- allows to accept failed HTTP requests (non 2xx
904 status code) by explicitly specifying a set of accepted status
905 codes. Can be any of the following entities:
906 - an integer type specifying an exact failed status code to
907 accept
908 - a list or a tuple of integer types specifying a list of
909 failed status codes to accept
910 - a callable accepting an actual failed status code and
911 returning True if it should be accepted
912 Note that this argument does not affect success status codes (2xx)
913 which are always accepted.
d391b7e2 914 """
617f658b 915
b9d3e163 916 # Strip hashes from the URL (#1038)
14f25df2 917 if isinstance(url_or_request, str):
b9d3e163
PH
918 url_or_request = url_or_request.partition('#')[0]
919
d391b7e2 920 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
921 if urlh is False:
922 assert not fatal
923 return False
c9a77969 924 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
925 return (content, urlh)
926
c9a77969
YCH
927 @staticmethod
928 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
929 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
930 if m:
931 encoding = m.group(1)
932 else:
0d75ae2c 933 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
934 webpage_bytes[:1024])
935 if m:
936 encoding = m.group(1).decode('ascii')
b60016e8
PH
937 elif webpage_bytes.startswith(b'\xff\xfe'):
938 encoding = 'utf-16'
f143d86a
PH
939 else:
940 encoding = 'utf-8'
c9a77969
YCH
941
942 return encoding
943
4457823d
S
def __check_blocked(self, content):
    """
    Raise an expected ExtractorError if content is a known "blocked" page.

    Recognizes Websense filtering pages, Indian censorship pages and
    Russian (TTK / blocklist.rkn.gov.ru) block pages. Returns None otherwise.
    """
    head = content[:512]

    is_websense = '<title>Access to this site is blocked</title>' in content and 'Websense' in head
    if is_websense:
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        info_url = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if info_url:
            msg += ' Visit %s for more details' % info_url
        raise ExtractorError(msg, expected=True)

    if '<title>The URL you requested has been blocked</title>' in head:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        details = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if details:
            msg += ' (Message: "%s")' % details.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)

    is_rkn = ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
              and 'blocklist.rkn.gov.ru' in content)
    if is_rkn:
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
971
def _request_dump_filename(self, url, video_id):
    """
    Return the name of the dump file used for the given request.

    The name is built from video_id and url, shortened to the
    'trim_file_name' param (240 if unset) with an MD5 suffix when too
    long, and sanitized.
    """
    stem = f'{video_id}_{url}'
    max_length = self.get_param('trim_file_name') or 240
    if len(stem) > max_length:
        suffix = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:max_length - len(suffix)] + suffix
    dump_name = sanitize_filename(f'{stem}.dump', restricted=True)

    if compat_os_name != 'nt':
        return dump_name

    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    full_path = os.path.abspath(dump_name)
    if len(full_path) > 259:
        return fR'\\?\{full_path}'
    return dump_name
986
987 def __decode_webpage(self, webpage_bytes, encoding, headers):
988 if not encoding:
989 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
990 try:
991 return webpage_bytes.decode(encoding, 'replace')
992 except LookupError:
993 return webpage_bytes.decode('utf-8', 'replace')
994
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the response body from urlh and return it decoded as a string.

    prefix (bytes), if given, is prepended to the data read. Depending on
    the 'dump_intermediate_pages' and 'write_pages' params the raw data is
    also printed base64-encoded and/or written to a dump file. Known
    "blocked" pages raise ExtractorError.
    """
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.url)
        # The dump itself goes out via the downloader directly
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))

    if self.get_param('write_pages'):
        dump_path = self._request_dump_filename(urlh.url, video_id)
        self.to_screen(f'Saving request to {dump_path}')
        with open(dump_path, 'wb') as outf:
            outf.write(raw)

    content = self.__decode_webpage(raw, encoding, urlh.headers)
    self.__check_blocked(content)

    return content
d6983cb4 1013
def __print_error(self, errnote, fatal, video_id, err):
    """Raise ExtractorError if fatal, otherwise warn (only when errnote is truthy)."""
    if fatal:
        raise ExtractorError(f'{video_id}: {errnote}', cause=err)
    if errnote:
        self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 1019
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """
    Parse xml_string and return the resulting element.

    transform_source, if given, is applied to the string first. On a parse
    error an ExtractorError is raised when fatal, otherwise a warning is
    reported (unless errnote is falsy) and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = 'Failed to parse XML' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, ve)
267ed0c5 1027
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """
    Parse json_string with LenientJSONDecoder and return the result.

    On a ValueError an ExtractorError is raised when fatal, otherwise a
    warning is reported (unless errnote is falsy) and None is returned.
    """
    try:
        return json.loads(
            json_string, cls=LenientJSONDecoder, strict=False,
            transform_source=transform_source, **parser_kwargs)
    except ValueError as ve:
        message = 'Failed to parse JSON' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, ve)
3d3538e4 1034
6edf2808 1035 def _parse_socket_response_as_json(self, data, *args, **kwargs):
1036 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 1037
617f658b 1038 def __create_download_methods(name, parser, note, errnote, return_value):
1039
6edf2808 1040 def parse(ie, content, *args, errnote=errnote, **kwargs):
617f658b 1041 if parser is None:
1042 return content
6edf2808 1043 if errnote is False:
1044 kwargs['errnote'] = errnote
617f658b 1045 # parser is fetched by name so subclasses can override it
1046 return getattr(ie, parser)(content, *args, **kwargs)
1047
c4910024 1048 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1049 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1050 res = self._download_webpage_handle(
1051 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1052 data=data, headers=headers, query=query, expected_status=expected_status)
617f658b 1053 if res is False:
1054 return res
1055 content, urlh = res
6edf2808 1056 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
617f658b 1057
f95b9dee 1058 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
c4910024 1059 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
f95b9dee 1060 if self.get_param('load_pages'):
1061 url_or_request = self._create_request(url_or_request, data, headers, query)
81b4712b 1062 filename = self._request_dump_filename(url_or_request.url, video_id)
f95b9dee 1063 self.to_screen(f'Loading request from {filename}')
1064 try:
1065 with open(filename, 'rb') as dumpf:
1066 webpage_bytes = dumpf.read()
1067 except OSError as e:
1068 self.report_warning(f'Unable to load request from disk: {e}')
1069 else:
1070 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
6edf2808 1071 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
c4910024 1072 kwargs = {
1073 'note': note,
1074 'errnote': errnote,
1075 'transform_source': transform_source,
1076 'fatal': fatal,
1077 'encoding': encoding,
1078 'data': data,
1079 'headers': headers,
1080 'query': query,
1081 'expected_status': expected_status,
1082 }
617f658b 1083 if parser is None:
c4910024 1084 kwargs.pop('transform_source')
617f658b 1085 # The method is fetched by name so subclasses can override _download_..._handle
c4910024 1086 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
617f658b 1087 return res if res is False else res[0]
1088
1089 def impersonate(func, name, return_value):
1090 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1091 func.__doc__ = f'''
1092 @param transform_source Apply this transformation before parsing
1093 @returns {return_value}
1094
1095 See _download_webpage_handle docstring for other arguments specification
1096 '''
1097
1098 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1099 impersonate(download_content, f'_download_{name}', f'{return_value}')
1100 return download_handle, download_content
1101
# Generated download methods: each call yields (<handle variant>, <content variant>)
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml',
    'Downloading XML', 'Unable to download XML',
    'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json',
    'Downloading JSON metadata', 'Unable to download JSON metadata',
    'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json',
    'Polling socket', 'Unable to poll socket',
    'JSON object as a dict')
# Only the content variant of the plain webpage downloader is kept
__download_webpage = __create_download_methods(
    'webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1109
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 if not given)

    An IncompleteRead is retried until tries attempts were made; the last
    one is re-raised.

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries and timeout are unused by callers; should they be deprecated?
    # If so, emit a deprecation_warning here when tries != 1 or when
    # timeout is given explicitly.

    # Without this, the NO_DEFAULT sentinel would be handed to _sleep
    # as soon as a retry is needed
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
adddc50c 1141
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """
    Report a warning prefixed with '[ie_name]' and, if given, the video id.

    With only_once=True a message that was already printed is skipped.
    """
    idstr = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {idstr}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1150
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen via the downloader, prefixed with '[ie_name]'."""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.to_screen(prefixed, *args, **kwargs)
a06916d9 1154
def write_debug(self, msg, *args, **kwargs):
    """Write a debug message via the downloader, prefixed with '[ie_name]'."""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.write_debug(prefixed, *args, **kwargs)
a06916d9 1157
def get_param(self, name, default=None, *args, **kwargs):
    """Return the downloader param name, or default if unset or if there is no downloader."""
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
d6983cb4 1162
def report_drm(self, video_id, partial=NO_DEFAULT):
    """Report that the video is DRM protected (via raise_no_formats)."""
    partial_given = partial is not NO_DEFAULT
    if partial_given:
        self._downloader.deprecation_warning(
            'InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats(
        'This video is DRM protected', expected=True, video_id=video_id)
1167
d6983cb4
PH
def report_extraction(self, id_or_name):
    """Print the "Extracting information" message for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
1171
def report_download_webpage(self, video_id):
    """Print the "Downloading webpage" message for video_id."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
1175
def report_age_confirmation(self):
    """Print the message announcing an age confirmation attempt."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 1179
fc79158d
JMF
def report_login(self):
    """Print the message announcing a login attempt."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 1183
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """
    Raise an expected ExtractorError saying that login is required.

    If metadata_available is set and either 'ignore_no_formats_error' or
    'wait_for_video' is enabled, only a warning is reported instead.
    """
    downgrade_to_warning = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if downgrade_to_warning:
        self.report_warning(msg)
        return
    # Append the login hint for the given method, if there is one
    msg += format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg, expected=True)
43e7d3c9 1193
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """
    Raise GeoRestrictedError with the given message and countries.

    If metadata_available is set and either 'ignore_no_formats_error' or
    'wait_for_video' is enabled, only a warning is reported instead.
    """
    downgrade_to_warning = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if not downgrade_to_warning:
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
1202
def raise_no_formats(self, msg, expected=False, video_id=None):
    """
    Raise an ExtractorError because no formats could be found.

    msg may be a message or an ExtractorError instance (raised as is). If
    expected is set and either 'ignore_no_formats_error' or
    'wait_for_video' is enabled, only a warning is reported instead.
    """
    downgrade_to_warning = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if downgrade_to_warning:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1211
5f6a1245 1212 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
    """
    Return a result dict pointing to a URL that should be processed.

    ie may be an extractor key (string) or an object providing ie_key().
    Extra keyword arguments are copied into the result; '_type' and 'url'
    always take precedence over them.
    """
    optional_fields = (
        ('ie_key', None if ie is None else (ie if isinstance(ie, str) else ie.ie_key())),
        ('id', video_id),
        ('title', video_title),
    )
    for field, value in optional_fields:
        if value is not None:
            kwargs[field] = value

    result = dict(kwargs)
    result['_type'] = 'url_transparent' if url_transparent else 'url'
    result['url'] = url
    return result
1227
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """
    Return a playlist whose entries are url results built from matches.

    getter is applied to every match to obtain the URL; duplicates are
    dropped. video_kwargs are passed to url_result for each entry, the
    remaining keyword arguments to playlist_result.
    """
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(m, ie, **(video_kwargs or {})) for m in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
46b18f23 1234
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
    """
    Return a playlist result dict for the given entries.

    Falsy playlist_id / playlist_title are left out, as is a
    playlist_description of None. Extra keyword arguments are copied into
    the result; '_type' and 'entries' always take precedence over them.
    """
    if playlist_id:
        kwargs['id'] = playlist_id
    if playlist_title:
        kwargs['title'] = playlist_title
    if playlist_description is not None:
        kwargs['description'] = playlist_description

    result = dict(kwargs)
    result['_type'] = 'multi_video' if multi_video else 'playlist'
    result['entries'] = entries
    return result
d6983cb4 1249
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    @param pattern  a pattern (string or compiled) or an iterable of patterns
    @param string   the string to search; None counts as "no match"
    @param name     field name used in the error/warning message
    @param default  value returned when nothing matched
    @param fatal    whether a failure without default raises
    @param flags    flags passed to re.search
    @param group    group (or list/tuple of groups) to return; by default
                    the first group that is not None
    """
    # Start from "no match" so that an empty iterable of patterns is
    # handled like a failed search
    mobj = None
    if string is not None:
        patterns = [pattern] if isinstance(pattern, (str, re.Pattern)) else pattern
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)

    if default is not NO_DEFAULT:
        return default

    # The styled name is only needed for the failure messages below
    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None
1284
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """
    Searches string for the JSON object specified by start_pattern.

    Returns the parsed object; on failure returns default (which also
    makes the search non-fatal) or {} when no default is given and
    fatal is false. Extra keyword arguments are passed to _parse_json.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    json_string = self._search_regex(
        rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
b7c47b74 1311
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html
    (each element separately if the result is a tuple).
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not isinstance(found, tuple):
        return clean_html(found)
    return tuple(map(clean_html, found))
d6983cb4 1320
2118fdd1 1321 def _get_netrc_login_info(self, netrc_machine=None):
2118fdd1
RA
1322 netrc_machine = netrc_machine or self._NETRC_MACHINE
1323
d7cd97e8 1324 cmd = self.get_param('netrc_cmd')
db3ad8a6 1325 if cmd:
d7cd97e8 1326 cmd = cmd.replace('{}', netrc_machine)
db3ad8a6
ND
1327 self.to_screen(f'Executing command: {cmd}')
1328 stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
1329 if ret != 0:
1330 raise OSError(f'Command returned error code {ret}')
1331 info = netrc_from_content(stdout).authenticators(netrc_machine)
2118fdd1 1332
db3ad8a6
ND
1333 elif self.get_param('usenetrc', False):
1334 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1335 if os.path.isdir(netrc_file):
1336 netrc_file = os.path.join(netrc_file, '.netrc')
1337 info = netrc.netrc(netrc_file).authenticators(netrc_machine)
1338
1339 else:
1340 return None, None
1341 if not info:
93240fc1 1342 self.to_screen(f'No authenticators for {netrc_machine}')
1343 return None, None
1344
1345 self.write_debug(f'Using netrc for {netrc_machine} authentication')
db3ad8a6 1346 return info[0], info[2]
2118fdd1 1347
1b6712ab 1348 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1349 """
cf0649f8 1350 Get the login info as (username, password)
32443dd3
S
1351 First look for the manually specified credentials using username_option
1352 and password_option as keys in params dictionary. If no such credentials
db3ad8a6
ND
1353 are available try the netrc_cmd if it is defined or look in the
1354 netrc file using the netrc_machine or _NETRC_MACHINE value.
fc79158d
JMF
1355 If there's no info available, return (None, None)
1356 """
fc79158d 1357
a06916d9 1358 username = self.get_param(username_option)
1359 if username is not None:
1360 password = self.get_param(password_option)
2118fdd1 1361 else:
db3ad8a6
ND
1362 try:
1363 username, password = self._get_netrc_login_info(netrc_machine)
1364 except (OSError, netrc.NetrcParseError) as err:
1365 self.report_warning(f'Failed to parse .netrc: {err}')
1366 return None, None
2133565c 1367 return username, password
fc79158d 1368
e64b7569 1369 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1370 """
1371 Get the two-factor authentication info
1372 TODO - asking the user will be required for sms/phone verify
1373 currently just uses the command line option
1374 If there's no info available, return None
1375 """
83317f69 1376
a06916d9 1377 tfa = self.get_param('twofactor')
1378 if tfa is not None:
1379 return tfa
83317f69 1380
ac668111 1381 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1382
46720279
JMF
1383 # Helper functions for extracting OpenGraph info
1384 @staticmethod
ab2d5247 1385 def _og_regexes(prop):
45b2ee6f 1386 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
fbfde1c3
F
1387 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1388 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1389 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1390 return [
78fb87b2
JMF
1391 template % (property_re, content_re),
1392 template % (content_re, property_re),
ab2d5247 1393 ]
46720279 1394
864f24bd
S
1395 @staticmethod
1396 def _meta_regex(prop):
1397 return r'''(?isx)<meta
8b9848ac 1398 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1399 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1400
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for an OpenGraph property and return its HTML-unescaped content

    prop may be a single property or several alternatives; the regexes of all
    of them are handed to _search_regex together. name defaults to
    'OpenGraph <first property>'. None is returned when _search_regex
    returns None.
    """
    props = variadic(prop)
    if name is None:
        name = f'OpenGraph {props[0]}'
    patterns = [regex for p in props for regex in self._og_regexes(p)]
    found = self._search_regex(patterns, html, name, flags=re.DOTALL, **kargs)
    return None if found is None else unescapeHTML(found)
46720279
JMF
1412
1413 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1414 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1415
1416 def _og_search_description(self, html, **kargs):
1417 return self._og_search_property('description', html, fatal=False, **kargs)
1418
04f3fd2c 1419 def _og_search_title(self, html, *, fatal=False, **kargs):
1420 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1421
8ffa13e0 1422 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1423 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1424 if secure:
1425 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1426 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1427
78338f71
JMF
1428 def _og_search_url(self, html, **kargs):
1429 return self._og_search_property('url', html, **kargs)
1430
04f3fd2c 1431 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1432 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1433
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Search html for the content of a <meta> tag

    name may be a single identifier or several alternatives; the first one
    doubles as display_name when that is not given.
    """
    names = variadic(name)
    patterns = [self._meta_regex(n) for n in names]
    label = names[0] if display_name is None else display_name
    return self._html_search_regex(
        patterns, html, label, fatal=fatal, group='content', **kwargs)
59040888
PH
1441
1442 def _dc_search_uploader(self, html):
1443 return self._html_search_meta('dc.creator', html, 'uploader')
1444
8f97a15d 1445 @staticmethod
1446 def _rta_search(html):
8dbe9899
PH
1447 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1448 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1449 r' content="RTA-5042-1996-1400-1577-RTA"',
1450 html):
1451 return 18
8f97a15d 1452
1453 # And then there are the jokers who advertise that they use RTA, but actually don't.
1454 AGE_LIMIT_MARKERS = [
1455 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
32a84bcf
SS
1456 r'>[^<]*you acknowledge you are at least (\d+) years old',
1457 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
8f97a15d 1458 ]
32a84bcf
SS
1459
1460 age_limit = 0
1461 for marker in AGE_LIMIT_MARKERS:
1462 mobj = re.search(marker, html)
1463 if mobj:
1464 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1465 return age_limit
8dbe9899 1466
59040888
PH
1467 def _media_rating_search(self, html):
1468 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1469 rating = self._html_search_meta('rating', html)
1470
1471 if not rating:
1472 return None
1473
1474 RATING_TABLE = {
1475 'safe for kids': 0,
1476 'general': 8,
1477 '14 years': 14,
1478 'mature': 17,
1479 'restricted': 19,
1480 }
d800609c 1481 return RATING_TABLE.get(rating.lower())
59040888 1482
69319969 1483 def _family_friendly_search(self, html):
6ca7732d 1484 # See http://schema.org/VideoObject
ac8491fc
S
1485 family_friendly = self._html_search_meta(
1486 'isFamilyFriendly', html, default=None)
69319969
NJ
1487
1488 if not family_friendly:
1489 return None
1490
1491 RATING_TABLE = {
1492 '1': 0,
1493 'true': 0,
1494 '0': 18,
1495 'false': 18,
1496 }
d800609c 1497 return RATING_TABLE.get(family_friendly.lower())
69319969 1498
0c708f11
JMF
1499 def _twitter_search_player(self, html):
1500 return self._html_search_meta('twitter:player', html,
9e1a5b84 1501 'twitter card player')
0c708f11 1502
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html"""
    # Passing any default makes parse errors non-fatal; the value itself is not used here
    if default is not NO_DEFAULT:
        fatal = False
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=fatal)
        # A script may hold one object or a list of them; only dicts are yielded
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1512
1513 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1514 """Search for a video in any json ld in the html"""
1515 if default is not NO_DEFAULT:
1516 fatal = False
1517 info = self._json_ld(
1518 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1519 video_id, fatal=fatal, expected_type=expected_type)
1520 if info:
1521 return info
4433bb02
S
1522 if default is not NO_DEFAULT:
1523 return default
1524 elif fatal:
1525 raise RegexNotFoundError('Unable to extract JSON-LD')
1526 else:
6a39ee13 1527 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
4433bb02 1528 return {}
4ca2a3cf 1529
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build an info dict from JSON-LD data

    json_ld may be a JSON string (parsed with _parse_json, to which video_id
    and fatal are passed) or already parsed data: one object or a list of them.
    Top-level objects without '@context' are ignored and objects consisting
    only of '@context' and '@graph' are expanded. When expected_type is given,
    only objects of that @type are used and each list stops being traversed
    after the first of them.

    Returns the extracted fields passed through filter_dict, or {} when
    json_ld is empty.
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}

    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # @type may hold a single type or a list of types
        actual_types = variadic(traverse_obj(e, '@type'))
        return any(x in actual_types for x in expected_types)

    def extract_interaction_type(e):
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            # The type may be given as a URL; only its last path component is looked up
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # The first count found for a kind wins
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # hasPart entries that are not objects (e.g. plain URL strings) cannot be clips
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or [])
            if isinstance(part, dict) and part.get('@type') == 'Clip']
        # Fill missing boundaries from the neighbouring chapters
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        extract_interaction_statistic(e)
        # Must run after info['duration'] has been set above
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            if at_top_level and '@context' not in e:
                continue
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                if expected_type is None:
                    continue
                else:
                    break
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            # With an expected type, the first matching object ends the traversal of this list
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)
4ca2a3cf 1699
135dfa2c 1700 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1701 return self._parse_json(
1702 self._search_regex(
1703 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1704 webpage, 'next.js data', fatal=fatal, **kw),
1705 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1706
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
    rectx = re.escape(context_name)
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    patterns = (
        rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>',
        rf'{rectx}\(.*?{FUNCTION_RE}',
    )
    not_found = NO_DEFAULT if fatal else (None, None, None)
    js, arg_keys, arg_vals = self._search_regex(
        patterns, webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), default=not_found)
    if js is None:
        return {}

    # Pair each parameter name of the function with the JSON form of the
    # value it is invoked with, so that js_to_json can substitute them
    arg_values = self._parse_json(
        f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()
    args = {key: json.dumps(value) for key, value in zip(arg_keys.split(','), arg_values)}

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
    return traverse_obj(ret, traverse) or {}
66f4c04e 1723
@staticmethod
def _hidden_inputs(html):
    """
    Collect the name -> value pairs of the hidden and submit <input> tags in html

    HTML comments are removed first, so inputs inside them are ignored.
    Each input is keyed by its name attribute, falling back to id; inputs
    without such a key or without a value attribute are skipped.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        # Nothing to inspect when no attributes could be extracted from the tag
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        # An empty value is kept; only a missing value attribute is skipped
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
27713812 1739
cf61d96d
S
1740 def _form_hidden_inputs(self, form_id, html):
1741 form = self._search_regex(
73eb13df 1742 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1743 html, '%s form' % form_id, group='form')
1744 return self._hidden_inputs(form)
1745
@classproperty(cache=True)
def FormatSort(cls):
    """
    Deprecated: use yt_dlp.utils.FormatSorter instead

    Emits a deprecation warning and returns a FormatSorter subclass whose
    constructor supplies the downloader as the first argument.
    """
    deprecation_warning(
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')

    class FormatSort(FormatSorter):
        # NOTE(review): `ie` is the instance being constructed; this presumably
        # expects it to expose `_downloader` - confirm against FormatSorter
        def __init__(ie, *args, **kwargs):
            super().__init__(ie._downloader, *args, **kwargs)

    return FormatSort
eb8a4433 1756
1757 def _sort_formats(self, formats, field_preference=[]):
9f14daf2 1758 if not field_preference:
1759 self._downloader.deprecation_warning(
1760 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1761 return
1762 self._downloader.deprecation_warning(
1763 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1764 'Return _format_sort_fields in the info_dict instead')
1765 if formats:
784320c9 1766 formats[0]['__sort_fields'] = field_preference
59040888 1767
96a53167
S
1768 def _check_formats(self, formats, video_id):
1769 if formats:
1770 formats[:] = filter(
1771 lambda f: self._is_valid_url(
1772 f['url'], video_id,
1773 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1774 formats)
1775
f5bdb444
S
1776 @staticmethod
1777 def _remove_duplicate_formats(formats):
1778 format_urls = set()
1779 unique_formats = []
1780 for f in formats:
1781 if f['url'] not in format_urls:
1782 format_urls.add(f['url'])
1783 unique_formats.append(f)
1784 formats[:] = unique_formats
1785
45024183 1786 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1787 url = self._proto_relative_url(url, scheme='http:')
1788 # For now assume non HTTP(S) URLs always valid
1789 if not (url.startswith('http://') or url.startswith('https://')):
1790 return True
96a53167 1791 try:
45024183 1792 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1793 return True
8bdd16b4 1794 except ExtractorError as e:
25e911a9 1795 self.to_screen(
8bdd16b4 1796 '%s: %s URL is invalid, skipping: %s'
1797 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1798 return False
96a53167 1799
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1806
def _proto_relative_url(self, url, scheme=None):
    """
    Pass url through sanitize_url with the given scheme

    scheme must end with ":" (e.g. "https:") and defaults to http_scheme().
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    # sanitize_url is given the scheme without the trailing colon
    return sanitize_url(url, scheme=scheme[:-1])
57c7411f 1811
4094b6e3
PH
1812 def _sleep(self, timeout, video_id, msg_template=None):
1813 if msg_template is None:
f1a9d64e 1814 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1815 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1816 self.to_screen(msg)
1817 time.sleep(timeout)
1818
f983b875 1819 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1820 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1821 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
0b5546c7 1822 if self.get_param('ignore_no_formats_error'):
1823 fatal = False
1824
a076c1f9 1825 res = self._download_xml_handle(
f036a632 1826 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1827 'Unable to download f4m manifest',
1828 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1829 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1830 transform_source=transform_source,
7360c06f 1831 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 1832 if res is False:
8d29e47f 1833 return []
31bb8d3f 1834
a076c1f9 1835 manifest, urlh = res
3d2623a8 1836 manifest_url = urlh.url
a076c1f9 1837
0fdbb332 1838 return self._parse_f4m_formats(
f983b875 1839 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1840 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1841
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """
    Build the formats described by a parsed f4m manifest

    manifest is the root XML element of the manifest and manifest_url the URL
    it was obtained from. Media entries pointing to further f4m or m3u8
    manifests are extracted recursively; transform_source and fatal are passed
    on to those extractions. Returns a (possibly empty) list of format dicts.
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An empty pv-2.0 element has no text at all
    if akamai_pv is not None and ';' in (akamai_pv.text or ''):
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL used for this media entry. It is kept separate from manifest_url
        # so that every relative media URL is resolved against the manifest
        # itself rather than against the previously processed media entry
        media_manifest_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            media_manifest_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(media_manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    media_manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_manifest_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': media_manifest_url,
            'manifest_url': media_manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
1943
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """
    Return a format dict for the m3u8 URL itself ('Quality selection URL')

    Its preference is 100 below the given preference, or -100 when none
    (or 0) is given.
    """
    meta_preference = preference - 100 if preference else -100
    return {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
1955
def _report_ignoring_subs(self, name):
    """Warn, only once, that subtitle tracks found in the name manifest are being ignored"""
    message = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(message, only_once=True)
1961
a0c3b2d5
F
1962 def _extract_m3u8_formats(self, *args, **kwargs):
1963 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1964 if subs:
b5ae35ee 1965 self._report_ignoring_subs('HLS')
a0c3b2d5
F
1966 return fmts
1967
1968 def _extract_m3u8_formats_and_subtitles(
177877c5 1969 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1970 preference=None, quality=None, m3u8_id=None, note=None,
1971 errnote=None, fatal=True, live=False, data=None, headers={},
1972 query={}):
1973
0b5546c7 1974 if self.get_param('ignore_no_formats_error'):
1975 fatal = False
1976
71df9b7f 1977 if not m3u8_url:
1978 if errnote is not False:
1979 errnote = errnote or 'Failed to obtain m3u8 URL'
1980 if fatal:
1981 raise ExtractorError(errnote, video_id=video_id)
1982 self.report_warning(f'{errnote}{bug_reports_message()}')
1983 return [], {}
1984
dbd82a1d 1985 res = self._download_webpage_handle(
81515ad9 1986 m3u8_url, video_id,
37a3bb66 1987 note='Downloading m3u8 information' if note is None else note,
1988 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1989 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1990
dbd82a1d 1991 if res is False:
a0c3b2d5 1992 return [], {}
cb252080 1993
dbd82a1d 1994 m3u8_doc, urlh = res
3d2623a8 1995 m3u8_url = urlh.url
9cdffeeb 1996
a0c3b2d5 1997 return self._parse_m3u8_formats_and_subtitles(
cb252080 1998 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1999 preference=preference, quality=quality, m3u8_id=m3u8_id,
2000 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2001 headers=headers, query=query, video_id=video_id)
cb252080 2002
a0c3b2d5 2003 def _parse_m3u8_formats_and_subtitles(
42676437 2004 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2005 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2006 errnote=None, fatal=True, data=None, headers={}, query={},
2007 video_id=None):
60755938 2008 formats, subtitles = [], {}
bc344cd4 2009 has_drm = HlsFD._has_drm(m3u8_doc)
a0c3b2d5 2010
60755938 2011 def format_url(url):
14f25df2 2012 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
60755938 2013
2014 if self.get_param('hls_split_discontinuity', False):
2015 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2016 if not m3u8_doc:
2017 if not manifest_url:
2018 return []
2019 m3u8_doc = self._download_webpage(
2020 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2021 note=False, errnote='Failed to download m3u8 playlist information')
2022 if m3u8_doc is False:
2023 return []
2024 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 2025
60755938 2026 else:
2027 def _extract_m3u8_playlist_indices(*args, **kwargs):
2028 return [None]
310c2ed2 2029
cb252080
S
2030 # References:
2031 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2032 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2033 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2034
2035 # We should try extracting formats only from master playlists [1, 4.3.4],
2036 # i.e. playlists that describe available qualities. On the other hand
2037 # media playlists [1, 4.3.3] should be returned as is since they contain
2038 # just the media without qualities renditions.
9cdffeeb 2039 # Fortunately, master playlist can be easily distinguished from media
cb252080 2040 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
a0566bbf 2041 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2042 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2043 # media playlist and MUST NOT appear in master playlist thus we can
2044 # clearly detect media playlist with this criterion.
2045
9cdffeeb 2046 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2047 formats = [{
34921b43 2048 'format_id': join_nonempty(m3u8_id, idx),
60755938 2049 'format_index': idx,
42676437 2050 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
60755938 2051 'ext': ext,
2052 'protocol': entry_protocol,
2053 'preference': preference,
2054 'quality': quality,
88acdbc2 2055 'has_drm': has_drm,
60755938 2056 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2057
a0c3b2d5 2058 return formats, subtitles
cb252080
S
2059
2060 groups = {}
2061 last_stream_inf = {}
2062
2063 def extract_media(x_media_line):
2064 media = parse_m3u8_attributes(x_media_line)
2065 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2066 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2067 if not (media_type and group_id and name):
2068 return
2069 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2070 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2071 if media_type == 'SUBTITLES':
3907333c 2072 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2073 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2074 # However, lack of URI has been spotted in the wild.
2075 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2076 if not media.get('URI'):
2077 return
a0c3b2d5
F
2078 url = format_url(media['URI'])
2079 sub_info = {
2080 'url': url,
2081 'ext': determine_ext(url),
2082 }
4a2f19ab
F
2083 if sub_info['ext'] == 'm3u8':
2084 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2085 # files may contain is WebVTT:
2086 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2087 sub_info['ext'] = 'vtt'
2088 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2089 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2090 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2091 if media_type not in ('VIDEO', 'AUDIO'):
2092 return
2093 media_url = media.get('URI')
2094 if media_url:
310c2ed2 2095 manifest_url = format_url(media_url)
60755938 2096 formats.extend({
34921b43 2097 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2098 'format_note': name,
2099 'format_index': idx,
2100 'url': manifest_url,
2101 'manifest_url': m3u8_url,
2102 'language': media.get('LANGUAGE'),
2103 'ext': ext,
2104 'protocol': entry_protocol,
2105 'preference': preference,
2106 'quality': quality,
43a3eaf9 2107 'has_drm': has_drm,
60755938 2108 'vcodec': 'none' if media_type == 'AUDIO' else None,
2109 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2110
2111 def build_stream_name():
2112 # Despite specification does not mention NAME attribute for
3019cb0c
S
2113 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2114 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2115 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2116 stream_name = last_stream_inf.get('NAME')
2117 if stream_name:
2118 return stream_name
2119 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2120 # from corresponding rendition group
2121 stream_group_id = last_stream_inf.get('VIDEO')
2122 if not stream_group_id:
2123 return
2124 stream_group = groups.get(stream_group_id)
2125 if not stream_group:
2126 return stream_group_id
2127 rendition = stream_group[0]
2128 return rendition.get('NAME') or stream_group_id
2129
379306ef 2130 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2131 # chance to detect video only formats when EXT-X-STREAM-INF tags
2132 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2133 for line in m3u8_doc.splitlines():
2134 if line.startswith('#EXT-X-MEDIA:'):
2135 extract_media(line)
2136
704df56d
PH
2137 for line in m3u8_doc.splitlines():
2138 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2139 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2140 elif line.startswith('#') or not line.strip():
2141 continue
2142 else:
9c99bef7 2143 tbr = float_or_none(
3089bc74
S
2144 last_stream_inf.get('AVERAGE-BANDWIDTH')
2145 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2146 manifest_url = format_url(line.strip())
5ef62fc4 2147
60755938 2148 for idx in _extract_m3u8_playlist_indices(manifest_url):
2149 format_id = [m3u8_id, None, idx]
310c2ed2 2150 # Bandwidth of live streams may differ over time thus making
2151 # format_id unpredictable. So it's better to keep provided
2152 # format_id intact.
2153 if not live:
60755938 2154 stream_name = build_stream_name()
34921b43 2155 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2156 f = {
34921b43 2157 'format_id': join_nonempty(*format_id),
60755938 2158 'format_index': idx,
310c2ed2 2159 'url': manifest_url,
2160 'manifest_url': m3u8_url,
2161 'tbr': tbr,
2162 'ext': ext,
2163 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2164 'protocol': entry_protocol,
2165 'preference': preference,
2166 'quality': quality,
43a3eaf9 2167 'has_drm': has_drm,
310c2ed2 2168 }
2169 resolution = last_stream_inf.get('RESOLUTION')
2170 if resolution:
2171 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2172 if mobj:
2173 f['width'] = int(mobj.group('width'))
2174 f['height'] = int(mobj.group('height'))
2175 # Unified Streaming Platform
2176 mobj = re.search(
2177 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2178 if mobj:
2179 abr, vbr = mobj.groups()
2180 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2181 f.update({
2182 'vbr': vbr,
2183 'abr': abr,
2184 })
2185 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2186 f.update(codecs)
2187 audio_group_id = last_stream_inf.get('AUDIO')
2188 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2189 # references a rendition group MUST have a CODECS attribute.
62b58c09 2190 # However, this is not always respected. E.g. [2]
310c2ed2 2191 # contains EXT-X-STREAM-INF tag which references AUDIO
2192 # rendition group but does not have CODECS and despite
2193 # referencing an audio group it represents a complete
2194 # (with audio and video) format. So, for such cases we will
2195 # ignore references to rendition groups and treat them
2196 # as complete formats.
2197 if audio_group_id and codecs and f.get('vcodec') != 'none':
2198 audio_group = groups.get(audio_group_id)
2199 if audio_group and audio_group[0].get('URI'):
2200 # TODO: update acodec for audio only formats with
2201 # the same GROUP-ID
2202 f['acodec'] = 'none'
fc21af50 2203 if not f.get('ext'):
2204 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2205 formats.append(f)
2206
2207 # for DailyMotion
2208 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2209 if progressive_uri:
2210 http_f = f.copy()
2211 del http_f['manifest_url']
2212 http_f.update({
2213 'format_id': f['format_id'].replace('hls-', 'http-'),
2214 'protocol': 'http',
2215 'url': progressive_uri,
2216 })
2217 formats.append(http_f)
5ef62fc4 2218
cb252080 2219 last_stream_inf = {}
a0c3b2d5 2220 return formats, subtitles
704df56d 2221
3cf4b91d
C
def _extract_m3u8_vod_duration(
        self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """
    Download an HLS media playlist and return its VOD duration.

    The download is requested with fatal=False; a falsy download result is
    replaced by an empty string before it is handed to
    _parse_m3u8_vod_duration, whose return value is passed through unchanged.

    @param note     Progress message; a default is used when None
    @param errnote  Failure message; a default is used when None
    """
    if note is None:
        note = 'Downloading m3u8 VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    playlist = self._download_webpage(
        m3u8_vod_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if not playlist:
        playlist = ''

    return self._parse_m3u8_vod_duration(playlist, video_id)
2232
def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
    """
    Return the summed #EXTINF durations of a finished HLS playlist.

    Only playlists containing #EXT-X-ENDLIST are considered; otherwise
    None is returned. The total is truncated to an int, and a total of 0
    is reported as None.
    """
    if '#EXT-X-ENDLIST' not in m3u8_vod:
        return None

    tag = '#EXTINF:'
    # The duration is the text between the tag and the first comma
    durations = (
        float(entry[len(tag):].partition(',')[0])
        for entry in m3u8_vod.splitlines() if entry.startswith(tag))
    total = int(sum(durations))
    return total if total else None
2240
def _extract_mpd_vod_duration(
        self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """
    Download a DASH manifest and return its mediaPresentationDuration.

    The download is requested with fatal=False. If the result is not an
    XML element, None is returned; otherwise the root element's
    mediaPresentationDuration attribute is run through parse_duration
    and int_or_none.
    """
    if note is None:
        note = 'Downloading MPD VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    manifest = self._download_xml(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if isinstance(manifest, xml.etree.ElementTree.Element):
        return int_or_none(parse_duration(manifest.get('mediaPresentationDuration')))
    return None
2252
a107193e
S
@staticmethod
def _xpath_ns(path, namespace=None):
    """
    Qualify every step of a '/'-separated path with the given namespace.

    Empty steps and '.' are left untouched; every other step becomes
    '{namespace}step'. The path is returned as-is when namespace is falsy.
    """
    if not namespace:
        return path
    return '/'.join(
        step if step in ('', '.') else '{%s}%s' % (namespace, step)
        for step in path.split('/'))
2264
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """
    Download a SMIL document and parse its formats and subtitles.

    Returns a (formats, subtitles) tuple; ([], {}) when the download
    returned False. The 'ignore_no_formats_error' param forces a
    non-fatal download.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    download = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if download is False:
        # A False result is only expected from a non-fatal download
        assert not fatal
        return [], {}

    smil_doc, handle = download
    # Use the URL from the response handle rather than the requested one
    final_url = handle.url
    namespace = self._parse_smil_namespace(smil_doc)
    return self._parse_smil_formats_and_subtitles(
        smil_doc, final_url, video_id, f4m_params=f4m_params, namespace=namespace)
da1c94ee
F
2277
def _extract_smil_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_smil_formats_and_subtitles.

    All arguments are forwarded. If any subtitles were found they are
    dropped and reported through _report_ignoring_subs('SMIL').
    """
    formats, discarded_subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if discarded_subs:
        self._report_ignoring_subs('SMIL')
    return formats
a107193e
S
2283
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """
    Download a SMIL document and return the info dict built by _parse_smil.

    Returns an empty dict when the download returned False. The document
    is parsed against the URL reported by the response handle.
    """
    download = self._download_smil(smil_url, video_id, fatal=fatal)
    if download is False:
        return {}

    smil_doc, handle = download
    return self._parse_smil(smil_doc, handle.url, video_id, f4m_params=f4m_params)
2293
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch a SMIL document via _download_xml_handle and return its result unchanged."""
    note, errnote = 'Downloading SMIL file', 'Unable to download SMIL file'
    return self._download_xml_handle(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
a107193e
S
2298
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document.

    Formats and subtitles come from _parse_smil_formats_and_subtitles.
    Title, description and upload date are taken from the first matching
    ./head/meta elements, thumbnails from every <image> with a src.

    NB: the returned 'id' is derived from the basename of smil_url, not
    from the video_id argument (which is only used while parsing formats).
    """
    namespace = self._parse_smil_namespace(smil)

    formats, subtitles = self._parse_smil_formats_and_subtitles(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    # First non-empty value wins for each field
    info = dict.fromkeys(('title', 'description', 'upload_date'))
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        if name == 'title':
            if not info['title']:
                info['title'] = content
        elif name in ('description', 'abstract'):
            if not info['description']:
                info['description'] = content
        elif name == 'date':
            if not info['upload_date']:
                info['upload_date'] = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': info['title'] or video_id,
        'description': info['description'],
        'upload_date': info['upload_date'],
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2337
17712eeb
S
def _parse_smil_namespace(self, smil):
    """
    Extract the XML namespace from the tag of a SMIL root element.

    The tag is matched case-insensitively against '{namespace}smil';
    None is the default when it does not match.
    """
    root_tag_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(root_tag_re, smil.tag, 'namespace', default=None)
2341
def _parse_smil_formats(self, *args, **kwargs):
    """
    Formats-only variant of _parse_smil_formats_and_subtitles.

    All arguments are forwarded. If any subtitles were found they are
    dropped and reported through _report_ignoring_subs('SMIL').
    """
    formats, discarded_subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
    if discarded_subs:
        self._report_ignoring_subs('SMIL')
    return formats
2347
def _parse_smil_formats_and_subtitles(
        self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Collect formats and subtitles from a parsed SMIL document.

    @param smil                The SMIL root element
    @param smil_url            URL of the SMIL document; used as the base for
                               relative sources unless a <meta> in the head
                               provides 'base' or 'httpBase'
    @param video_id            Passed to the downloader helpers
    @param namespace           XML namespace applied to every xpath lookup
    @param f4m_params          Query parameters appended to f4m URLs; a
                               default set is used when falsy
    @param transform_rtmp_url  Optional callable taking (streamer, src) and
                               returning the (url, play_path) to use for
                               RTMP formats
    @returns                   (formats, subtitles)

    Each distinct 'src' of <video>, <audio> and <media> elements is
    handled once and dispatched by protocol/extension (RTMP, HLS, HDS,
    DASH, ISM or plain HTTP). <imagestream> elements are added as
    storyboard formats and <textstream> subtitles are merged in last.
    """
    # The first <meta> carrying a base/httpBase attribute wins
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats, subtitles = [], {}
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    # Sources already handled; shared between the media and imagestream loops
    srcs = set()
    media = itertools.chain.from_iterable(
        smil.findall(self._xpath_ns(arg, namespace))
        for arg in ['.//video', './/audio', './/media'])
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        # Fall back to a HEAD request only if neither the URL nor the
        # element itself reveals the extension
        src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
            self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            self._merge_subtitles(m3u8_subs, target=subtitles)
            # A single-format playlist carries no metadata of its own,
            # so describe it with the attributes of the SMIL element
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            # NB: once defaulted, the params are reused for later media too
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                src_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(mpd_formats)
            self._merge_subtitles(mpd_subs, target=subtitles)
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
                src_url, video_id, ism_id='mss', fatal=False)
            formats.extend(ism_formats)
            self._merge_subtitles(ism_subs, target=subtitles)
        # Validate the resolved URL that is actually emitted below; the raw
        # 'src' may be relative to 'base' or padded with whitespace
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
    self._merge_subtitles(smil_subs, target=subtitles)

    return formats, subtitles
e89a2aab 2473
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """
    Collect subtitles from the <textstream> elements of a SMIL document.

    @param smil            The SMIL root element
    @param namespace       XML namespace applied to the xpath lookup
    @param subtitles_lang  Language used when the element declares none
    @returns               Dict mapping language to a list of
                           {'url': ..., 'ext': ...} entries

    Elements without a src, or with a src that was already seen, are
    skipped. The extension is taken from the 'ext' attribute, then the
    'type' attribute, then the src itself.
    """
    seen_srcs = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
        lang = (
            textstream.get('systemLanguage')
            or textstream.get('systemLanguageName')
            or textstream.get('lang')
            or subtitles_lang)
        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
63757032 2489
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """
    Download an XSPF playlist and return its entries.

    @returns  The list produced by _parse_xspf, or [] when the download
              returned False

    The playlist is parsed against the URL reported by the response
    handle, which is also used to derive the base URL for its tracks.
    """
    res = self._download_xml_handle(
        xspf_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if res is False:
        return []

    xspf, urlh = res
    xspf_url = urlh.url

    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))
8d6765cf 2503
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """
    Turn the tracks of a parsed XSPF playlist into a list of entries.

    Every entry uses playlist_id as its 'id' (and as the title fallback).
    Each <location> of a track becomes one format; its URL is resolved
    against xspf_base_url, and locations yielding no URL are skipped.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    def track_formats(track):
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if not format_url:
                continue
            yield {
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            }

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        entries.append({
            'id': playlist_id,
            'title': xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id),
            'description': xpath_text(track, ns('./xspf:annotation'), 'description'),
            'thumbnail': xpath_text(track, ns('./xspf:image'), 'thumbnail'),
            # The helper is asked to scale the value down by 1000
            'duration': float_or_none(
                xpath_text(track, ns('./xspf:duration'), 'duration'), 1000),
            'formats': list(track_formats(track)),
        })
    return entries
2543
171e59ed
F
def _extract_mpd_formats(self, *args, **kwargs):
    """
    Formats-only variant of _extract_mpd_formats_and_subtitles.

    All arguments are forwarded. If any subtitles were found they are
    dropped and reported through _report_ignoring_subs('DASH').
    """
    formats, discarded_subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if discarded_subs:
        self._report_ignoring_subs('DASH')
    return formats
2549
4ce57d3b
A
def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
    """Extract the periods of an MPD manifest and merge them into (formats, subtitles)."""
    return self._merge_mpd_periods(self._extract_mpd_periods(*args, **kwargs))
2553
def _extract_mpd_periods(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """
    Download an MPD manifest and return its periods via _parse_mpd_periods.

    Returns [] when the download returned False or produced no document.
    The 'ignore_no_formats_error' param forces a non-fatal download.
    """
    if self.get_param('ignore_no_formats_error'):
        fatal = False

    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    download = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if download is False:
        return []
    mpd_doc, handle = download
    if mpd_doc is None:
        return []

    # We could have been redirected to a new url when we retrieved our mpd file.
    final_url = handle.url
    return self._parse_mpd_periods(mpd_doc, mpd_id, base_url(final_url), final_url)
2d2fa82d 2577
171e59ed
F
def _parse_mpd_formats(self, *args, **kwargs):
    """
    Formats-only variant of _parse_mpd_formats_and_subtitles.

    All arguments are forwarded. If any subtitles were found they are
    dropped and reported through _report_ignoring_subs('DASH').
    """
    formats, discarded_subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if discarded_subs:
        self._report_ignoring_subs('DASH')
    return formats
2583
4ce57d3b
A
def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
    """Parse the periods of an MPD document and merge them into (formats, subtitles)."""
    return self._merge_mpd_periods(self._parse_mpd_periods(*args, **kwargs))
2587
def _merge_mpd_periods(self, periods):
    """
    Combine all formats and subtitles from an MPD manifest into a single list,
    by concatenating streams with similar formats.

    Two formats are considered the same stream when all of their values
    match, ignoring 'format_id', 'fragments' and 'manifest_stream_number';
    the fragments of later ones are appended to the first one seen.
    Every format is tagged in place with 'is_dash_periods'.
    Returns a (formats, subtitles) tuple.
    """
    ignored_keys = ('format_id', 'fragments', 'manifest_stream_number')
    merged_formats, merged_subs = {}, {}
    for period in periods:
        for fmt in period['formats']:
            assert 'is_dash_periods' not in fmt, 'format already processed'
            fmt['is_dash_periods'] = True
            signature = tuple(
                value for key, value in fmt.items() if key not in ignored_keys)
            first_seen = merged_formats.setdefault(signature, fmt)
            if first_seen is not fmt and 'fragments' in fmt:
                first_seen.setdefault('fragments', []).extend(fmt['fragments'])

        if merged_subs and period['subtitles']:
            self.report_warning(bug_reports_message(
                'Found subtitles in multiple periods in the DASH manifest; '
                'if part of the subtitles are missing,'
            ), only_once=True)

        for sub_lang, sub_info in period['subtitles'].items():
            merged_subs.setdefault(sub_lang, []).extend(sub_info)

    return list(merged_formats.values()), merged_subs
2615
2616 def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2617 """
2618 Parse formats from MPD manifest.
2619 References:
2620 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2621 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2622 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2623 """
a06916d9 2624 if not self.get_param('dynamic_mpd', True):
78895bd3 2625 if mpd_doc.get('type') == 'dynamic':
171e59ed 2626 return [], {}
2d2fa82d 2627
91cb6b50 2628 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2629
2630 def _add_ns(path):
2631 return self._xpath_ns(path, namespace)
2632
675d0016 2633 def is_drm_protected(element):
2634 return element.find(_add_ns('ContentProtection')) is not None
2635
1bac3455 2636 def extract_multisegment_info(element, ms_parent_info):
2637 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2638
2639 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2640 # common attributes and elements. We will only extract relevant
2641 # for us.
2642 def extract_common(source):
2643 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2644 if segment_timeline is not None:
2645 s_e = segment_timeline.findall(_add_ns('S'))
2646 if s_e:
2647 ms_info['total_number'] = 0
2648 ms_info['s'] = []
2649 for s in s_e:
2650 r = int(s.get('r', 0))
2651 ms_info['total_number'] += 1 + r
2652 ms_info['s'].append({
2653 't': int(s.get('t', 0)),
2654 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2655 'd': int(s.attrib['d']),
2656 'r': r,
2657 })
2658 start_number = source.get('startNumber')
2659 if start_number:
2660 ms_info['start_number'] = int(start_number)
2661 timescale = source.get('timescale')
2662 if timescale:
2663 ms_info['timescale'] = int(timescale)
2664 segment_duration = source.get('duration')
2665 if segment_duration:
48504785 2666 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2667
2668 def extract_Initialization(source):
2669 initialization = source.find(_add_ns('Initialization'))
2670 if initialization is not None:
2671 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2672
f14be228 2673 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2674 if segment_list is not None:
b4c1d6e8
S
2675 extract_common(segment_list)
2676 extract_Initialization(segment_list)
f14be228 2677 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2678 if segment_urls_e:
2679 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2680 else:
f14be228 2681 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2682 if segment_template is not None:
b4c1d6e8 2683 extract_common(segment_template)
e228616c
S
2684 media = segment_template.get('media')
2685 if media:
2686 ms_info['media'] = media
1bac3455 2687 initialization = segment_template.get('initialization')
2688 if initialization:
e228616c 2689 ms_info['initialization'] = initialization
1bac3455 2690 else:
b4c1d6e8 2691 extract_Initialization(segment_template)
1bac3455 2692 return ms_info
b323e170 2693
1bac3455 2694 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
234416e4 2695 stream_numbers = collections.defaultdict(int)
4ce57d3b
A
2696 for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2697 period_entry = {
2698 'id': period.get('id', f'period-{period_idx}'),
2699 'formats': [],
2700 'subtitles': collections.defaultdict(list),
2701 }
1bac3455 2702 period_duration = parse_duration(period.get('duration')) or mpd_duration
2703 period_ms_info = extract_multisegment_info(period, {
2704 'start_number': 1,
2705 'timescale': 1,
2706 })
f14be228 2707 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2708 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2709 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2710 representation_attrib = adaptation_set.attrib.copy()
2711 representation_attrib.update(representation.attrib)
f0948348 2712 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2713 mime_type = representation_attrib['mimeType']
171e59ed
F
2714 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2715
21633673 2716 codec_str = representation_attrib.get('codecs', '')
2717 # Some kind of binary subtitle found in some youtube livestreams
2718 if mime_type == 'application/x-rawcc':
2719 codecs = {'scodec': codec_str}
2720 else:
2721 codecs = parse_codecs(codec_str)
be2fc5b2 2722 if content_type not in ('video', 'audio', 'text'):
2723 if mime_type == 'image/jpeg':
a8731fcc 2724 content_type = mime_type
21633673 2725 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2726 content_type = 'video'
21633673 2727 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2728 content_type = 'audio'
3fe75fdc 2729 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2730 content_type = 'text'
6993f78d 2731 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2732 content_type = 'text'
cdb19aa4 2733 else:
be2fc5b2 2734 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2735 continue
2736
2737 base_url = ''
2738 for element in (representation, adaptation_set, period, mpd_doc):
2739 base_url_e = element.find(_add_ns('BaseURL'))
47046464 2740 if try_call(lambda: base_url_e.text) is not None:
be2fc5b2 2741 base_url = base_url_e.text + base_url
2742 if re.match(r'^https?://', base_url):
2743 break
f9cc0161 2744 if mpd_base_url and base_url.startswith('/'):
14f25df2 2745 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
f9cc0161
D
2746 elif mpd_base_url and not re.match(r'^https?://', base_url):
2747 if not mpd_base_url.endswith('/'):
be2fc5b2 2748 mpd_base_url += '/'
2749 base_url = mpd_base_url + base_url
2750 representation_id = representation_attrib.get('id')
2751 lang = representation_attrib.get('lang')
2752 url_el = representation.find(_add_ns('BaseURL'))
2753 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2754 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2755 if representation_id is not None:
2756 format_id = representation_id
2757 else:
2758 format_id = content_type
2759 if mpd_id:
2760 format_id = mpd_id + '-' + format_id
2761 if content_type in ('video', 'audio'):
2762 f = {
2763 'format_id': format_id,
2764 'manifest_url': mpd_url,
2765 'ext': mimetype2ext(mime_type),
2766 'width': int_or_none(representation_attrib.get('width')),
2767 'height': int_or_none(representation_attrib.get('height')),
2768 'tbr': float_or_none(bandwidth, 1000),
2769 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2770 'fps': int_or_none(representation_attrib.get('frameRate')),
2771 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2772 'format_note': 'DASH %s' % content_type,
2773 'filesize': filesize,
2774 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2775 **codecs
be2fc5b2 2776 }
be2fc5b2 2777 elif content_type == 'text':
2778 f = {
2779 'ext': mimetype2ext(mime_type),
2780 'manifest_url': mpd_url,
2781 'filesize': filesize,
2782 }
2783 elif content_type == 'image/jpeg':
2784 # See test case in VikiIE
2785 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2786 f = {
2787 'format_id': format_id,
2788 'ext': 'mhtml',
2789 'manifest_url': mpd_url,
2790 'format_note': 'DASH storyboards (jpeg)',
2791 'acodec': 'none',
2792 'vcodec': 'none',
2793 }
88acdbc2 2794 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2795 f['has_drm'] = True
be2fc5b2 2796 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2797
2798 def prepare_template(template_name, identifiers):
2799 tmpl = representation_ms_info[template_name]
0cb0fdbb 2800 if representation_id is not None:
2801 tmpl = tmpl.replace('$RepresentationID$', representation_id)
be2fc5b2 2802 # First of, % characters outside $...$ templates
2803 # must be escaped by doubling for proper processing
2804 # by % operator string formatting used further (see
2805 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2806 t = ''
2807 in_template = False
2808 for c in tmpl:
2809 t += c
2810 if c == '$':
2811 in_template = not in_template
2812 elif c == '%' and not in_template:
eca1f0d1 2813 t += c
be2fc5b2 2814 # Next, $...$ templates are translated to their
2815 # %(...) counterparts to be used with % operator
be2fc5b2 2816 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2817 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2818 t.replace('$$', '$')
2819 return t
2820
2821 # @initialization is a regular template like @media one
2822 # so it should be handled just the same way (see
2823 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2824 if 'initialization' in representation_ms_info:
2825 initialization_template = prepare_template(
2826 'initialization',
2827 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2828 # $Time$ shall not be included for @initialization thus
2829 # only $Bandwidth$ remains
2830 ('Bandwidth', ))
2831 representation_ms_info['initialization_url'] = initialization_template % {
2832 'Bandwidth': bandwidth,
2833 }
2834
2835 def location_key(location):
2836 return 'url' if re.match(r'^https?://', location) else 'path'
2837
2838 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2839
2840 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2841 media_location_key = location_key(media_template)
2842
2843 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2844 # can't be used at the same time
2845 if '%(Number' in media_template and 's' not in representation_ms_info:
2846 segment_duration = None
2847 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2848 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2849 representation_ms_info['total_number'] = int(math.ceil(
2850 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2851 representation_ms_info['fragments'] = [{
2852 media_location_key: media_template % {
2853 'Number': segment_number,
2854 'Bandwidth': bandwidth,
2855 },
2856 'duration': segment_duration,
2857 } for segment_number in range(
2858 representation_ms_info['start_number'],
2859 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2860 else:
2861 # $Number*$ or $Time$ in media template with S list available
2862 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2863 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2864 representation_ms_info['fragments'] = []
2865 segment_time = 0
2866 segment_d = None
2867 segment_number = representation_ms_info['start_number']
2868
2869 def add_segment_url():
2870 segment_url = media_template % {
2871 'Time': segment_time,
2872 'Bandwidth': bandwidth,
2873 'Number': segment_number,
2874 }
2875 representation_ms_info['fragments'].append({
2876 media_location_key: segment_url,
2877 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2878 })
2879
2880 for num, s in enumerate(representation_ms_info['s']):
2881 segment_time = s.get('t') or segment_time
2882 segment_d = s['d']
2883 add_segment_url()
2884 segment_number += 1
2885 for r in range(s.get('r', 0)):
2886 segment_time += segment_d
f0948348 2887 add_segment_url()
b4c1d6e8 2888 segment_number += 1
be2fc5b2 2889 segment_time += segment_d
2890 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
62b58c09
L
2891 # No media template,
2892 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
be2fc5b2 2893 # or any YouTube dashsegments video
2894 fragments = []
2895 segment_index = 0
2896 timescale = representation_ms_info['timescale']
2897 for s in representation_ms_info['s']:
2898 duration = float_or_none(s['d'], timescale)
2899 for r in range(s.get('r', 0) + 1):
2900 segment_uri = representation_ms_info['segment_urls'][segment_index]
2901 fragments.append({
2902 location_key(segment_uri): segment_uri,
2903 'duration': duration,
2904 })
2905 segment_index += 1
2906 representation_ms_info['fragments'] = fragments
2907 elif 'segment_urls' in representation_ms_info:
2908 # Segment URLs with no SegmentTimeline
62b58c09 2909 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
be2fc5b2 2910 # https://github.com/ytdl-org/youtube-dl/pull/14844
2911 fragments = []
2912 segment_duration = float_or_none(
2913 representation_ms_info['segment_duration'],
2914 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2915 for segment_url in representation_ms_info['segment_urls']:
2916 fragment = {
2917 location_key(segment_url): segment_url,
2918 }
2919 if segment_duration:
2920 fragment['duration'] = segment_duration
2921 fragments.append(fragment)
2922 representation_ms_info['fragments'] = fragments
2923 # If there is a fragments key available then we correctly recognized fragmented media.
2924 # Otherwise we will assume unfragmented media with direct access. Technically, such
2925 # assumption is not necessarily correct since we may simply have no support for
2926 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2927 if 'fragments' in representation_ms_info:
2928 f.update({
2929 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2930 'url': mpd_url or base_url,
2931 'fragment_base_url': base_url,
2932 'fragments': [],
2933 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2934 })
2935 if 'initialization_url' in representation_ms_info:
2936 initialization_url = representation_ms_info['initialization_url']
2937 if not f.get('url'):
2938 f['url'] = initialization_url
2939 f['fragments'].append({location_key(initialization_url): initialization_url})
2940 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 2941 if not period_duration:
2942 period_duration = try_get(
2943 representation_ms_info,
2944 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 2945 else:
be2fc5b2 2946 # Assuming direct URL to unfragmented media.
2947 f['url'] = base_url
234416e4 2948 if content_type in ('video', 'audio', 'image/jpeg'):
2949 f['manifest_stream_number'] = stream_numbers[f['url']]
2950 stream_numbers[f['url']] += 1
4ce57d3b 2951 period_entry['formats'].append(f)
be2fc5b2 2952 elif content_type == 'text':
4ce57d3b
A
2953 period_entry['subtitles'][lang or 'und'].append(f)
2954 yield period_entry
17b598d3 2955
fd76a142
F
2956 def _extract_ism_formats(self, *args, **kwargs):
2957 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2958 if subs:
b5ae35ee 2959 self._report_ignoring_subs('ISM')
fd76a142
F
2960 return fmts
2961
2962 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
0b5546c7 2963 if self.get_param('ignore_no_formats_error'):
2964 fatal = False
2965
47a5cb77 2966 res = self._download_xml_handle(
b2758123 2967 ism_url, video_id,
37a3bb66 2968 note='Downloading ISM manifest' if note is None else note,
2969 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2970 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2971 if res is False:
fd76a142 2972 return [], {}
47a5cb77 2973 ism_doc, urlh = res
13b08034 2974 if ism_doc is None:
fd76a142 2975 return [], {}
b2758123 2976
3d2623a8 2977 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
b2758123 2978
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats and subtitles from an ISM (Smooth Streaming) manifest.

    @param ism_doc  Root element of the parsed manifest
    @param ism_url  URL of the manifest; the fragment URL pattern is joined to it
    @param ism_id   Optional prefix for the generated format ids
    @returns        (formats, subtitles); both are empty when IsLive is 'TRUE'

    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    # AudioTag values used to infer the codec when FourCC is missing
    KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        # The chunk list is a property of the stream, so look it up once
        # instead of once per quality level
        stream_fragments = stream.findall('c')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time of
                    # the NEXT chunk in the list. Index the chunk list, not
                    # the current chunk element (which has no children and
                    # would always fall through to the manifest duration).
                    next_fragment_time = None
                    if stream_fragment_index + 1 < len(stream_fragments):
                        next_fragment_time = int_or_none(
                            stream_fragments[stream_fragment_index + 1].get('t'))
                    if next_fragment_time is None:
                        # Last chunk, or the next chunk has no usable start time
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    'language': stream_language,
                    'audio_channels': int_or_none(track.get('Channels')),
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
b2758123 3091
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """
    Collect formats and subtitles from HTML5-style media tags in a webpage.

    Scans for <video>/<audio> tags (including their amp-* and dl8-*
    variants) together with nested <source> and <track> tags. Returns a
    list with one dict per tag ('formats', 'subtitles', 'thumbnail');
    tags that yield neither formats nor subtitles are left out.
    """
    SRC_ATTRIBUTES = ('src', 'data-video-src', 'data-src', 'data-source')

    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Split a "type" attribute into codec info plus an 'ext' key
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not ctr:
            return {}
        type_info = parse_codecs(ctr.group('codecs'))
        type_info['ext'] = mimetype2ext(ctr.group('mimetype'))
        return type_info

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats); m3u8/mpd URLs are expanded
        # through the corresponding manifest extractors
        full_url = absolute_url(src)
        ext = (type_info or {}).get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            return False, self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        if ext == 'mpd':
            return False, self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        return True, [{
            'url': full_url,
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            'ext': ext,
        }]

    def plain_source_details(s_attr):
        # width, height, res, label and title attributes are
        # all not standard but seen several times in the wild
        labels = [
            s_attr.get(lbl)
            for lbl in ('label', 'title')
            if str_or_none(s_attr.get(lbl))
        ]
        width = int_or_none(s_attr.get('width'))
        height = int_or_none(s_attr.get('height')) or int_or_none(s_attr.get('res'))
        if not (width and height):
            for resolution in map(parse_resolution, labels):
                if resolution:
                    width = width or resolution.get('width')
                    height = height or resolution.get('height')
        # First label that parses to a usable bitrate wins
        tbr = next(filter(None, map(parse_bitrate, labels)), None)
        return {
            'width': width,
            'height': height,
            'tbr': tbr,
            'format_id': s_attr.get('label') or s_attr.get('title'),
        }

    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags carry no content; pad them with '' so that both
    # kinds of match share the same 4-tuple shape
    media_tags = [
        (*self_closing, '')
        for self_closing in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags += re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)

    entries = []
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        tag_formats = media_info['formats']
        tag_subtitles = media_info['subtitles']

        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(dict_get(media_attributes, SRC_ATTRIBUTES))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            tag_formats.extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, SRC_ATTRIBUTES))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if not is_plain_url:
                    tag_formats.extend(formats)
                    continue
                f.update(plain_source_details(s_attr))
                f.update(formats[0])
                tag_formats.append(f)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if kind and kind not in ('subtitles', 'captions'):
                    continue
                src = strip_or_none(track_attributes.get('src'))
                if not src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                tag_subtitles.setdefault(lang, []).append({
                    'url': absolute_url(src),
                })

        for f in tag_formats:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if tag_formats or tag_subtitles:
            entries.append(media_info)
    return entries
3217
f6a1d69a
F
3218 def _extract_akamai_formats(self, *args, **kwargs):
3219 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3220 if subs:
b5ae35ee 3221 self._report_ignoring_subs('akamai')
f6a1d69a
F
3222 return fmts
3223
3224 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3225 signed = 'hdnea=' in manifest_url
3226 if not signed:
3227 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3228 manifest_url = re.sub(
3229 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3230 '', manifest_url).strip('?')
3231
c7c43a93 3232 formats = []
f6a1d69a 3233 subtitles = {}
70c5802b 3234
e71a4509 3235 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3236 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3237 hds_host = hosts.get('hds')
3238 if hds_host:
3239 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3240 if 'hdcore=' not in f4m_url:
3241 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3242 f4m_formats = self._extract_f4m_formats(
3243 f4m_url, video_id, f4m_id='hds', fatal=False)
3244 for entry in f4m_formats:
3245 entry.update({'extra_param_to_segment_url': hdcore_sign})
3246 formats.extend(f4m_formats)
70c5802b 3247
c4251b9a
RA
3248 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3249 hls_host = hosts.get('hls')
3250 if hls_host:
3251 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3252 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3253 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3254 m3u8_id='hls', fatal=False)
3255 formats.extend(m3u8_formats)
f6a1d69a 3256 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3257
3258 http_host = hosts.get('http')
29f7c58a 3259 if http_host and m3u8_formats and not signed:
3260 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3261 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3262 qualities_length = len(qualities)
29f7c58a 3263 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3264 i = 0
29f7c58a 3265 for f in m3u8_formats:
3266 if f['vcodec'] != 'none':
70c5802b 3267 for protocol in ('http', 'https'):
3268 http_f = f.copy()
3269 del http_f['manifest_url']
3270 http_url = re.sub(
86e5f3ed 3271 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3272 http_f.update({
3273 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3274 'url': http_url,
3275 'protocol': protocol,
3276 })
29f7c58a 3277 formats.append(http_f)
70c5802b 3278 i += 1
70c5802b 3279
f6a1d69a 3280 return formats, subtitles
c7c43a93 3281
6ad02195 3282 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3283 query = urllib.parse.urlparse(url).query
6ad02195 3284 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3285 mobj = re.search(
3286 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3287 url_base = mobj.group('url')
3288 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3289 formats = []
044eeb14
S
3290
3291 def manifest_url(manifest):
86e5f3ed 3292 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3293 if query:
3294 m_url += '?%s' % query
3295 return m_url
3296
6ad02195
RA
3297 if 'm3u8' not in skip_protocols:
3298 formats.extend(self._extract_m3u8_formats(
044eeb14 3299 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3300 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3301 if 'f4m' not in skip_protocols:
3302 formats.extend(self._extract_f4m_formats(
044eeb14 3303 manifest_url('manifest.f4m'),
6ad02195 3304 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3305 if 'dash' not in skip_protocols:
3306 formats.extend(self._extract_mpd_formats(
044eeb14 3307 manifest_url('manifest.mpd'),
0384932e 3308 video_id, mpd_id='dash', fatal=False))
6ad02195 3309 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3310 if 'smil' not in skip_protocols:
3311 rtmp_formats = self._extract_smil_formats(
044eeb14 3312 manifest_url('jwplayer.smil'),
6ad02195
RA
3313 video_id, fatal=False)
3314 for rtmp_format in rtmp_formats:
3315 rtsp_format = rtmp_format.copy()
3316 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3317 del rtsp_format['play_path']
3318 del rtsp_format['ext']
3319 rtsp_format.update({
3320 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3321 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3322 'protocol': 'rtsp',
3323 })
3324 formats.extend([rtmp_format, rtsp_format])
3325 else:
3326 for protocol in ('rtmp', 'rtsp'):
3327 if protocol not in skip_protocols:
3328 formats.append({
86e5f3ed 3329 'url': f'{protocol}:{url_base}',
6ad02195
RA
3330 'format_id': protocol,
3331 'protocol': protocol,
3332 })
3333 return formats
3334
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """
    Locate the options passed to a jwplayer(...).setup(...) call in a webpage.

    Returns the parsed options when they form a dict. Returns None when
    no such call is found, when parsing raises ExtractorError, or when
    the parsed value is not a dict.
    """
    mobj = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if not mobj:
        return None
    try:
        options = self._parse_json(
            mobj.group('options'), video_id=video_id, transform_source=transform_source)
    except ExtractorError:
        return None
    return options if isinstance(options, dict) else None
a4a554a7
YCH
3349
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """
    Find the JWPlayer setup data in a webpage and parse it into a result.

    Extra positional and keyword arguments are forwarded to
    _parse_jwplayer_data.
    """
    setup_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(setup_data, video_id, *args, **kwargs)
3355
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """
    Turn JWPlayer setup data into an info dict or a playlist result.

    Returns an empty list when jwplayer_data is not a dict, the single
    entry when exactly one playlist item was parsed, and otherwise the
    result of playlist_result() over all entries. With require_title set,
    a playlist item without 'title' raises KeyError.
    """
    if not isinstance(jwplayer_data, dict):
        return []

    def collect_subtitles(tracks):
        # Only well-formed caption/subtitle tracks with a resolvable URL are kept
        found = {}
        if not tracks or not isinstance(tracks, list):
            return found
        for track in tracks:
            if not isinstance(track, dict):
                continue
            track_kind = track.get('kind')
            if not track_kind or not isinstance(track_kind, str):
                continue
            if track_kind.lower() not in ('captions', 'subtitles'):
                continue
            track_url = urljoin(base_url, track.get('file'))
            if not track_url:
                continue
            found.setdefault(track.get('label') or 'en', []).append({
                'url': self._proto_relative_url(track_url)
            })
        return found

    playlist_items = jwplayer_data.get('playlist')
    # JWPlayer backward compatibility: single playlist item/flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if not isinstance(playlist_items, list):
        playlist_items = (playlist_items or jwplayer_data, )

    entries = []
    for video_data in playlist_items:
        if not isinstance(video_data, dict):
            continue
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
        subtitles = collect_subtitles(video_data.get('tracks'))

        raw_title = video_data['title'] if require_title else video_data.get('title')
        entry = {
            'id': this_video_id,
            'title': unescapeHTML(raw_title),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
            'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
            'genre': clean_html(video_data.get('genre')),
            'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
            'release_year': int_or_none(video_data.get('releasedate')),
            'age_limit': int_or_none(video_data.get('age_restriction')),
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube_embed = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube_embed:
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            entry['formats'] = formats
        entries.append(entry)

    return entries[0] if len(entries) == 1 else self.playlist_result(entries)
3430
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """
    Convert a JWPlayer 'sources' list into a list of formats.

    Non-dict sources, sources without a usable 'file' URL and repeated
    URLs are skipped. HLS, DASH and SMIL sources are expanded through
    the corresponding manifest extractors (non-fatally); everything else
    becomes a single direct format.
    """
    AUDIO_EXTS = ('oga', 'aac', 'mp3', 'mpeg', 'vorbis')
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in seen_urls:
            continue
        seen_urls.add(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)

        if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in AUDIO_EXTS:
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        format_id = str_or_none(source.get('label'))
        height = int_or_none(source.get('height'))
        if height is None and format_id:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            height = parse_resolution(format_id).get('height')
        a_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate'), scale=1000),
            'filesize': int_or_none(source.get('filesize')),
            'ext': ext,
            'format_id': format_id
        }
        if source_url.startswith('rtmp'):
            a_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            rtmp_url_parts = re.split(
                r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
            if len(rtmp_url_parts) == 3:
                rtmp_url, prefix, play_path = rtmp_url_parts
                a_format.update({
                    'url': rtmp_url,
                    'play_path': prefix + play_path,
                })
            if rtmp_params:
                a_format.update(rtmp_params)
        formats.append(a_format)
    return formats
3495
f4b1c7ad 3496 def _live_title(self, name):
39ca3b5c 3497 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3498 return name
f4b1c7ad 3499
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """
    Parse v with int_or_none (extra keyword arguments are forwarded).

    When the value cannot be parsed, raise ExtractorError if fatal is
    set; otherwise report a warning and return None.
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3509
def _float(self, v, name, fatal=False, **kwargs):
    """
    Parse v with float_or_none (extra keyword arguments are forwarded).

    When the value cannot be parsed, raise ExtractorError if fatal is
    set; otherwise report a warning and return None.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3519
40e41780
TF
3520 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3521 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3522 cookie = http.cookiejar.Cookie(
4ed2d7b7 3523 0, name, value, port, port is not None, domain, True,
40e41780
TF
3524 domain.startswith('.'), path, True, secure, expire_time,
3525 discard, None, None, rest)
9809740b 3526 self.cookiejar.set_cookie(cookie)
42939b61 3527
def _get_cookies(self, url):
    """ Return a LenientSimpleCookie built from the cookie header the downloader's cookiejar produces for the url """
    cookie_header = self._downloader.cookiejar.get_cookie_header(url)
    return LenientSimpleCookie(cookie_header)
799207e8 3531
e3c1266f 3532 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3533 """
3534 Apply first Set-Cookie header instead of the last. Experimental.
3535
3536 Some sites (e.g. [1-3]) may serve two cookies under the same name
3537 in Set-Cookie header and expect the first (old) one to be set rather
3538 than second (new). However, as of RFC6265 the newer one cookie
3539 should be set into cookie store what actually happens.
3540 We will workaround this issue by resetting the cookie to
3541 the first one manually.
3542 1. https://new.vk.com/
3543 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3544 3. https://learning.oreilly.com/
3545 """
e3c1266f
S
3546 for header, cookies in url_handle.headers.items():
3547 if header.lower() != 'set-cookie':
3548 continue
cfb0511d 3549 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3550 cookie_value = re.search(
3551 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3552 if cookie_value:
3553 value, domain = cookie_value.groups()
3554 self._set_cookie(domain, cookie, value)
3555 break
3556
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """
    Yield the testcases declared directly on this class, then those of
    cls.__wrapped__ when that attribute is set.

    Each yielded testcase dict gets its 'name' key set to cls.ie_key().
    Testcases flagged 'only_matching' are skipped unless
    include_onlymatching is true.
    """
    # vars() instead of getattr(): do not look in super classes
    own_attrs = vars(cls)
    single_test = own_attrs.get('_TEST')
    if single_test:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        candidates = [single_test]
    else:
        candidates = own_attrs.get('_TESTS', [])

    for testcase in candidates:
        if testcase.get('only_matching', False) and not include_onlymatching:
            continue
        testcase['name'] = cls.ie_key()
        yield testcase

    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from wrapped.get_testcases(include_onlymatching)
05900629 3573
@classmethod
def get_webpage_testcases(cls):
    """
    Yield the _WEBPAGE_TESTS declared directly on this class, then those
    of cls.__wrapped__ when that attribute is set.

    Each yielded testcase dict gets its 'name' key set to cls.ie_key().
    """
    # vars() instead of getattr(): do not look in super classes
    for testcase in vars(cls).get('_WEBPAGE_TESTS', []):
        testcase['name'] = cls.ie_key()
        yield testcase

    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from wrapped.get_webpage_testcases()
f2e8dbcc 3582
@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases (0 when none of them declares one)"""
    testcases = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # NOTE(review): presumably this path reads info_dict.age_limit both from
    # each testcase and from the first entry of its 'playlist' - confirm
    # against traverse_obj's branching semantics
    age_limit_path = (..., (('playlist', 0), None), 'info_dict', 'age_limit')
    declared_limits = traverse_obj(testcases, age_limit_path)
    return max(declared_limits or [0])
3589
@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None
    # A testcase counts as a playlist test when any of its keys starts with "playlist"
    is_playlist_test = [
        any(key.startswith('playlist') for key in test)
        for test in tests
    ]
    if not any(is_playlist_test):
        return 'video'
    if all(is_playlist_test):
        return 'playlist'
    return 'any'
3601
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    if not cls.suitable(url):
        return None
    return_type = cls._RETURN_TYPE
    # Any return type other than "video"/"playlist" maps to None (unknown)
    return {'video': True, 'playlist': False}.get(return_type)
171a31db 3607
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
05900629 3612
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """
    Description of the extractor

    Returns the extractor name followed by whichever annotations apply:
    netrc machine, IE_DESC (or a hidden marker), search prefix with an
    optional random example, and a "currently broken" marker. With
    markdown enabled the name is emphasised and written as a list item.
    """
    pieces = []
    if cls._NETRC_MACHINE:
        if markdown:
            pieces.append(f' [*{cls._NETRC_MACHINE}*](## "netrc machine")')
        else:
            pieces.append(f' [{cls._NETRC_MACHINE}]')
    if cls.IE_DESC is False:
        pieces.append(' [HIDDEN]')
    elif cls.IE_DESC:
        pieces.append(f' {cls.IE_DESC}')
    if cls.SEARCH_KEY:
        pieces.append(f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            pieces.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
    if not cls.working():
        pieces.append(' (**Currently broken**)' if markdown else ' (Currently broken)')
    desc = ''.join(pieces)

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME
    return f'{name}:{desc}' if desc else name
3637
def extract_subtitles(self, *args, **kwargs):
    """Return the result of _get_subtitles if the "writesubtitles" or
    "listsubtitles" param is set, otherwise an empty dict"""
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3643
3644 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3645 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3646
0cf643b2
M
class CommentsDisabled(Exception):
    """Raise in _get_comments if comments are disabled for the video.
    The callable returned by extract_comments catches it and reports
    both "comments" and "comment_count" as None"""
3649
def extract_comments(self, *args, **kwargs):
    """
    Prepare the (deferred) extraction of comments

    @returns None if the "getcomments" param is not set; otherwise a callable
             which exhausts the generator created by _get_comments and
             returns a dict with the keys "comments" and "comment_count"
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        completed = False
        try:
            while True:
                collected.append(next(generator))
        except StopIteration:
            # The generator ran to its end: the count is reliable
            completed = True
        except KeyboardInterrupt:
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        total = len(collected)
        self.to_screen(f'Extracted {total} comments')
        # After an interruption/error the comments gathered so far are kept,
        # but the count is reported as unknown
        return {
            'comments': collected,
            'comment_count': total if completed else None
        }
    return extractor
3678
3679 def _get_comments(self, *args, **kwargs):
3680 raise NotImplementedError('This method must be implemented by subclasses')
3681
912e0b7e
YCH
3682 @staticmethod
3683 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3684 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3685 will be dropped. """
86e5f3ed 3686 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3687 ret = list(subtitle_list1)
a44ca5a4 3688 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3689 return ret
3690
@classmethod
def _merge_subtitles(cls, *dicts, target=None):
    """ Merge subtitle dictionaries, language by language.
    If "target" is given it is updated in-place and returned. """
    merged = {} if target is None else target
    for subtitles in dicts:
        for lang, items in subtitles.items():
            merged[lang] = cls._merge_subtitle_items(merged.get(lang, []), items)
    return merged
912e0b7e 3700
def extract_automatic_captions(self, *args, **kwargs):
    """Return the result of _get_automatic_captions if the "writeautomaticsub"
    or "listsubtitles" param is set, otherwise an empty dict"""
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3706
3707 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3708 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3709
2762dbb1 3710 @functools.cached_property
24146491 3711 def _cookies_passed(self):
3712 """Whether cookies have been passed to YoutubeDL"""
3713 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3714
def mark_watched(self, *args, **kwargs):
    """Call _mark_watched if the "mark_watched" param is set and either login
    info is available (for extractors supporting login) or cookies were passed"""
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3720
3721 def _mark_watched(self, *args, **kwargs):
3722 raise NotImplementedError('This method must be implemented by subclasses')
3723
38cce791
YCH
def geo_verification_headers(self):
    """Return a dict with the "Ytdl-request-proxy" header set to the
    "geo_verification_proxy" param, or an empty dict if that param is unset"""
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3730
8f97a15d 3731 @staticmethod
3732 def _generic_id(url):
14f25df2 3733 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3734
62b8dac4 3735 def _generic_title(self, url='', webpage='', *, default=None):
3736 return (self._og_search_title(webpage, default=None)
3737 or self._html_extract_title(webpage, default=None)
3738 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3739 or default)
98763ee3 3740
22ccd542 3741 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3742 if not duration:
3743 return
3744 chapter_list = [{
3745 'start_time': start_function(chapter),
3746 'title': title_function(chapter),
3747 } for chapter in chapter_list or []]
84ffeb7d 3748 if strict:
3749 warn = self.report_warning
3750 else:
3751 warn = self.write_debug
22ccd542 3752 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3753
3754 chapters = [{'start_time': 0}]
3755 for idx, chapter in enumerate(chapter_list):
3756 if chapter['start_time'] is None:
84ffeb7d 3757 warn(f'Incomplete chapter {idx}')
22ccd542 3758 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3759 chapters.append(chapter)
3760 elif chapter not in chapters:
84ffeb7d 3761 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3762 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3763 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
22ccd542 3764 return chapters[1:]
3765
3766 def _extract_chapters_from_description(self, description, duration):
3767 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3768 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3769 return self._extract_chapters_helper(
3770 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3771 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3772 duration=duration, strict=False) or self._extract_chapters_helper(
3773 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3774 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3775 duration=duration, strict=False)
3776
c224251a 3777 @staticmethod
b0089e89 3778 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3779 all_known = all(map(
3780 lambda x: x is not None,
3781 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3782 return (
3783 'private' if is_private
3784 else 'premium_only' if needs_premium
3785 else 'subscriber_only' if needs_subscription
3786 else 'needs_auth' if needs_auth
3787 else 'unlisted' if is_unlisted
3788 else 'public' if all_known
3789 else None)
3790
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Extractor key as a string, or an object whose ie_key() is
                        used; defaults to this extractor
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if val is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(val)
    return [x.lower() for x in val]
5d3a0e79 3803
f40ee5e9 3804 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3805 if not playlist_id or not video_id:
3806 return not video_id
3807
3808 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3809 if no_playlist is not None:
3810 return not no_playlist
3811
3812 video_id = '' if video_id is True else f' {video_id}'
3813 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3814 if self.get_param('noplaylist'):
3815 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3816 return False
3817 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3818 return True
3819
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Forward the error to RetryManager.report_retry using this extractor's
    output functions; when not fatal, report_warning is passed as the error function"""
    sleep_func = self.get_param('retry_sleep_functions', {}).get('extractor')
    on_error = self.report_warning if not fatal else None
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen, warn=self.report_warning, error=on_error,
        sleep_func=sleep_func)
be5c1ae8 3825
def RetryManager(self, **kwargs):
    """Create a RetryManager using the "extractor_retries" param (default 3)
    and _error_or_warning as the error callback"""
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3828
def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Print "note" and delegate to the Generic extractor's _extract_embeds,
    smuggling this extractor's key into the URL as "block_ies" """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    extract_embeds = self._downloader.get_info_extractor('Generic')._extract_embeds
    smuggled_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return extract_embeds(smuggled_url, *args, **kwargs)
3834
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of _extract_from_webpage after passing each of them
    to ydl.add_default_extra_info"""
    if isinstance(cls._extract_from_webpage, types.MethodType):
        # Already a bound method when accessed on the class; no instance needed
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    for info in ie._extract_from_webpage(url, webpage) or []:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
8f97a15d 3843
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for every unique embed URL found by _extract_embed_urls"""
    embed_urls = orderedSet(cls._extract_embed_urls(url, webpage) or [], lazy=True)
    for embed_url in embed_urls:
        # No extractor is specified for the result when _VALID_URL is False
        result_ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, result_ie)
8f97a15d 3849
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile (and sanity-check) the regexes once per class; cls.__dict__ is
    # checked so that a value inherited from a parent class is not reused
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(regex) for regex in cls._EMBED_REGEX)

    for pattern in cls._EMBED_URL_RE:
        for mobj in pattern.finditer(webpage):
            embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
            if cls._VALID_URL is not False and not cls.suitable(embed_url):
                continue
            yield embed_url
3865
# NOTE(review): neither raised nor caught anywhere in the visible part of this
# file; presumably used by subclasses to abort an extraction early - confirm
class StopExtraction(Exception):
    pass
3868
@classmethod
def _extract_url(cls, webpage):  # TODO: Remove
    """Only for compatibility with some older extractors.
    @returns the first embed url found by _extract_embed_urls, or None"""
    for embed_url in cls._extract_embed_urls(None, webpage) or []:
        return embed_url
    return None
3873
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """When "plugin_name" is given, register the new class as an override of
    the class it extends"""
    if plugin_name:
        lineage = inspect.getmro(cls)
        # The class directly after cls in its MRO is the one being wrapped
        super_class = lineage[lineage.index(cls) + 1]
        cls.__wrapped__ = super_class
        cls.PLUGIN_NAME = plugin_name
        cls.ie_key = super_class.ie_key
        cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
        # Follow the chain of overrides down to the innermost wrapped class
        while getattr(super_class, '__wrapped__', None):
            super_class = super_class.__wrapped__
        # Rebind that class' name in its own module to the overriding class
        setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
        _PLUGIN_OVERRIDES[super_class].append(cls)

    return super().__init_subclass__(**kwargs)
3887
8dbe9899 3888
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries in the format _SEARCH_KEY(|all|<positive integer>):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Translate the prefix into a number of results and fetch that many"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            # No prefix: just the first result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')
        if n > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice needs None (not infinity) to mean "no upper bound"
        limit = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
fe7866d0 3932
3933
# Matches every URL ('.*') and always fails with UnsupportedError
class UnsupportedURLIE(InfoExtractor):
    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        raise UnsupportedError(url)
e756f45b
M
3941
3942
# Maps an overridden extractor class to the list of plugin classes overriding
# it; entries are appended by InfoExtractor.__init_subclass__
_PLUGIN_OVERRIDES = collections.defaultdict(list)