]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
Standardize retry mechanism (#1649)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
cc16383f 8import itertools
3d3538e4 9import json
f8271158 10import math
4094b6e3 11import netrc
d6983cb4 12import os
773f291d 13import random
6929b41a 14import re
d6983cb4 15import sys
4094b6e3 16import time
8f97a15d 17import types
14f25df2 18import urllib.parse
ac668111 19import urllib.request
f8271158 20import xml.etree.ElementTree
d6983cb4 21
6929b41a 22from ..compat import functools # isort: split
14f25df2 23from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
eb8a4433 24from ..downloader import FileDownloader
f8271158 25from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 26from ..utils import (
8f97a15d 27 IDENTITY,
f8271158 28 JSON_LD_RE,
29 NO_DEFAULT,
30 ExtractorError,
31 GeoRestrictedError,
32 GeoUtils,
b7c47b74 33 LenientJSONDecoder,
f8271158 34 RegexNotFoundError,
be5c1ae8 35 RetryManager,
f8271158 36 UnsupportedError,
05900629 37 age_restricted,
02dc0a36 38 base_url,
08f2a92c 39 bug_reports_message,
82d02080 40 classproperty,
d6983cb4 41 clean_html,
70f0f5a8 42 determine_ext,
46b18f23 43 determine_protocol,
d493f15c 44 dict_get,
42676437 45 encode_data_uri,
9b9c5355 46 error_to_compat_str,
46b18f23 47 extract_attributes,
90137ca4 48 filter_dict,
97f4aecf 49 fix_xml_ampersands,
b14f3a4c 50 float_or_none,
b868936c 51 format_field,
31bb8d3f 52 int_or_none,
34921b43 53 join_nonempty,
a4a554a7 54 js_to_json,
46b18f23 55 mimetype2ext,
3158150c 56 network_exceptions,
46b18f23 57 orderedSet,
d493f15c 58 parse_bitrate,
46b18f23
JH
59 parse_codecs,
60 parse_duration,
4ca2a3cf 61 parse_iso8601,
46b18f23 62 parse_m3u8_attributes,
d493f15c 63 parse_resolution,
46b18f23 64 sanitize_filename,
8f97a15d 65 sanitize_url,
b868936c 66 sanitized_Request,
d493f15c 67 str_or_none,
ce5b9040 68 str_to_int,
f856816b 69 strip_or_none,
5d3a0e79 70 traverse_obj,
47046464 71 try_call,
ffa89477 72 try_get,
f38de77f 73 unescapeHTML,
647eab45 74 unified_strdate,
6b3a3098 75 unified_timestamp,
46b18f23 76 update_Request,
09d02ea4 77 update_url_query,
a107193e 78 url_basename,
bebef109 79 url_or_none,
b868936c 80 urljoin,
6606817a 81 variadic,
a6571f10 82 xpath_element,
8d6765cf
S
83 xpath_text,
84 xpath_with_ns,
d6983cb4 85)
c342041f 86
d6983cb4 87
86e5f3ed 88class InfoExtractor:
d6983cb4
PH
89 """Information Extractor class.
90
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
5d380852 95 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
98
cf0649f8 99 The type field determines the type of the result.
fed5d032
PH
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
102
103 For a video, the dictionaries must include the following fields:
d6983cb4
PH
104
105 id: Video identifier.
d4736fdb 106 title: Video title, unescaped. Set to an empty string if video has
107 no title as opposed to "None" which signifies that the
108 extractor failed to obtain a title
d67b0b15 109
f49d89ee 110 Additionally, it must contain either a formats entry or a url one:
d67b0b15 111
f49d89ee
PH
112 formats: A list of dictionaries for each format available, ordered
113 from worst to best quality.
114
115 Potential fields:
c790e93a
S
116 * url The mandatory URL representing the media:
117 for plain file media - HTTP URL of this file,
118 for RTMP - RTMP URL,
119 for HLS - URL of the M3U8 media playlist,
120 for HDS - URL of the F4M manifest,
79d2077e
S
121 for DASH
122 - HTTP URL to plain file media (in case of
123 unfragmented media)
124 - URL of the MPD manifest or base URL
125 representing the media if MPD manifest
8ed7a233 126 is parsed from a string (in case of
79d2077e 127 fragmented media)
c790e93a 128 for MSS - URL of the ISM manifest.
86f4d14f
S
129 * manifest_url
130 The URL of the manifest file in case of
c790e93a
S
131 fragmented media:
132 for HLS - URL of the M3U8 master playlist,
133 for HDS - URL of the F4M manifest,
134 for DASH - URL of the MPD manifest,
135 for MSS - URL of the ISM manifest.
a44ca5a4 136 * manifest_stream_number (For internal use only)
137 The index of the stream in the manifest file
10952eb2 138 * ext Will be calculated from URL if missing
d67b0b15
PH
139 * format A human-readable description of the format
140 ("mp4 container with h264/opus").
141 Calculated from the format_id, width, height,
142 and format_note fields if missing.
143 * format_id A short description of the format
5d4f3985
PH
144 ("mp4_h264_opus" or "19").
145 Technically optional, but strongly recommended.
d67b0b15
PH
146 * format_note Additional info about the format
147 ("3D" or "DASH video")
148 * width Width of the video, if known
149 * height Height of the video, if known
f49d89ee 150 * resolution Textual description of width and height
176f1866 151 * dynamic_range The dynamic range of the video. One of:
152 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 153 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
154 * abr Average audio bitrate in KBit/s
155 * acodec Name of the audio codec in use
dd27fd17 156 * asr Audio sampling rate in Hertz
d67b0b15 157 * vbr Average video bitrate in KBit/s
fbb21cf5 158 * fps Frame rate
d67b0b15 159 * vcodec Name of the video codec in use
1394ce65 160 * container Name of the container format
d67b0b15 161 * filesize The number of bytes, if known in advance
9732d77e 162 * filesize_approx An estimate for the number of bytes
d67b0b15 163 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 164 * protocol The protocol that will be used for the actual
adbc4ec4
THD
165 download, lower-case. One of "http", "https" or
166 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
167 * fragment_base_url
168 Base URL for fragments. Each fragment's path
169 value (if present) will be relative to
170 this URL.
171 * fragments A list of fragments of a fragmented media.
172 Each fragment entry must contain either an url
173 or a path. If an url is present it should be
174 considered by a client. Otherwise both path and
175 fragment_base_url must be present. Here is
176 the list of all potential fields:
177 * "url" - fragment's URL
178 * "path" - fragment's path relative to
179 fragment_base_url
a0d5077c
S
180 * "duration" (optional, int or float)
181 * "filesize" (optional, int)
adbc4ec4
THD
182 * is_from_start Is a live format that can be downloaded
183 from the start. Boolean
f49d89ee 184 * preference Order number of this format. If this field is
08d13955 185 present and not None, the formats get sorted
38d63d84 186 by this field, regardless of all other values.
f49d89ee
PH
187 -1 for default (order by other properties),
188 -2 or smaller for less than default.
e65566a9
PH
189 < -1000 to hide the format (if there is
190 another one which is strictly better)
32f90364
PH
191 * language Language code, e.g. "de" or "en-US".
192 * language_preference Is this in the language mentioned in
193 the URL?
aff2f4f4
PH
194 10 if it's what the URL is about,
195 -1 for default (don't know),
196 -10 otherwise, other values reserved for now.
5d73273f
PH
197 * quality Order number of the video quality of this
198 format, irrespective of the file format.
199 -1 for default (order by other properties),
200 -2 or smaller for less than default.
c64ed2a3
PH
201 * source_preference Order number for this video source
202 (quality takes higher priority)
203 -1 for default (order by other properties),
204 -2 or smaller for less than default.
d769be6c
PH
205 * http_headers A dictionary of additional HTTP headers
206 to add to the request.
6271f1ca 207 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
208 video's pixels are not square.
209 width : height ratio as float.
210 * no_resume The server does not support resuming the
211 (HTTP or RTMP) download. Boolean.
88acdbc2 212 * has_drm The format has DRM and cannot be downloaded. Boolean
0a5a191a 213 * downloader_options A dictionary of downloader options
214 (For internal use only)
215 * http_chunk_size Chunk size for HTTP downloads
216 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 217 RTMP formats can also have the additional fields: page_url,
218 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
219 rtmp_protocol, rtmp_real_time
3dee7826 220
c0ba0f48 221 url: Final video URL.
d6983cb4 222 ext: Video filename extension.
d67b0b15
PH
223 format: The video format, defaults to ext (used for --get-format)
224 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 225
d6983cb4
PH
226 The following fields are optional:
227
08d30158 228 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 229 alt_title: A secondary title of the video.
0afef30b
PH
230 display_id An alternative identifier for the video, not necessarily
231 unique, but available before title. Typically, id is
232 something like "4234987", title "Dancing naked mole rats",
233 and display_id "dancing-naked-mole-rats"
d5519808 234 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 235 * "id" (optional, string) - Thumbnail format ID
d5519808 236 * "url"
cfb56d1a 237 * "preference" (optional, int) - quality of the image
d5519808
PH
238 * "width" (optional, int)
239 * "height" (optional, int)
5e1c39ac 240 * "resolution" (optional, string "{width}x{height}",
d5519808 241 deprecated)
2de624fd 242 * "filesize" (optional, int)
297e9952 243 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 244 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 245 description: Full video description.
d6983cb4 246 uploader: Full name of the video uploader.
2bc0c46f 247 license: License name the video is licensed under.
8a92e51c 248 creator: The creator of the video.
10db0d2f 249 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 250 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 251 If not explicitly set, calculated from timestamp
252 release_timestamp: UNIX timestamp of the moment the video was released.
253 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 254 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 255 If not explicitly set, calculated from release_timestamp
256 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 257 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 258 If not explicitly set, calculated from modified_timestamp
d6983cb4 259 uploader_id: Nickname or id of the video uploader.
7bcd2830 260 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 261 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 262 Note that channel fields may or may not repeat uploader
6f1f59f3
S
263 fields. This depends on a particular extractor.
264 channel_id: Id of the channel.
265 channel_url: Full URL to a channel webpage.
6c73052c 266 channel_follower_count: Number of followers of the channel.
da9ec3b9 267 location: Physical location where the video was filmed.
a504ced0 268 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
269 {tag: subformats}. "tag" is usually a language code, and
270 "subformats" is a list sorted from lower to higher
271 preference, each element is a dictionary with the "ext"
272 entry and one of:
a504ced0 273 * "data": The subtitles file contents
10952eb2 274 * "url": A URL pointing to the subtitles file
2412044c 275 It can optionally also have:
276 * "name": Name or description of the subtitles
08d30158 277 * "http_headers": A dictionary of additional HTTP headers
297e9952 278 to add to the request.
4bba3716 279 "ext" will be calculated from URL if missing
e167860c 280 automatic_captions: Like 'subtitles'; contains automatically generated
281 captions instead of normal subtitles
62d231c0 282 duration: Length of the video in seconds, as an integer or float.
f3d29461 283 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
284 like_count: Number of positive ratings of the video
285 dislike_count: Number of negative ratings of the video
02835c6b 286 repost_count: Number of reposts of the video
2d30521a 287 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 288 comment_count: Number of comments on the video
dd622d7c
PH
289 comments: A list of comments, each with one or more of the following
290 properties (all but one of text or html optional):
291 * "author" - human-readable name of the comment author
292 * "author_id" - user ID of the comment author
a1c5d2ca 293 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
294 * "id" - Comment ID
295 * "html" - Comment as HTML
296 * "text" - Plain text of the comment
297 * "timestamp" - UNIX timestamp of comment
298 * "parent" - ID of the comment this one is replying to.
299 Set to "root" to indicate that this is a
300 comment to the original video.
a1c5d2ca
M
301 * "like_count" - Number of positive ratings of the comment
302 * "dislike_count" - Number of negative ratings of the comment
303 * "is_favorited" - Whether the comment is marked as
304 favorite by the video uploader
305 * "author_is_uploader" - Whether the comment is made by
306 the video uploader
8dbe9899 307 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 308 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
309 should allow to get the same result again. (It will be set
310 by YoutubeDL if it's missing)
ad3bc6ac
PH
311 categories: A list of categories that the video falls in, for example
312 ["Sports", "Berlin"]
864f24bd 313 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 314 cast: A list of the video cast
7267bd53
PH
315 is_live: True, False, or None (=unknown). Whether this video is a
316 live stream that goes on instead of a fixed-length video.
f76ede8e 317 was_live: True, False, or None (=unknown). Whether this video was
318 originally a live stream.
e325a21a 319 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live'
320 or 'post_live' (was live, but VOD is not yet processed)
ae30b840 321 If absent, automatically set from is_live, was_live
7c80519c 322 start_time: Time in seconds where the reproduction should start, as
10952eb2 323 specified in the URL.
297a564b 324 end_time: Time in seconds where the reproduction should end, as
10952eb2 325 specified in the URL.
55949fed 326 chapters: A list of dictionaries, with the following entries:
327 * "start_time" - The start time of the chapter in seconds
328 * "end_time" - The end time of the chapter in seconds
329 * "title" (optional, string)
6cfda058 330 playable_in_embed: Whether this video is allowed to play in embedded
331 players on other sites. Can be True (=always allowed),
332 False (=never allowed), None (=unknown), or a string
c224251a
M
333 specifying the criteria for embedability (Eg: 'whitelist')
334 availability: Under what condition the video is available. One of
335 'private', 'premium_only', 'subscriber_only', 'needs_auth',
336 'unlisted' or 'public'. Use 'InfoExtractor._availability'
337 to set it
1e8fe57e 338 _old_archive_ids: A list of old archive ids needed for backward compatibility
277d6ff5 339 __post_extractor: A function to be called just before the metadata is
340 written to either disk, logger or console. The function
341 must return a dict which will be added to the info_dict.
342 This is useful for additional information that is
343 time-consuming to extract. Note that the fields thus
344 extracted will not be available to output template and
345 match_filter. So, only "comments" and "comment_count" are
346 currently allowed to be extracted via this method.
d6983cb4 347
7109903e
S
348 The following fields should only be used when the video belongs to some logical
349 chapter or section:
350
351 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
352 chapter_number: Number of the chapter the video belongs to, as an integer.
353 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
354
355 The following fields should only be used when the video is an episode of some
8d76bdf1 356 series, programme or podcast:
7109903e
S
357
358 series: Title of the series or programme the video episode belongs to.
9ac24e23 359 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 360 season: Title of the season the video episode belongs to.
27bfd4e5
S
361 season_number: Number of the season the video episode belongs to, as an integer.
362 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
363 episode: Title of the video episode. Unlike mandatory video title field,
364 this field should denote the exact title of the video episode
365 without any kind of decoration.
27bfd4e5
S
366 episode_number: Number of the video episode within a season, as an integer.
367 episode_id: Id of the video episode, as a unicode string.
7109903e 368
7a93ab5f
S
369 The following fields should only be used when the media is a track or a part of
370 a music album:
371
372 track: Title of the track.
373 track_number: Number of the track within an album or a disc, as an integer.
374 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
375 as a unicode string.
376 artist: Artist(s) of the track.
377 genre: Genre(s) of the track.
378 album: Title of the album the track belongs to.
379 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
380 album_artist: List of all artists appeared on the album (e.g.
381 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
382 and compilations).
383 disc_number: Number of the disc or other physical medium the track belongs to,
384 as an integer.
385 release_year: Year (YYYY) when the album was released.
8bcd4048 386 composer: Composer of the piece
7a93ab5f 387
3975b4d2 388 The following fields should only be set for clips that should be cut from the original video:
389
390 section_start: Start time of the section in seconds
391 section_end: End time of the section in seconds
392
45e8a04e 393 The following fields should only be set for storyboards:
394 rows: Number of rows in each storyboard fragment, as an integer
395 columns: Number of columns in each storyboard fragment, as an integer
396
deefc05b 397 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 398
d838b1bd
PH
399 Unless mentioned otherwise, None is equivalent to absence of information.
400
fed5d032
PH
401
402 _type "playlist" indicates multiple videos.
b82f815f
PH
403 There must be a key "entries", which is a list, an iterable, or a PagedList
404 object, each element of which is a valid dictionary by this specification.
fed5d032 405
962ffcf8 406 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 407 attributes with the same semantics as videos (see above).
fed5d032 408
f0d785d3 409 It can also have the following optional fields:
410
411 playlist_count: The total number of videos in a playlist. If not given,
412 YoutubeDL tries to calculate it from "entries"
413
fed5d032
PH
414
415 _type "multi_video" indicates that there are multiple videos that
416 form a single show, for example multiple acts of an opera or TV episode.
417 It must have an entries key like a playlist and contain all the keys
418 required for a video at the same time.
419
420
421 _type "url" indicates that the video must be extracted from another
422 location, possibly by a different extractor. Its only required key is:
423 "url" - the next URL to extract.
f58766ce
PH
424 The key "ie_key" can be set to the class name (minus the trailing "IE",
425 e.g. "Youtube") if the extractor class is known in advance.
426 Additionally, the dictionary may have any properties of the resolved entity
427 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
428 known ahead of time.
429
430
431 _type "url_transparent" entities have the same specification as "url", but
432 indicate that the given additional information is more precise than the one
433 associated with the resolved URL.
434 This is useful when a site employs a video service that hosts the video and
435 its technical metadata, but that video service does not embed a useful
436 title, description etc.
437
438
8f97a15d 439 Subclasses of this should also be added to the list of extractors and
440 should define a _VALID_URL regexp and, re-define the _real_extract() and
441 (optionally) _real_initialize() methods.
d6983cb4 442
e6f21b3d 443 Subclasses may also override suitable() if necessary, but ensure the function
444 signature is preserved and that this function imports everything it needs
52efa4b3 445 (except other extractors), so that lazy_extractors works correctly.
446
8f97a15d 447 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
448 the HTML of Generic webpages. It may also override _extract_embed_urls
449 or _extract_from_webpage as necessary. While these are normally classmethods,
450 _extract_from_webpage is allowed to be an instance method.
451
452 _extract_from_webpage may raise self.StopExtraction() to stop further
453 processing of the webpage and obtain exclusive rights to it. This is useful
454 when the extractor cannot reliably be matched using just the URL.
455 Eg: invidious/peertube instances
456
457 Embed-only extractors can be defined by setting _VALID_URL = False.
458
52efa4b3 459 To support username + password (or netrc) login, the extractor must define a
460 _NETRC_MACHINE and re-define _perform_login(username, password) and
461 (optionally) _initialize_pre_login() methods. The _perform_login method will
462 be called between _initialize_pre_login and _real_initialize if credentials
463 are passed by the user. In cases where it is necessary to have the login
464 process as part of the extraction rather than initialization, _perform_login
465 can be left undefined.
e6f21b3d 466
4248dad9 467 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
468 geo restriction bypass mechanisms for a particular extractor.
469 Though it won't disable explicit geo restriction bypass based on
504f20dd 470 country code provided with geo_bypass_country.
4248dad9
S
471
472 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
473 countries for this extractor. One of these countries will be used by
474 geo restriction bypass mechanism right away in order to bypass
504f20dd 475 geo restriction, of course, if the mechanism is not disabled.
773f291d 476
5f95927a
S
477 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
478 IP blocks in CIDR notation for this extractor. One of these IP blocks
479 will be used by geo restriction bypass mechanism similarly
504f20dd 480 to _GEO_COUNTRIES.
3ccdde8c 481
e6f21b3d 482 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
483 in order to warn the users and skip the tests.
484 """
485
486 _ready = False
487 _downloader = None
773f291d 488 _x_forwarded_for_ip = None
4248dad9
S
489 _GEO_BYPASS = True
490 _GEO_COUNTRIES = None
5f95927a 491 _GEO_IP_BLOCKS = None
d6983cb4 492 _WORKING = True
52efa4b3 493 _NETRC_MACHINE = None
231025c4 494 IE_DESC = None
8dcce6a8 495 SEARCH_KEY = None
8f97a15d 496 _VALID_URL = None
497 _EMBED_REGEX = []
d6983cb4 498
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a hint telling the user how to supply authentication.

    method: None (yields an empty string), 'any', 'password' or 'cookies'.
            When left at NO_DEFAULT, 'any' is picked if supports_login()
            is truthy and 'cookies' otherwise. Any other value raises
            KeyError.
    netrc:  Machine name shown in the --netrc part of the hint; falls back
            to self._NETRC_MACHINE when falsy.
    """
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'

    machine = netrc or self._NETRC_MACHINE
    password_hint = f'--username and --password, or --netrc ({machine}) to provide account credentials'
    cookies_hint = (
        'Use --cookies-from-browser or --cookies for the authentication. '
        'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies')

    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': cookies_hint,
    }
    return hints[method]
9d5d4d64 509
d6983cb4 510 def __init__(self, downloader=None):
49a57e70 511 """Constructor. Receives an optional downloader (a YoutubeDL instance).
512 If a downloader is not passed during initialization,
513 it must be set using "set_downloader()" before "extract()" is called"""
d6983cb4 514 self._ready = False
773f291d 515 self._x_forwarded_for_ip = None
28f436ba 516 self._printed_messages = set()
d6983cb4
PH
517 self.set_downloader(downloader)
518
519 @classmethod
5ad28e7f 520 def _match_valid_url(cls, url):
8f97a15d 521 if cls._VALID_URL is False:
522 return None
79cb2577
PH
523 # This does not use has/getattr intentionally - we want to know whether
524 # we have cached the regexp for *this* class, whereas getattr would also
525 # match the superclass
526 if '_VALID_URL_RE' not in cls.__dict__:
527 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 528 return cls._VALID_URL_RE.match(url)
529
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True) or not (False)."""
    # Keep this function self-contained: it must import everything it needs
    # (except other extractors), so that lazy_extractors works correctly
    match = cls._match_valid_url(url)
    return match is not None
d6983cb4 536
ed9266db
PH
537 @classmethod
538 def _match_id(cls, url):
5ad28e7f 539 return cls._match_valid_url(url).group('id')
ed9266db 540
@classmethod
def get_temp_id(cls, url):
    """Best-effort variant of _match_id(): return the id, or None when
    _match_id() fails with IndexError or AttributeError."""
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        temp_id = None
    return temp_id
547
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the _WORKING class attribute."""
    status = cls._WORKING
    return status
552
@classmethod
def supports_login(cls):
    """Return True when a truthy _NETRC_MACHINE is defined, False otherwise."""
    return True if cls._NETRC_MACHINE else False
556
d6983cb4
PH
def initialize(self):
    """Prepare the instance for extraction (geo bypass, login, etc).

    The printed-message set and the geo bypass are (re)initialized on every
    call; the pre-login, login and _real_initialize() steps run only until
    the instance has been marked ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)

    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # A username was supplied, but this extractor cannot log in with one
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True
574
5f95927a 575 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
576 """
577 Initialize geo restriction bypass mechanism.
578
579 This method is used to initialize geo bypass mechanism based on faking
580 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 581 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
582 IP will be passed as X-Forwarded-For HTTP header in all subsequent
583 HTTP requests.
e39b5d4a
S
584
585 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
586 during the instance initialization with _GEO_COUNTRIES and
587 _GEO_IP_BLOCKS.
e39b5d4a 588
5f95927a 589 You may also manually call it from extractor's code if geo bypass
e39b5d4a 590 information is not available beforehand (e.g. obtained during
5f95927a
S
591 extraction) or due to some other reason. In this case you should pass
592 this information in geo bypass context passed as first argument. It may
593 contain following fields:
594
595 countries: List of geo unrestricted countries (similar
596 to _GEO_COUNTRIES)
597 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
598 (similar to _GEO_IP_BLOCKS)
599
e39b5d4a 600 """
773f291d 601 if not self._x_forwarded_for_ip:
5f95927a
S
602
603 # Geo bypass mechanism is explicitly disabled by user
a06916d9 604 if not self.get_param('geo_bypass', True):
5f95927a
S
605 return
606
607 if not geo_bypass_context:
608 geo_bypass_context = {}
609
610 # Backward compatibility: previously _initialize_geo_bypass
611 # expected a list of countries, some 3rd party code may still use
612 # it this way
613 if isinstance(geo_bypass_context, (list, tuple)):
614 geo_bypass_context = {
615 'countries': geo_bypass_context,
616 }
617
618 # The whole point of geo bypass mechanism is to fake IP
619 # as X-Forwarded-For HTTP header based on some IP block or
620 # country code.
621
622 # Path 1: bypassing based on IP block in CIDR notation
623
624 # Explicit IP block specified by user, use it right away
625 # regardless of whether extractor is geo bypassable or not
a06916d9 626 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
627
628 # Otherwise use random IP block from geo bypass context but only
629 # if extractor is known as geo bypassable
630 if not ip_block:
631 ip_blocks = geo_bypass_context.get('ip_blocks')
632 if self._GEO_BYPASS and ip_blocks:
633 ip_block = random.choice(ip_blocks)
634
635 if ip_block:
636 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 637 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
5f95927a
S
638 return
639
640 # Path 2: bypassing based on country code
641
642 # Explicit country code specified by user, use it right away
643 # regardless of whether extractor is geo bypassable or not
a06916d9 644 country = self.get_param('geo_bypass_country', None)
5f95927a
S
645
646 # Otherwise use random country code from geo bypass context but
647 # only if extractor is known as geo bypassable
648 if not country:
649 countries = geo_bypass_context.get('countries')
650 if self._GEO_BYPASS and countries:
651 country = random.choice(countries)
652
653 if country:
654 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 655 self._downloader.write_debug(
86e5f3ed 656 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
657
def extract(self, url):
    """Extracts URL information and returns it.

    Returns whatever _real_extract() produced (None is passed through
    unchanged). A non-None result gets "__x_forwarded_for_ip" added when a
    fake IP is in use, and has the live chat-like subtitle entries removed
    when the "no-live-chat" compat option is set.

    ExtractorError is re-raised as the same type, with video id, IE name and
    traceback filled in; UnsupportedError is passed through as is;
    IncompleteRead, KeyError and StopIteration are wrapped into
    ExtractorError.
    """
    try:
        # Two attempts at most: the second one only happens when the first
        # was geo restricted and a fake IP could be set up for the retry
        for _attempt in range(2):
            try:
                self.initialize()
                self.write_debug('Extracting URL: %s' % url)
                result = self._real_extract(url)
                if result is None:
                    return None
                if self._x_forwarded_for_ip:
                    result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subs = result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for unwanted in ('live_chat', 'comments', 'danmaku'):
                        subs.pop(unwanted, None)
                return result
            except GeoRestrictedError as e:
                if not self.__maybe_fake_ip_and_retry(e.countries):
                    raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        details = dict(
            video_id=e.video_id or self.get_temp_id(url),
            ie=self.IE_NAME,
            tb=e.traceback or sys.exc_info()[2],
            expected=e.expected,
            cause=e.cause,
        )
        if hasattr(e, 'countries'):
            details['countries'] = e.countries
        raise type(e)(e.orig_msg, **details)
    except http.client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 696
def __maybe_fake_ip_and_retry(self, countries):
    """Set up a fake X-Forwarded-For IP for one of ``countries`` if allowed.

    Returns True when a fake IP was chosen (the caller should retry the
    extraction), False otherwise.
    """
    may_fake = (
        not self.get_param('geo_bypass_country', None)
        and self._GEO_BYPASS
        and self.get_param('geo_bypass', True)
        and not self._x_forwarded_for_ip
        and countries)
    if not may_fake:
        return False

    country_code = random.choice(countries)
    self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    if not self._x_forwarded_for_ip:
        return False

    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (self._x_forwarded_for_ip, country_code.upper()))
    return True
711
def set_downloader(self, downloader):
    """Sets a YoutubeDL instance as the downloader for this IE."""
    # Stored as-is; the other methods reach it through self._downloader
    self._downloader = downloader
715
@property
def cache(self):
    """The ``cache`` attribute of the downloader set via set_downloader()."""
    return self._downloader.cache
719
@property
def cookiejar(self):
    """The ``cookiejar`` attribute of the downloader set via set_downloader()."""
    return self._downloader.cookiejar
723
def _initialize_pre_login(self):
    """Initialization before login. Does nothing here; redefine in subclasses."""
    pass
727
def _perform_login(self, username, password):
    """Login with username and password. Does nothing here; redefine in subclasses."""
    pass
731
d6983cb4
PH
def _real_initialize(self):
    """Real initialization process. Does nothing here; redefine in subclasses."""
    pass
735
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    The base implementation always raises NotImplementedError.
    """
    raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 739
56c73665
JMF
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # The class name minus its last two characters
    # (presumably the "IE" suffix -- confirm against the naming convention)
    return cls.__name__[:-2]
56c73665 744
@classproperty
def IE_NAME(cls):
    """Default extractor name: the class name minus its last two characters."""
    return cls.__name__[:-2]
d6983cb4 748
d391b7e2
S
749 @staticmethod
750 def __can_accept_status_code(err, expected_status):
ac668111 751 assert isinstance(err, urllib.error.HTTPError)
d391b7e2
S
752 if expected_status is None:
753 return False
d391b7e2
S
754 elif callable(expected_status):
755 return expected_status(err.code) is True
756 else:
6606817a 757 return err.code in variadic(expected_status)
d391b7e2 758
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Build a request from a URL string or update an existing Request.

    An existing ``urllib.request.Request`` is passed through update_Request;
    a plain URL gets ``query`` merged in (when given) and is wrapped with
    sanitized_Request.
    """
    if isinstance(url_or_request, urllib.request.Request):
        return update_Request(url_or_request, data=data, headers=headers, query=query)
    target = update_url_query(url_or_request, query) if query else url_or_request
    return sanitized_Request(target, data, headers or {})
f95b9dee 765
c043c246 766 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
d391b7e2
S
767 """
768 Return the response handle.
769
770 See _download_webpage docstring for arguments specification.
771 """
1cf376f5 772 if not self._downloader._first_webpage_request:
49a57e70 773 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 774 if sleep_interval > 0:
5ef7d9bd 775 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 776 time.sleep(sleep_interval)
777 else:
778 self._downloader._first_webpage_request = False
779
d6983cb4
PH
780 if note is None:
781 self.report_download_webpage(video_id)
782 elif note is not False:
7cc3570e 783 if video_id is None:
86e5f3ed 784 self.to_screen(str(note))
7cc3570e 785 else:
86e5f3ed 786 self.to_screen(f'{video_id}: {note}')
2132edaa
S
787
788 # Some sites check X-Forwarded-For HTTP header in order to figure out
789 # the origin of the client behind proxy. This allows bypassing geo
790 # restriction by faking this header's value to IP that belongs to some
791 # geo unrestricted country. We will do so once we encounter any
792 # geo restriction error.
793 if self._x_forwarded_for_ip:
c043c246 794 headers = (headers or {}).copy()
795 headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
2132edaa 796
d6983cb4 797 try:
f95b9dee 798 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 799 except network_exceptions as err:
ac668111 800 if isinstance(err, urllib.error.HTTPError):
d391b7e2 801 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
802 # Retain reference to error to prevent file object from
803 # being closed before it can be read. Works around the
804 # effects of <https://bugs.python.org/issue15002>
805 # introduced in Python 3.4.1.
806 err.fp._error = err
d391b7e2
S
807 return err.fp
808
aa94a6d3
PH
809 if errnote is False:
810 return False
d6983cb4 811 if errnote is None:
f1a9d64e 812 errnote = 'Unable to download webpage'
7f8b2714 813
86e5f3ed 814 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 815 if fatal:
497d2fab 816 raise ExtractorError(errmsg, cause=err)
7cc3570e 817 else:
6a39ee13 818 self.report_warning(errmsg)
7cc3570e 819 return False
d6983cb4 820
1890fc63 821 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
822 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2
S
823 """
824 Return a tuple (page content as string, URL handle).
825
617f658b 826 Arguments:
827 url_or_request -- plain text URL as a string or
ac668111 828 a urllib.request.Request object
617f658b 829 video_id -- Video/playlist/item identifier (string)
830
831 Keyword arguments:
832 note -- note printed before downloading (string)
833 errnote -- note printed in case of an error (string)
834 fatal -- flag denoting whether error should be considered fatal,
835 i.e. whether it should cause ExtractionError to be raised,
836 otherwise a warning will be reported and extraction continued
837 encoding -- encoding for a page content decoding, guessed automatically
838 when not explicitly specified
839 data -- POST data (bytes)
840 headers -- HTTP headers (dict)
841 query -- URL query (dict)
842 expected_status -- allows to accept failed HTTP requests (non 2xx
843 status code) by explicitly specifying a set of accepted status
844 codes. Can be any of the following entities:
845 - an integer type specifying an exact failed status code to
846 accept
847 - a list or a tuple of integer types specifying a list of
848 failed status codes to accept
849 - a callable accepting an actual failed status code and
850 returning True if it should be accepted
851 Note that this argument does not affect success status codes (2xx)
852 which are always accepted.
d391b7e2 853 """
617f658b 854
b9d3e163 855 # Strip hashes from the URL (#1038)
14f25df2 856 if isinstance(url_or_request, str):
b9d3e163
PH
857 url_or_request = url_or_request.partition('#')[0]
858
d391b7e2 859 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
860 if urlh is False:
861 assert not fatal
862 return False
c9a77969 863 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
864 return (content, urlh)
865
c9a77969
YCH
866 @staticmethod
867 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
868 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
869 if m:
870 encoding = m.group(1)
871 else:
0d75ae2c 872 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
873 webpage_bytes[:1024])
874 if m:
875 encoding = m.group(1).decode('ascii')
b60016e8
PH
876 elif webpage_bytes.startswith(b'\xff\xfe'):
877 encoding = 'utf-16'
f143d86a
PH
878 else:
879 encoding = 'utf-8'
c9a77969
YCH
880
881 return encoding
882
4457823d
S
883 def __check_blocked(self, content):
884 first_block = content[:512]
3089bc74
S
885 if ('<title>Access to this site is blocked</title>' in content
886 and 'Websense' in first_block):
4457823d
S
887 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
888 blocked_iframe = self._html_search_regex(
889 r'<iframe src="([^"]+)"', content,
890 'Websense information URL', default=None)
891 if blocked_iframe:
892 msg += ' Visit %s for more details' % blocked_iframe
893 raise ExtractorError(msg, expected=True)
894 if '<title>The URL you requested has been blocked</title>' in first_block:
895 msg = (
896 'Access to this webpage has been blocked by Indian censorship. '
897 'Use a VPN or proxy server (with --proxy) to route around it.')
898 block_msg = self._html_search_regex(
899 r'</h1><p>(.*?)</p>',
900 content, 'block message', default=None)
901 if block_msg:
902 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
903 raise ExtractorError(msg, expected=True)
3089bc74
S
904 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
905 and 'blocklist.rkn.gov.ru' in content):
4457823d
S
906 raise ExtractorError(
907 'Access to this webpage has been blocked by decision of the Russian government. '
908 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
909 expected=True)
910
def _request_dump_filename(self, url, video_id):
    """Return the file name used for dumping/loading the page of ``url``.

    The name is built from ``video_id`` and ``url``, shortened to the
    'trim_file_name' option (240 when unset) with an md5 suffix, sanitized,
    and given a '.dump' extension.
    """
    basen = f'{video_id}_{url}'
    limit = self.get_param('trim_file_name') or 240
    if len(basen) > limit:
        digest = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
        basen = basen[:limit - len(digest)] + digest
    filename = sanitize_filename(f'{basen}.dump', restricted=True)
    if compat_os_name != 'nt':
        return filename
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absfilepath = os.path.abspath(filename)
    return fR'\\?\{absfilepath}' if len(absfilepath) > 259 else filename
925
926 def __decode_webpage(self, webpage_bytes, encoding, headers):
927 if not encoding:
928 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
929 try:
930 return webpage_bytes.decode(encoding, 'replace')
931 except LookupError:
932 return webpage_bytes.decode('utf-8', 'replace')
933
c9a77969 934 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
c9a77969
YCH
935 webpage_bytes = urlh.read()
936 if prefix is not None:
937 webpage_bytes = prefix + webpage_bytes
a06916d9 938 if self.get_param('dump_intermediate_pages', False):
f610dbb0 939 self.to_screen('Dumping request to ' + urlh.geturl())
d6983cb4
PH
940 dump = base64.b64encode(webpage_bytes).decode('ascii')
941 self._downloader.to_screen(dump)
f95b9dee 942 if self.get_param('write_pages'):
e121e3ce 943 filename = self._request_dump_filename(urlh.geturl(), video_id)
f95b9dee 944 self.to_screen(f'Saving request to {filename}')
d41e6efc
PH
945 with open(filename, 'wb') as outf:
946 outf.write(webpage_bytes)
947
f95b9dee 948 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
4457823d 949 self.__check_blocked(content)
2410c43d 950
23be51d8 951 return content
d6983cb4 952
6edf2808 953 def __print_error(self, errnote, fatal, video_id, err):
954 if fatal:
c6e07cf1 955 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 956 elif errnote:
c6e07cf1 957 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 958
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """Parse ``xml_string`` (after the optional ``transform_source``) into an
    element; on a parse error raise or warn depending on ``fatal``."""
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = errnote if errnote is not None else 'Failed to parse XML'
        self.__print_error(message, fatal, video_id, ve)
267ed0c5 966
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """Parse ``json_string`` with LenientJSONDecoder; on a ValueError raise or
    warn depending on ``fatal``. Extra keyword arguments go to the decoder."""
    try:
        return json.loads(
            json_string, cls=LenientJSONDecoder, strict=False,
            transform_source=transform_source, **parser_kwargs)
    except ValueError as ve:
        message = errnote if errnote is not None else 'Failed to parse JSON'
        self.__print_error(message, fatal, video_id, ve)
3d3538e4 973
6edf2808 974 def _parse_socket_response_as_json(self, data, *args, **kwargs):
975 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 976
def __create_download_methods(name, parser, note, errnote, return_value):
    # Factory producing a (_download_<name>_handle, _download_<name>) pair.
    #   name         -- suffix used in the generated method names
    #   parser       -- name of the parsing method looked up on the extractor,
    #                   or None to return the page content unparsed
    #   note/errnote -- default notes of the generated methods
    #   return_value -- description inserted into the generated docstrings

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        if parser is None:
            return content
        # errnote is only forwarded to the parser when it is exactly False
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        # False signals a non-fatal download failure and is passed through
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With 'load_pages' set, try a previously dumped page first and fall
        # back to a real download when the dump cannot be read
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.full_url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        # Without a parser the call goes to _download_webpage_handle, which
        # takes no transform_source argument
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    def impersonate(func, name, return_value):
        # Give the generated function its public name and docstring
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
            @param transform_source     Apply this transformation before parsing
            @returns                    {return_value}

            See _download_webpage_handle docstring for other arguments specification
        '''

    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content
1040
# Generated download methods: each call yields a (handle variant, content
# variant) pair for the named parser. For plain webpages only the content
# variant is kept, under a private name used by _download_webpage.
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
adddc50c 1048
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not given)

    An http.client.IncompleteRead is retried until ``tries`` attempts were
    made and is then re-raised.

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries/timeout are legacy arguments; no deprecation warning is
    # emitted for them (yet). The default still has to be resolved here,
    # otherwise a retry would sleep for the NO_DEFAULT sentinel.
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except http.client.IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
adddc50c 1080
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """Forward a warning to the downloader, prefixed with '[ie_name]' and the
    video id (if any). With ``only_once`` a repeated message is dropped."""
    id_prefix = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {id_prefix}{msg}'
    if only_once:
        key = f'WARNING: {msg}'
        if key in self._printed_messages:
            return
        self._printed_messages.add(key)
    self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1089
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    # Extra arguments are passed to the downloader's to_screen unchanged
    self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1093
def write_debug(self, msg, *args, **kwargs):
    """Write msg via the downloader's write_debug, prefixing it with '[ie_name]'"""
    self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
a06916d9 1096
def get_param(self, name, default=None, *args, **kwargs):
    """Look up option ``name`` in the downloader's params; return ``default``
    when no downloader is set or the option is missing."""
    downloader = self._downloader
    if not downloader:
        return default
    return downloader.params.get(name, default, *args, **kwargs)
d6983cb4 1101
def report_drm(self, video_id, partial=False):
    """Report that the video is DRM protected via raise_no_formats().

    ``partial`` is accepted but not used by this implementation.
    """
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1104
d6983cb4
PH
def report_extraction(self, id_or_name):
    """Report information extraction."""
    # Printed through to_screen, so the message carries the '[ie_name]' prefix
    self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1108
def report_download_webpage(self, video_id):
    """Report webpage download."""
    # Printed through to_screen, so the message carries the '[ie_name]' prefix
    self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1112
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    # Printed through to_screen, so the message carries the '[ie_name]' prefix
    self.to_screen('Confirming age')
d6983cb4 1116
fc79158d
JMF
def report_login(self):
    """Report attempt to log in."""
    # Printed through to_screen, so the message carries the '[ie_name]' prefix
    self.to_screen('Logging in')
fc79158d 1120
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """Signal that login is required.

    Only warns when metadata is available and either the
    'ignore_no_formats_error' or the 'wait_for_video' option is set;
    otherwise raises an expected ExtractorError with the login hint appended.
    """
    warn_only = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if warn_only:
        self.report_warning(msg)
        return
    hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + hint, expected=True)
43e7d3c9 1130
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """Signal a geo restriction.

    Only warns when metadata is available and either the
    'ignore_no_formats_error' or the 'wait_for_video' option is set;
    otherwise raises GeoRestrictedError for ``countries``.
    """
    if not metadata_available or not (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
1139
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Signal that no formats are available.

    Only warns when the condition is ``expected`` and either the
    'ignore_no_formats_error' or the 'wait_for_video' option is set.
    Otherwise raises ``msg`` itself if it is an ExtractorError, or a new
    ExtractorError built from it.
    """
    if expected and (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1148
5f6a1245 1149 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
    """Returns a URL that points to a page that should be processed"""
    if ie is not None:
        # Accept either an extractor key string or an object providing ie_key()
        kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
    for key, value in (('id', video_id), ('title', video_title)):
        if value is not None:
            kwargs[key] = value

    result = dict(kwargs)
    result['_type'] = 'url_transparent' if url_transparent else 'url'
    result['url'] = url
    return result
1164
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """Build a playlist result whose entries are url results for the unique
    values obtained by applying ``getter`` to each of ``matches``."""
    entry_kwargs = video_kwargs or {}
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(m, ie, **entry_kwargs) for m in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
46b18f23 1171
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
    """Returns a playlist"""
    result = dict(kwargs)
    # id and title are only set when truthy; description whenever it is not None
    if playlist_id:
        result['id'] = playlist_id
    if playlist_title:
        result['title'] = playlist_title
    if playlist_description is not None:
        result['description'] = playlist_description
    result['_type'] = 'multi_video' if multi_video else 'playlist'
    result['entries'] = entries
    return result
d6983cb4 1186
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    ``group`` selects what is returned on a match: None for the first
    non-None group, a list/tuple for a tuple of those groups, anything else
    for that single group. A None ``string`` or an empty list of patterns
    counts as "no match".
    """
    # Start from "no match" so that an empty pattern list cannot leave
    # mobj unbound
    mobj = None
    if string is not None:
        patterns = [pattern] if isinstance(pattern, (str, re.Pattern)) else pattern
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)

    if default is not NO_DEFAULT:
        return default

    # The styled field name is only needed for the failure messages
    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None
1221
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
    """Searches string for the JSON object specified by start_pattern"""
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        # An explicit default makes every failure non-fatal
        fatal = False
    else:
        default = {}

    json_string = self._search_regex(
        rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}',
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
b7c47b74 1248
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    A falsy search result is returned unchanged.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return clean_html(res).strip() if res else res
1258
2118fdd1
RA
1259 def _get_netrc_login_info(self, netrc_machine=None):
1260 username = None
1261 password = None
1262 netrc_machine = netrc_machine or self._NETRC_MACHINE
1263
a06916d9 1264 if self.get_param('usenetrc', False):
2118fdd1 1265 try:
0001fcb5 1266 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1267 if os.path.isdir(netrc_file):
1268 netrc_file = os.path.join(netrc_file, '.netrc')
1269 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1270 if info is not None:
1271 username = info[0]
1272 password = info[2]
1273 else:
dcce092e
S
1274 raise netrc.NetrcParseError(
1275 'No authenticators for %s' % netrc_machine)
86e5f3ed 1276 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1277 self.report_warning(
dcce092e 1278 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1279
dcce092e 1280 return username, password
2118fdd1 1281
1b6712ab 1282 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1283 """
cf0649f8 1284 Get the login info as (username, password)
32443dd3
S
1285 First look for the manually specified credentials using username_option
1286 and password_option as keys in params dictionary. If no such credentials
1287 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1288 value.
fc79158d
JMF
1289 If there's no info available, return (None, None)
1290 """
fc79158d
JMF
1291
1292 # Attempt to use provided username and password or .netrc data
a06916d9 1293 username = self.get_param(username_option)
1294 if username is not None:
1295 password = self.get_param(password_option)
2118fdd1 1296 else:
1b6712ab 1297 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1298
2133565c 1299 return username, password
fc79158d 1300
e64b7569 1301 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1302 """
1303 Get the two-factor authentication info
1304 TODO - asking the user will be required for sms/phone verify
1305 currently just uses the command line option
1306 If there's no info available, return None
1307 """
83317f69 1308
a06916d9 1309 tfa = self.get_param('twofactor')
1310 if tfa is not None:
1311 return tfa
83317f69 1312
ac668111 1313 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1314
46720279
JMF
1315 # Helper functions for extracting OpenGraph info
1316 @staticmethod
ab2d5247 1317 def _og_regexes(prop):
448ef1f3 1318 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
fbfde1c3
F
1319 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1320 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1321 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1322 return [
78fb87b2
JMF
1323 template % (property_re, content_re),
1324 template % (content_re, property_re),
ab2d5247 1325 ]
46720279 1326
864f24bd
S
@staticmethod
def _meta_regex(prop):
    # Verbose, case-insensitive, dot-all regex for a <meta> tag whose
    # itemprop/name/property/id/http-equiv attribute equals the escaped
    # ``prop``; the value of its content attribute is captured in the
    # named group 'content'.
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1332
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search ``html`` for the OpenGraph property (or any of the properties)
    ``prop`` and return its HTML-unescaped value, or None when the search
    yields None."""
    prop = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % prop[0]
    og_regexes = list(itertools.chain.from_iterable(map(self._og_regexes, prop)))
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
46720279
JMF
1344
def _og_search_thumbnail(self, html, **kargs):
    """Non-fatal search for the OpenGraph 'image' property (the thumbnail URL)."""
    return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1347
def _og_search_description(self, html, **kargs):
    """Non-fatal search for the OpenGraph 'description' property."""
    return self._og_search_property('description', html, fatal=False, **kargs)
1350
def _og_search_title(self, html, *, fatal=False, **kargs):
    """Search for the OpenGraph 'title' property (non-fatal unless ``fatal`` is set)."""
    return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1353
8ffa13e0 1354 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1355 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1356 if secure:
1357 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1358 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1359
78338f71
JMF
def _og_search_url(self, html, **kargs):
    """Search for the OpenGraph 'url' property."""
    return self._og_search_property('url', html, **kargs)
1362
def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
    """Return the cleaned text of the <title> element of ``html`` (non-fatal by default)."""
    return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1365
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Return the content of the first <meta> tag matching ``name`` (a single
    name or several); ``display_name`` defaults to the first name."""
    names = variadic(name)
    if display_name is None:
        display_name = names[0]
    patterns = [self._meta_regex(n) for n in names]
    return self._html_search_regex(
        patterns, html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1373
def _dc_search_uploader(self, html):
    """Return the uploader taken from the 'dc.creator' <meta> tag."""
    return self._html_search_meta('dc.creator', html, 'uploader')
1376
8f97a15d 1377 @staticmethod
1378 def _rta_search(html):
8dbe9899
PH
1379 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1380 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1381 r' content="RTA-5042-1996-1400-1577-RTA"',
1382 html):
1383 return 18
8f97a15d 1384
1385 # And then there are the jokers who advertise that they use RTA, but actually don't.
1386 AGE_LIMIT_MARKERS = [
1387 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1388 ]
1389 if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1390 return 18
8dbe9899
PH
1391 return 0
1392
59040888
PH
1393 def _media_rating_search(self, html):
1394 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1395 rating = self._html_search_meta('rating', html)
1396
1397 if not rating:
1398 return None
1399
1400 RATING_TABLE = {
1401 'safe for kids': 0,
1402 'general': 8,
1403 '14 years': 14,
1404 'mature': 17,
1405 'restricted': 19,
1406 }
d800609c 1407 return RATING_TABLE.get(rating.lower())
59040888 1408
69319969 1409 def _family_friendly_search(self, html):
6ca7732d 1410 # See http://schema.org/VideoObject
ac8491fc
S
1411 family_friendly = self._html_search_meta(
1412 'isFamilyFriendly', html, default=None)
69319969
NJ
1413
1414 if not family_friendly:
1415 return None
1416
1417 RATING_TABLE = {
1418 '1': 0,
1419 'true': 0,
1420 '0': 18,
1421 'false': 18,
1422 }
d800609c 1423 return RATING_TABLE.get(family_friendly.lower())
69319969 1424
0c708f11
JMF
1425 def _twitter_search_player(self, html):
1426 return self._html_search_meta('twitter:player', html,
9e1a5b84 1427 'twitter card player')
0c708f11 1428
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html

    Every JSON_LD_RE match in *html* is parsed with _parse_json; only dict
    items are yielded. Supplying *default* turns parsing non-fatal.
    """
    if default is not NO_DEFAULT:
        fatal = False
    for match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(match.group('json_ld'), video_id, fatal=fatal)
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1438
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html

    Returns the info dict built by _json_ld when it is non-empty. Otherwise
    returns *default* if one was given, raises RegexNotFoundError if *fatal*,
    or emits a warning and returns an empty dict.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    candidates = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(candidates, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
4ca2a3cf 1455
95b31e26 1456 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
14f25df2 1457 if isinstance(json_ld, str):
4ca2a3cf
S
1458 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1459 if not json_ld:
1460 return {}
1461 info = {}
46933a15
S
1462 if not isinstance(json_ld, (list, tuple, dict)):
1463 return info
1464 if isinstance(json_ld, dict):
1465 json_ld = [json_ld]
bae14048 1466
e7e4a6e0
S
1467 INTERACTION_TYPE_MAP = {
1468 'CommentAction': 'comment',
1469 'AgreeAction': 'like',
1470 'DisagreeAction': 'dislike',
1471 'LikeAction': 'like',
1472 'DislikeAction': 'dislike',
1473 'ListenAction': 'view',
1474 'WatchAction': 'view',
1475 'ViewAction': 'view',
1476 }
1477
f3c0c773 1478 def is_type(e, *expected_types):
1479 type = variadic(traverse_obj(e, '@type'))
1480 return any(x in type for x in expected_types)
1481
29f7c58a 1482 def extract_interaction_type(e):
1483 interaction_type = e.get('interactionType')
1484 if isinstance(interaction_type, dict):
1485 interaction_type = interaction_type.get('@type')
1486 return str_or_none(interaction_type)
1487
e7e4a6e0
S
1488 def extract_interaction_statistic(e):
1489 interaction_statistic = e.get('interactionStatistic')
29f7c58a 1490 if isinstance(interaction_statistic, dict):
1491 interaction_statistic = [interaction_statistic]
e7e4a6e0
S
1492 if not isinstance(interaction_statistic, list):
1493 return
1494 for is_e in interaction_statistic:
f3c0c773 1495 if not is_type(is_e, 'InteractionCounter'):
e7e4a6e0 1496 continue
29f7c58a 1497 interaction_type = extract_interaction_type(is_e)
1498 if not interaction_type:
e7e4a6e0 1499 continue
ce5b9040
S
1500 # For interaction count some sites provide string instead of
1501 # an integer (as per spec) with non digit characters (e.g. ",")
1502 # so extracting count with more relaxed str_to_int
1503 interaction_count = str_to_int(is_e.get('userInteractionCount'))
e7e4a6e0
S
1504 if interaction_count is None:
1505 continue
1506 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1507 if not count_kind:
1508 continue
1509 count_key = '%s_count' % count_kind
1510 if info.get(count_key) is not None:
1511 continue
1512 info[count_key] = interaction_count
1513
f5225737 1514 def extract_chapter_information(e):
1515 chapters = [{
1516 'title': part.get('name'),
1517 'start_time': part.get('startOffset'),
1518 'end_time': part.get('endOffset'),
85553414 1519 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
f5225737 1520 for idx, (last_c, current_c, next_c) in enumerate(zip(
1521 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1522 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1523 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1524 if None in current_c.values():
1525 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1526 return
1527 if chapters:
1528 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1529 info['chapters'] = chapters
1530
bae14048 1531 def extract_video_object(e):
f3c0c773 1532 assert is_type(e, 'VideoObject')
f7ad7160 1533 author = e.get('author')
bae14048 1534 info.update({
0c36dc00 1535 'url': url_or_none(e.get('contentUrl')),
bae14048
S
1536 'title': unescapeHTML(e.get('name')),
1537 'description': unescapeHTML(e.get('description')),
eb2333bc 1538 'thumbnails': [{'url': unescapeHTML(url)}
21633673 1539 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1540 if url_or_none(url)],
bae14048
S
1541 'duration': parse_duration(e.get('duration')),
1542 'timestamp': unified_timestamp(e.get('uploadDate')),
f7ad7160 1543 # author can be an instance of 'Organization' or 'Person' types.
1544 # both types can have 'name' property(inherited from 'Thing' type). [1]
1545 # however some websites are using 'Text' type instead.
1546 # 1. https://schema.org/VideoObject
14f25df2 1547 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
56ba69e4 1548 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
bae14048
S
1549 'tbr': int_or_none(e.get('bitrate')),
1550 'width': int_or_none(e.get('width')),
1551 'height': int_or_none(e.get('height')),
33a81c2c 1552 'view_count': int_or_none(e.get('interactionCount')),
bae14048 1553 })
e7e4a6e0 1554 extract_interaction_statistic(e)
f5225737 1555 extract_chapter_information(e)
bae14048 1556
d5c32548
ZM
1557 def traverse_json_ld(json_ld, at_top_level=True):
1558 for e in json_ld:
1559 if at_top_level and '@context' not in e:
1560 continue
1561 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1562 traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
1563 break
f3c0c773 1564 if expected_type is not None and not is_type(e, expected_type):
4433bb02 1565 continue
8f122fa0 1566 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1567 if rating is not None:
1568 info['average_rating'] = rating
f3c0c773 1569 if is_type(e, 'TVEpisode', 'Episode'):
440863ad 1570 episode_name = unescapeHTML(e.get('name'))
46933a15 1571 info.update({
440863ad 1572 'episode': episode_name,
46933a15
S
1573 'episode_number': int_or_none(e.get('episodeNumber')),
1574 'description': unescapeHTML(e.get('description')),
1575 })
440863ad
S
1576 if not info.get('title') and episode_name:
1577 info['title'] = episode_name
46933a15 1578 part_of_season = e.get('partOfSeason')
f3c0c773 1579 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
458fd30f
S
1580 info.update({
1581 'season': unescapeHTML(part_of_season.get('name')),
1582 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1583 })
d16b3c66 1584 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
f3c0c773 1585 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1586 info['series'] = unescapeHTML(part_of_series.get('name'))
f3c0c773 1587 elif is_type(e, 'Movie'):
391256dc
S
1588 info.update({
1589 'title': unescapeHTML(e.get('name')),
1590 'description': unescapeHTML(e.get('description')),
1591 'duration': parse_duration(e.get('duration')),
1592 'timestamp': unified_timestamp(e.get('dateCreated')),
1593 })
f3c0c773 1594 elif is_type(e, 'Article', 'NewsArticle'):
46933a15
S
1595 info.update({
1596 'timestamp': parse_iso8601(e.get('datePublished')),
1597 'title': unescapeHTML(e.get('headline')),
d5c32548 1598 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
46933a15 1599 })
f3c0c773 1600 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
2edb38e8 1601 extract_video_object(e['video'][0])
f3c0c773 1602 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
e50c3500 1603 extract_video_object(e['subjectOf'][0])
f3c0c773 1604 elif is_type(e, 'VideoObject'):
bae14048 1605 extract_video_object(e)
4433bb02
S
1606 if expected_type is None:
1607 continue
1608 else:
1609 break
c69701c6 1610 video = e.get('video')
f3c0c773 1611 if is_type(video, 'VideoObject'):
c69701c6 1612 extract_video_object(video)
4433bb02
S
1613 if expected_type is None:
1614 continue
1615 else:
1616 break
d5c32548
ZM
1617 traverse_json_ld(json_ld)
1618
90137ca4 1619 return filter_dict(info)
4ca2a3cf 1620
135dfa2c 1621 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1622 return self._parse_json(
1623 self._search_regex(
1624 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1625 webpage, 'next.js data', fatal=fatal, **kw),
1626 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1627
8072ef2b 1628 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1629 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
66f4c04e 1630 rectx = re.escape(context_name)
8072ef2b 1631 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
66f4c04e 1632 js, arg_keys, arg_vals = self._search_regex(
8072ef2b 1633 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1634 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
66f4c04e
THD
1635
1636 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1637
1638 for key, val in args.items():
1639 if val in ('undefined', 'void 0'):
1640 args[key] = 'null'
1641
8072ef2b 1642 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1643 return traverse_obj(ret, traverse) or {}
66f4c04e 1644
@staticmethod
def _hidden_inputs(html):
    """Collect the name/value pairs of hidden and submit <input> tags in *html*.

    HTML comments are removed first. Each input is keyed by its ``name``
    (falling back to ``id``); inputs without a key or without a ``value``
    attribute are ignored.

    @returns  dict mapping input name (or id) to its value
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        # Guard on the parsed attributes; the matched tag text is never empty
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
27713812 1660
cf61d96d
S
1661 def _form_hidden_inputs(self, form_id, html):
1662 form = self._search_regex(
73eb13df 1663 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1664 html, '%s form' % form_id, group='form')
1665 return self._hidden_inputs(form)
1666
class FormatSort:
    """Compute sort keys for format dicts from a list of sort-field specifications.

    A specification matches `regex`: an optional ``+`` (reverse), a field name,
    and an optional ``:limit`` or ``~limit`` (``~`` selects the value closest to
    the limit). calculate_preference() turns one format dict into a tuple with
    one entry per field in the resolved order, suitable as a ``list.sort`` key.

    Note that `settings` is a class-level dict which _get_field_setting,
    _resolve_field_value and evaluate_params modify in place.
    """
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
               'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
        'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # For compatibility with youtube-dl
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ie, field_preference):
        """Resolve the sort order from the downloader params of *ie* and *field_preference*.

        @param ie                The extractor; its ``_downloader`` supplies params and logging
        @param field_preference  Sort specifications given by the extractor
        """
        self._order = []
        self.ydl = ie._downloader
        self.evaluate_params(self.ydl.params, field_preference)
        if ie.get_param('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting *key* of *field*, computing and caching a default when unset.

        Unknown fields are registered with empty settings (after a deprecation
        warning), except when only asking for 'forced'/'priority', which is
        simply False for them.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecation_warning(
                f'Using arbitrary fields ({field}) for format sorting is deprecated '
                'and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            field_type = propObj.get('type')
            if key == 'field':
                default = 'preference' if field_type == 'extractor' else (field,) if field_type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if field_type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert *value* (a limit text or format value) according to the field's 'convert' setting.

        None is returned unchanged unless *convertNone* is set; any other value
        is lower-cased before conversion.
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return FileDownloader.parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Position used for values that are not listed: that of '' if present
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric value makes the whole field sort as strings from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order from forced/priority fields, user and extractor specs and the defaults.

        @param params          Downloader params; reads 'prefer_free_formats',
                               'format_sort' and 'format_sort_force'
        @param sort_extractor  Sort specifications given by the extractor
        @raises ExtractorError if a specification does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            # The first occurrence of a field decides its position and options
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecation_warning(
                        f'Format sorting alias {alias} is deprecated '
                        f'and may be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's, the extractor's and the final resolved sort order through *write_debug*."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn the raw *value* of *field* into a comparable tuple, honouring reverse/closest/limit."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the value of *field* from *format* and convert it to a comparable tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key of *format*: one tuple per field in the resolved order.

        Missing 'protocol', 'ext', 'video_ext'/'audio_ext' and bitrate entries
        are filled into *format* in place before the key is computed.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        # Determine missing bitrates
        if format.get('tbr') is None:
            if format.get('vbr') is not None and format.get('abr') is not None:
                format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
        else:
            # A key that is present but None must count as 0 here;
            # dict.get(key, 0) would hand back the None and break the subtraction
            if format.get('vcodec') != 'none' and format.get('vbr') is None:
                format['vbr'] = format.get('tbr') - (format.get('abr') or 0)
            if format.get('acodec') != 'none' and format.get('abr') is None:
                format['abr'] = format.get('tbr') - (format.get('vbr') or 0)

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
1951
1952 def _sort_formats(self, formats, field_preference=[]):
1953 if not formats:
88acdbc2 1954 return
1d485a1a 1955 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
59040888 1956
96a53167
S
1957 def _check_formats(self, formats, video_id):
1958 if formats:
1959 formats[:] = filter(
1960 lambda f: self._is_valid_url(
1961 f['url'], video_id,
1962 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1963 formats)
1964
f5bdb444
S
1965 @staticmethod
1966 def _remove_duplicate_formats(formats):
1967 format_urls = set()
1968 unique_formats = []
1969 for f in formats:
1970 if f['url'] not in format_urls:
1971 format_urls.add(f['url'])
1972 unique_formats.append(f)
1973 formats[:] = unique_formats
1974
45024183 1975 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1976 url = self._proto_relative_url(url, scheme='http:')
1977 # For now assume non HTTP(S) URLs always valid
1978 if not (url.startswith('http://') or url.startswith('https://')):
1979 return True
96a53167 1980 try:
45024183 1981 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1982 return True
8bdd16b4 1983 except ExtractorError as e:
25e911a9 1984 self.to_screen(
8bdd16b4 1985 '%s: %s URL is invalid, skipping: %s'
1986 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1987 return False
96a53167 1988
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1995
def _proto_relative_url(self, url, scheme=None):
    """Pass *url* through sanitize_url with *scheme* (default: http_scheme()).

    *scheme* must end with a colon, e.g. 'https:'; the colon is stripped
    before it is handed to sanitize_url.
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    bare_scheme = scheme[:-1]
    return sanitize_url(url, scheme=bare_scheme)
57c7411f 2000
4094b6e3
PH
2001 def _sleep(self, timeout, video_id, msg_template=None):
2002 if msg_template is None:
f1a9d64e 2003 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
2004 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2005 self.to_screen(msg)
2006 time.sleep(timeout)
2007
f983b875 2008 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 2009 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 2010 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
a076c1f9 2011 res = self._download_xml_handle(
f036a632 2012 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
2013 'Unable to download f4m manifest',
2014 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 2015 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 2016 transform_source=transform_source,
7360c06f 2017 fatal=fatal, data=data, headers=headers, query=query)
a076c1f9 2018 if res is False:
8d29e47f 2019 return []
31bb8d3f 2020
a076c1f9
E
2021 manifest, urlh = res
2022 manifest_url = urlh.geturl()
2023
0fdbb332 2024 return self._parse_f4m_formats(
f983b875 2025 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 2026 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 2027
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Turn a parsed f4m manifest into a list of format dicts.

    @param manifest          The manifest's root XML element
    @param manifest_url      URL the manifest was loaded from; relative media
                             URLs are resolved against it
    @param video_id          Identifier used for nested manifest downloads
    @param preference        Stored as 'preference' on each format
    @param quality           Stored as 'quality' on each format
    @param f4m_id            Prefix for the generated format ids
    @param transform_source  Passed on when a nested f4m manifest is fetched
    @param fatal             Passed on to nested manifest extraction
    @param m3u8_id           Passed on when a media entry points to an m3u8 manifest
    @returns                 List of format dicts (possibly empty)
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # An element without text has text=None, which cannot be searched
    if akamai_pv is not None and akamai_pv.text and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL of this particular media entry. It is kept separate from
        # manifest_url so that every relative media URL is resolved against
        # the manifest itself rather than against the previous entry's URL
        media_manifest_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources.  See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            media_manifest_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(media_manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    media_manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_manifest_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': media_manifest_url,
            'manifest_url': media_manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
2129
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """
    Build the "meta" format dict that represents an m3u8 URL as a whole.

    m3u8_url, ext and quality are stored unchanged under 'url', 'ext' and
    'quality'. m3u8_id is combined with 'meta' (via join_nonempty) to form
    the format_id.

    Returns a format dict with protocol 'm3u8' and resolution 'multiple'.
    """
    # The meta entry is ranked 100 below the given preference;
    # a falsy preference (None or 0) results in -100
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100

    meta_format = {
        'format_id': join_nonempty(m3u8_id, 'meta'),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    return meta_format
2141
def _report_ignoring_subs(self, name):
    """
    Warn that subtitle tracks found in a manifest are being discarded.

    name is the manifest kind inserted into the message (callers in this
    file pass 'HLS', 'SMIL' or 'DASH'). The warning is issued with
    only_once=True.
    """
    # bug_reports_message() receives the text that precedes its own wording,
    # hence the deliberately unfinished sentence
    unfinished_sentence = (
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    warning = bug_reports_message(unfinished_sentence)
    self.report_warning(warning, only_once=True)
2147
a0c3b2d5
F
2148 def _extract_m3u8_formats(self, *args, **kwargs):
2149 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2150 if subs:
b5ae35ee 2151 self._report_ignoring_subs('HLS')
a0c3b2d5
F
2152 return fmts
2153
2154 def _extract_m3u8_formats_and_subtitles(
177877c5 2155 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2156 preference=None, quality=None, m3u8_id=None, note=None,
2157 errnote=None, fatal=True, live=False, data=None, headers={},
2158 query={}):
2159
dbd82a1d 2160 res = self._download_webpage_handle(
81515ad9 2161 m3u8_url, video_id,
37a3bb66 2162 note='Downloading m3u8 information' if note is None else note,
2163 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 2164 fatal=fatal, data=data, headers=headers, query=query)
cb252080 2165
dbd82a1d 2166 if res is False:
a0c3b2d5 2167 return [], {}
cb252080 2168
dbd82a1d 2169 m3u8_doc, urlh = res
37113045 2170 m3u8_url = urlh.geturl()
9cdffeeb 2171
a0c3b2d5 2172 return self._parse_m3u8_formats_and_subtitles(
cb252080 2173 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 2174 preference=preference, quality=quality, m3u8_id=m3u8_id,
2175 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2176 headers=headers, query=query, video_id=video_id)
cb252080 2177
def _parse_m3u8_formats_and_subtitles(
        self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, live=False, note=None,
        errnote=None, fatal=True, data=None, headers={}, query={},
        video_id=None):
    """
    Parse the text of an HLS playlist into formats and subtitles.

    m3u8_doc is the playlist text. m3u8_url is the URL it came from; it is
    used to resolve relative URIs and stored as 'manifest_url'. When it is
    None, a media playlist is embedded as a data URI instead.

    A playlist containing #EXT-X-TARGETDURATION is treated as a media
    playlist and returned as is (one format per discontinuity section when
    the 'hls_split_discontinuity' param is set). Otherwise it is treated as
    a master playlist: EXT-X-MEDIA renditions and EXT-X-STREAM-INF variants
    each become formats, and SUBTITLES renditions become subtitles.

    fatal, data, headers and video_id are only used when variant playlists
    have to be downloaded to count discontinuities. note, errnote and query
    are accepted but not used by this function.

    Returns a (formats, subtitles) tuple.
    """
    formats, subtitles = [], {}

    # Stored as a plain boolean so that no re.Match object ends up in the format dicts
    has_drm = bool(re.search('|'.join([
        r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
        r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
    ]), m3u8_doc))

    def format_url(url):
        return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

    if self.get_param('hls_split_discontinuity', False):
        def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
            # One index per section delimited by #EXT-X-DISCONTINUITY;
            # the playlist is downloaded first if its text was not supplied
            if not m3u8_doc:
                if not manifest_url:
                    return []
                m3u8_doc = self._download_webpage(
                    manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                    note=False, errnote='Failed to download m3u8 playlist information')
                if m3u8_doc is False:
                    return []
            return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

    else:
        def _extract_m3u8_playlist_indices(*args, **kwargs):
            # No splitting requested: a single format without an index
            return [None]

    # References:
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
    # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
    # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

    # We should try extracting formats only from master playlists [1, 4.3.4],
    # i.e. playlists that describe available qualities. On the other hand
    # media playlists [1, 4.3.3] should be returned as is since they contain
    # just the media without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
    # master playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
    # media playlist and MUST NOT appear in master playlist thus we can
    # clearly detect media playlist with this criterion.

    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        formats = [{
            'format_id': join_nonempty(m3u8_id, idx),
            'format_index': idx,
            'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
            'quality': quality,
            'has_drm': has_drm,
        } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

        return formats, subtitles

    groups = {}
    last_stream_inf = {}
    playlist_lines = m3u8_doc.splitlines()

    def extract_media(x_media_line):
        media = parse_m3u8_attributes(x_media_line)
        # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
        media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
        if not (media_type and group_id and name):
            return
        groups.setdefault(group_id, []).append(media)
        # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
        if media_type == 'SUBTITLES':
            # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
            # EXT-X-MEDIA tag if the media type is SUBTITLES.
            # However, lack of URI has been spotted in the wild.
            # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
            if not media.get('URI'):
                return
            url = format_url(media['URI'])
            sub_info = {
                'url': url,
                'ext': determine_ext(url),
            }
            if sub_info['ext'] == 'm3u8':
                # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                # files may contain is WebVTT:
                # <https://tools.ietf.org/html/rfc8216#section-3.1>
                sub_info['ext'] = 'vtt'
                sub_info['protocol'] = 'm3u8_native'
            lang = media.get('LANGUAGE') or 'und'
            subtitles.setdefault(lang, []).append(sub_info)
        if media_type not in ('VIDEO', 'AUDIO'):
            return
        media_url = media.get('URI')
        if media_url:
            manifest_url = format_url(media_url)
            formats.extend({
                'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                'format_note': name,
                'format_index': idx,
                'url': manifest_url,
                'manifest_url': m3u8_url,
                'language': media.get('LANGUAGE'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'vcodec': 'none' if media_type == 'AUDIO' else None,
            } for idx in _extract_m3u8_playlist_indices(manifest_url))

    def build_stream_name():
        # Despite specification does not mention NAME attribute for
        # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
        # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
        # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
        stream_name = last_stream_inf.get('NAME')
        if stream_name:
            return stream_name
        # If there is no NAME in EXT-X-STREAM-INF it will be obtained
        # from corresponding rendition group
        stream_group_id = last_stream_inf.get('VIDEO')
        if not stream_group_id:
            return
        stream_group = groups.get(stream_group_id)
        if not stream_group:
            return stream_group_id
        rendition = stream_group[0]
        return rendition.get('NAME') or stream_group_id

    # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
    # chance to detect video only formats when EXT-X-STREAM-INF tags
    # precede EXT-X-MEDIA tags in HLS manifest such as [3].
    for line in playlist_lines:
        if line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)

    for line in playlist_lines:
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_stream_inf = parse_m3u8_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # A non-tag, non-blank line is the URI of the variant described
            # by the preceding EXT-X-STREAM-INF tag (if any)
            tbr = float_or_none(
                last_stream_inf.get('AVERAGE-BANDWIDTH')
                or last_stream_inf.get('BANDWIDTH'), scale=1000)
            manifest_url = format_url(line.strip())

            for idx in _extract_m3u8_playlist_indices(manifest_url):
                format_id = [m3u8_id, None, idx]
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    stream_name = build_stream_name()
                    format_id[1] = stream_name or '%d' % (tbr or len(formats))
                f = {
                    'format_id': join_nonempty(*format_id),
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                if not f.get('ext'):
                    f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    # Same metadata as the HLS variant, but pointing at the
                    # progressive download URL
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

            # The stream info applies only to the URI line that follows it
            last_stream_inf = {}
    return formats, subtitles
704df56d 2398
3cf4b91d
C
2399 def _extract_m3u8_vod_duration(
2400 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2401
2402 m3u8_vod = self._download_webpage(
2403 m3u8_vod_url, video_id,
2404 note='Downloading m3u8 VOD manifest' if note is None else note,
2405 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2406 fatal=False, data=data, headers=headers, query=query)
2407
2408 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2409
2410 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2411 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2412 return None
2413
2414 return int(sum(
2415 float(line[len('#EXTINF:'):].split(',')[0])
2416 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2417
a107193e
S
2418 @staticmethod
2419 def _xpath_ns(path, namespace=None):
2420 if not namespace:
2421 return path
2422 out = []
2423 for c in path.split('/'):
2424 if not c or c == '.':
2425 out.append(c)
2426 else:
2427 out.append('{%s}%s' % (namespace, c))
2428 return '/'.join(out)
2429
da1c94ee 2430 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
a076c1f9
E
2431 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2432 if res is False:
995029a1 2433 assert not fatal
774a46c5 2434 return [], {}
e89a2aab 2435
a076c1f9
E
2436 smil, urlh = res
2437 smil_url = urlh.geturl()
2438
17712eeb 2439 namespace = self._parse_smil_namespace(smil)
a107193e 2440
da1c94ee 2441 fmts = self._parse_smil_formats(
a107193e 2442 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2443 subs = self._parse_smil_subtitles(
2444 smil, namespace=namespace)
2445
2446 return fmts, subs
2447
2448 def _extract_smil_formats(self, *args, **kwargs):
2449 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2450 if subs:
b5ae35ee 2451 self._report_ignoring_subs('SMIL')
da1c94ee 2452 return fmts
a107193e
S
2453
2454 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
a076c1f9
E
2455 res = self._download_smil(smil_url, video_id, fatal=fatal)
2456 if res is False:
a107193e 2457 return {}
a076c1f9
E
2458
2459 smil, urlh = res
2460 smil_url = urlh.geturl()
2461
a107193e
S
2462 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2463
09f572fb 2464 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a076c1f9 2465 return self._download_xml_handle(
a107193e 2466 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2467 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2468
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Turn a parsed SMIL document into an info dict.

    Formats and subtitles are parsed with the video_id given by the caller.
    The returned 'id' is not that video_id but the basename of smil_url
    without its extension. Title, description and upload date are taken
    from the first matching <meta> elements under ./head, and thumbnails
    from <image> elements that have a src.

    Returns a dict with the keys id, title, description, upload_date,
    thumbnails, formats and subtitles.
    """
    smil_ns = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=smil_ns, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=smil_ns)

    # From here on the id is derived from the SMIL URL
    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', smil_ns)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        # The first usable value of each kind wins
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', smil_ns)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2508
17712eeb
S
2509 def _parse_smil_namespace(self, smil):
2510 return self._search_regex(
2511 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2512
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Build formats from the <video>, <audio> and <imagestream> elements of a SMIL document.

    The base URL is smil_url unless a <meta> under ./head carries a 'base'
    or 'httpBase' attribute. Each distinct src is handled once:
      - RTMP sources (proto 'rtmp' or an rtmp streamer) become RTMP formats;
        transform_rtmp_url, if given, is called with (streamer, play_path)
        and must return the pair to use instead.
      - m3u8, f4m, mpd and ism sources are expanded by the corresponding
        manifest extractors (non-fatally).
      - Remaining HTTP sources become direct formats if the resolved URL
        passes _is_valid_url.
    f4m_params is the query appended to f4m URLs; a default is used when
    it is falsy.

    Returns the list of formats.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    # Every src is processed only once, across all element kinds
    srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            if len(m3u8_formats) == 1:
                # A single format carries no quality info of its own,
                # so take it from the SMIL element
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Validate the resolved URL that is actually emitted below,
        # not the raw (possibly relative or unstripped) src attribute
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats
2626
ce00af87 2627 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2628 urls = []
a107193e
S
2629 subtitles = {}
2630 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2631 src = textstream.get('src')
d413095f 2632 if not src or src in urls:
a107193e 2633 continue
d413095f 2634 urls.append(src)
df634be2 2635 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2636 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2637 subtitles.setdefault(lang, []).append({
2638 'url': src,
2639 'ext': ext,
2640 })
2641 return subtitles
63757032 2642
47a5cb77 2643 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
a076c1f9 2644 res = self._download_xml_handle(
47a5cb77 2645 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5 2646 'Unable to download xspf manifest', fatal=fatal)
a076c1f9 2647 if res is False:
942acef5 2648 return []
a076c1f9
E
2649
2650 xspf, urlh = res
2651 xspf_url = urlh.geturl()
2652
47a5cb77
S
2653 return self._parse_xspf(
2654 xspf, playlist_id, xspf_url=xspf_url,
2655 xspf_base_url=base_url(xspf_url))
8d6765cf 2656
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """
    Convert a parsed XSPF document into a list of entry dicts, one per track.

    Every entry uses playlist_id as its 'id'; the title also defaults to
    playlist_id. Track locations are joined with xspf_base_url, and
    locations that yield no URL are skipped. xspf_url is stored as the
    'manifest_url' of each format.

    Returns the list of entries.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the 'xspf:' / 's1:' prefixes used in the paths below
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration_text = xpath_text(track, ns('./xspf:duration'), 'duration')
        duration = float_or_none(duration_text, 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })
        self._sort_formats(formats)

        entry = {
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
        entries.append(entry)
    return entries
2697
171e59ed
F
2698 def _extract_mpd_formats(self, *args, **kwargs):
2699 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2700 if subs:
b5ae35ee 2701 self._report_ignoring_subs('DASH')
171e59ed
F
2702 return fmts
2703
2704 def _extract_mpd_formats_and_subtitles(
2705 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2706 fatal=True, data=None, headers={}, query={}):
47a5cb77 2707 res = self._download_xml_handle(
1bac3455 2708 mpd_url, video_id,
37a3bb66 2709 note='Downloading MPD manifest' if note is None else note,
2710 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2711 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2712 if res is False:
171e59ed 2713 return [], {}
47a5cb77 2714 mpd_doc, urlh = res
c25720ef 2715 if mpd_doc is None:
171e59ed 2716 return [], {}
779da8e3
E
2717
2718 # We could have been redirected to a new url when we retrieved our mpd file.
2719 mpd_url = urlh.geturl()
2720 mpd_base_url = base_url(mpd_url)
1bac3455 2721
171e59ed 2722 return self._parse_mpd_formats_and_subtitles(
545cc85d 2723 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2724
171e59ed
F
2725 def _parse_mpd_formats(self, *args, **kwargs):
2726 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2727 if subs:
b5ae35ee 2728 self._report_ignoring_subs('DASH')
171e59ed
F
2729 return fmts
2730
2731 def _parse_mpd_formats_and_subtitles(
2732 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2733 """
2734 Parse formats from MPD manifest.
2735 References:
2736 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2737 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2738 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2739 """
a06916d9 2740 if not self.get_param('dynamic_mpd', True):
78895bd3 2741 if mpd_doc.get('type') == 'dynamic':
171e59ed 2742 return [], {}
2d2fa82d 2743
91cb6b50 2744 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2745
2746 def _add_ns(path):
2747 return self._xpath_ns(path, namespace)
2748
675d0016 2749 def is_drm_protected(element):
2750 return element.find(_add_ns('ContentProtection')) is not None
2751
1bac3455 2752 def extract_multisegment_info(element, ms_parent_info):
2753 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2754
2755 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2756 # common attributes and elements. We will only extract relevant
2757 # for us.
2758 def extract_common(source):
2759 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2760 if segment_timeline is not None:
2761 s_e = segment_timeline.findall(_add_ns('S'))
2762 if s_e:
2763 ms_info['total_number'] = 0
2764 ms_info['s'] = []
2765 for s in s_e:
2766 r = int(s.get('r', 0))
2767 ms_info['total_number'] += 1 + r
2768 ms_info['s'].append({
2769 't': int(s.get('t', 0)),
2770 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2771 'd': int(s.attrib['d']),
2772 'r': r,
2773 })
2774 start_number = source.get('startNumber')
2775 if start_number:
2776 ms_info['start_number'] = int(start_number)
2777 timescale = source.get('timescale')
2778 if timescale:
2779 ms_info['timescale'] = int(timescale)
2780 segment_duration = source.get('duration')
2781 if segment_duration:
48504785 2782 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2783
2784 def extract_Initialization(source):
2785 initialization = source.find(_add_ns('Initialization'))
2786 if initialization is not None:
2787 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2788
f14be228 2789 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2790 if segment_list is not None:
b4c1d6e8
S
2791 extract_common(segment_list)
2792 extract_Initialization(segment_list)
f14be228 2793 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2794 if segment_urls_e:
2795 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2796 else:
f14be228 2797 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2798 if segment_template is not None:
b4c1d6e8 2799 extract_common(segment_template)
e228616c
S
2800 media = segment_template.get('media')
2801 if media:
2802 ms_info['media'] = media
1bac3455 2803 initialization = segment_template.get('initialization')
2804 if initialization:
e228616c 2805 ms_info['initialization'] = initialization
1bac3455 2806 else:
b4c1d6e8 2807 extract_Initialization(segment_template)
1bac3455 2808 return ms_info
b323e170 2809
1bac3455 2810 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
6251555f 2811 formats, subtitles = [], {}
234416e4 2812 stream_numbers = collections.defaultdict(int)
f14be228 2813 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2814 period_duration = parse_duration(period.get('duration')) or mpd_duration
2815 period_ms_info = extract_multisegment_info(period, {
2816 'start_number': 1,
2817 'timescale': 1,
2818 })
f14be228 2819 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2820 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2821 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2822 representation_attrib = adaptation_set.attrib.copy()
2823 representation_attrib.update(representation.attrib)
f0948348 2824 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2825 mime_type = representation_attrib['mimeType']
171e59ed
F
2826 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2827
21633673 2828 codec_str = representation_attrib.get('codecs', '')
2829 # Some kind of binary subtitle found in some youtube livestreams
2830 if mime_type == 'application/x-rawcc':
2831 codecs = {'scodec': codec_str}
2832 else:
2833 codecs = parse_codecs(codec_str)
be2fc5b2 2834 if content_type not in ('video', 'audio', 'text'):
2835 if mime_type == 'image/jpeg':
a8731fcc 2836 content_type = mime_type
21633673 2837 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2838 content_type = 'video'
21633673 2839 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2840 content_type = 'audio'
3fe75fdc 2841 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2842 content_type = 'text'
6993f78d 2843 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2844 content_type = 'text'
cdb19aa4 2845 else:
be2fc5b2 2846 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2847 continue
2848
2849 base_url = ''
2850 for element in (representation, adaptation_set, period, mpd_doc):
2851 base_url_e = element.find(_add_ns('BaseURL'))
47046464 2852 if try_call(lambda: base_url_e.text) is not None:
be2fc5b2 2853 base_url = base_url_e.text + base_url
2854 if re.match(r'^https?://', base_url):
2855 break
f9cc0161 2856 if mpd_base_url and base_url.startswith('/'):
14f25df2 2857 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
f9cc0161
D
2858 elif mpd_base_url and not re.match(r'^https?://', base_url):
2859 if not mpd_base_url.endswith('/'):
be2fc5b2 2860 mpd_base_url += '/'
2861 base_url = mpd_base_url + base_url
2862 representation_id = representation_attrib.get('id')
2863 lang = representation_attrib.get('lang')
2864 url_el = representation.find(_add_ns('BaseURL'))
2865 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2866 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2867 if representation_id is not None:
2868 format_id = representation_id
2869 else:
2870 format_id = content_type
2871 if mpd_id:
2872 format_id = mpd_id + '-' + format_id
2873 if content_type in ('video', 'audio'):
2874 f = {
2875 'format_id': format_id,
2876 'manifest_url': mpd_url,
2877 'ext': mimetype2ext(mime_type),
2878 'width': int_or_none(representation_attrib.get('width')),
2879 'height': int_or_none(representation_attrib.get('height')),
2880 'tbr': float_or_none(bandwidth, 1000),
2881 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2882 'fps': int_or_none(representation_attrib.get('frameRate')),
2883 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2884 'format_note': 'DASH %s' % content_type,
2885 'filesize': filesize,
2886 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2887 **codecs
be2fc5b2 2888 }
be2fc5b2 2889 elif content_type == 'text':
2890 f = {
2891 'ext': mimetype2ext(mime_type),
2892 'manifest_url': mpd_url,
2893 'filesize': filesize,
2894 }
2895 elif content_type == 'image/jpeg':
2896 # See test case in VikiIE
2897 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2898 f = {
2899 'format_id': format_id,
2900 'ext': 'mhtml',
2901 'manifest_url': mpd_url,
2902 'format_note': 'DASH storyboards (jpeg)',
2903 'acodec': 'none',
2904 'vcodec': 'none',
2905 }
88acdbc2 2906 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2907 f['has_drm'] = True
be2fc5b2 2908 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2909
def prepare_template(template_name, identifiers):
    """Convert a DASH ``$...$`` URL template into a ``%``-format string.

    The raw template is read from ``representation_ms_info[template_name]``
    and ``$RepresentationID$`` is substituted from ``representation_id``
    (both come from the enclosing scope).  Every name in *identifiers*
    (e.g. ``Number``) is rewritten from ``$Name$`` / ``$Name%05d$`` to
    ``%(Name)d`` / ``%(Name)05d``.  Returns the converted template.
    """
    tmpl = representation_ms_info[template_name]
    # First of, % characters outside $...$ templates
    # must be escaped by doubling for proper processing
    # by % operator string formatting used further (see
    # https://github.com/ytdl-org/youtube-dl/issues/16867).
    t = ''
    in_template = False
    for c in tmpl:
        t += c
        if c == '$':
            in_template = not in_template
        elif c == '%' and not in_template:
            t += c
    # Next, $...$ templates are translated to their
    # %(...) counterparts to be used with % operator
    if representation_id is not None:
        t = t.replace('$RepresentationID$', representation_id)
    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
    # str.replace() returns a new string: the result has to be kept,
    # otherwise the "$$" escape sequence is never collapsed to "$"
    return t.replace('$$', '$')
2932
2933 # @initialization is a regular template like @media one
2934 # so it should be handled just the same way (see
2935 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2936 if 'initialization' in representation_ms_info:
2937 initialization_template = prepare_template(
2938 'initialization',
2939 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2940 # $Time$ shall not be included for @initialization thus
2941 # only $Bandwidth$ remains
2942 ('Bandwidth', ))
2943 representation_ms_info['initialization_url'] = initialization_template % {
2944 'Bandwidth': bandwidth,
2945 }
2946
2947 def location_key(location):
2948 return 'url' if re.match(r'^https?://', location) else 'path'
2949
2950 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2951
2952 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2953 media_location_key = location_key(media_template)
2954
2955 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2956 # can't be used at the same time
2957 if '%(Number' in media_template and 's' not in representation_ms_info:
2958 segment_duration = None
2959 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2960 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2961 representation_ms_info['total_number'] = int(math.ceil(
2962 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2963 representation_ms_info['fragments'] = [{
2964 media_location_key: media_template % {
2965 'Number': segment_number,
2966 'Bandwidth': bandwidth,
2967 },
2968 'duration': segment_duration,
2969 } for segment_number in range(
2970 representation_ms_info['start_number'],
2971 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2972 else:
2973 # $Number*$ or $Time$ in media template with S list available
2974 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2975 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2976 representation_ms_info['fragments'] = []
2977 segment_time = 0
2978 segment_d = None
2979 segment_number = representation_ms_info['start_number']
2980
2981 def add_segment_url():
2982 segment_url = media_template % {
2983 'Time': segment_time,
2984 'Bandwidth': bandwidth,
2985 'Number': segment_number,
2986 }
2987 representation_ms_info['fragments'].append({
2988 media_location_key: segment_url,
2989 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2990 })
2991
2992 for num, s in enumerate(representation_ms_info['s']):
2993 segment_time = s.get('t') or segment_time
2994 segment_d = s['d']
2995 add_segment_url()
2996 segment_number += 1
2997 for r in range(s.get('r', 0)):
2998 segment_time += segment_d
f0948348 2999 add_segment_url()
b4c1d6e8 3000 segment_number += 1
be2fc5b2 3001 segment_time += segment_d
3002 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
3003 # No media template
3004 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
3005 # or any YouTube dashsegments video
3006 fragments = []
3007 segment_index = 0
3008 timescale = representation_ms_info['timescale']
3009 for s in representation_ms_info['s']:
3010 duration = float_or_none(s['d'], timescale)
3011 for r in range(s.get('r', 0) + 1):
3012 segment_uri = representation_ms_info['segment_urls'][segment_index]
3013 fragments.append({
3014 location_key(segment_uri): segment_uri,
3015 'duration': duration,
3016 })
3017 segment_index += 1
3018 representation_ms_info['fragments'] = fragments
3019 elif 'segment_urls' in representation_ms_info:
3020 # Segment URLs with no SegmentTimeline
3021 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3022 # https://github.com/ytdl-org/youtube-dl/pull/14844
3023 fragments = []
3024 segment_duration = float_or_none(
3025 representation_ms_info['segment_duration'],
3026 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3027 for segment_url in representation_ms_info['segment_urls']:
3028 fragment = {
3029 location_key(segment_url): segment_url,
3030 }
3031 if segment_duration:
3032 fragment['duration'] = segment_duration
3033 fragments.append(fragment)
3034 representation_ms_info['fragments'] = fragments
3035 # If there is a fragments key available then we correctly recognized fragmented media.
3036 # Otherwise we will assume unfragmented media with direct access. Technically, such
3037 # assumption is not necessarily correct since we may simply have no support for
3038 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3039 if 'fragments' in representation_ms_info:
3040 f.update({
3041 # NB: mpd_url may be empty when MPD manifest is parsed from a string
3042 'url': mpd_url or base_url,
3043 'fragment_base_url': base_url,
3044 'fragments': [],
3045 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3046 })
3047 if 'initialization_url' in representation_ms_info:
3048 initialization_url = representation_ms_info['initialization_url']
3049 if not f.get('url'):
3050 f['url'] = initialization_url
3051 f['fragments'].append({location_key(initialization_url): initialization_url})
3052 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 3053 if not period_duration:
3054 period_duration = try_get(
3055 representation_ms_info,
3056 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 3057 else:
be2fc5b2 3058 # Assuming direct URL to unfragmented media.
3059 f['url'] = base_url
234416e4 3060 if content_type in ('video', 'audio', 'image/jpeg'):
3061 f['manifest_stream_number'] = stream_numbers[f['url']]
3062 stream_numbers[f['url']] += 1
be2fc5b2 3063 formats.append(f)
3064 elif content_type == 'text':
3065 subtitles.setdefault(lang or 'und', []).append(f)
3066
171e59ed 3067 return formats, subtitles
17b598d3 3068
fd76a142
F
def _extract_ism_formats(self, *args, **kwargs):
    """Return only the formats of an ISM manifest.

    All arguments are forwarded unchanged to
    ``_extract_ism_formats_and_subtitles``.  If that call also yields
    subtitles they are dropped, after notifying through
    ``_report_ignoring_subs('ISM')``.
    """
    formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('ISM')
    return formats
3074
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and parse its formats and subtitles.

    The manifest is fetched with ``_download_xml_handle``; *note* and
    *errnote* default to generic ISM messages when they are None.
    Returns a ``(formats, subtitles)`` tuple, which is ``([], {})`` when the
    download returns False or yields no document.

    NOTE: ``headers`` and ``query`` are only forwarded here, never mutated,
    so the shared default dicts are left as they are.
    """
    res = self._download_xml_handle(
        ism_url, video_id,
        note='Downloading ISM manifest' if note is None else note,
        errnote='Failed to download ISM manifest' if errnote is None else errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if res is False:
        return [], {}

    ism_doc, urlh = res
    if ism_doc is None:
        return [], {}

    # Parse relative to the URL the handle finally reports, not the requested one
    return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 3088
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.
    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx

    Returns a (formats, subtitles) tuple; both are empty for manifests
    whose IsLive attribute is 'TRUE'.  QualityLevels with an unsupported
    FourCC are skipped with a warning.
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        # The <c> elements belong to the stream, not to a single track,
        # so they are collected once per stream
        stream_fragments = stream.findall('c')
        for track in stream.findall('QualityLevel'):
            fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # Without an explicit duration, derive it from the start
                    # time of the NEXT <c> element of the stream; fall back to
                    # the manifest duration for the last fragment or when the
                    # next fragment carries no start time
                    next_fragment_time = None
                    if stream_fragment_index + 1 < len(stream_fragments):
                        next_fragment_time = int_or_none(
                            stream_fragments[stream_fragment_index + 1].get('t'))
                    if next_fragment_time is None:
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
b2758123 3198
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from HTML5-style media tags found in *webpage*.

    Scans for <video>/<audio> tags (including the amp-* and dl8-* variants)
    and returns a list of dicts with 'formats', 'subtitles' and
    'thumbnail' keys; tags yielding neither formats nor subtitles are
    omitted.  Relative URLs are resolved against *base_url*, which is also
    set as the Referer header of every format.
    """
    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Split a "type" attribute into ext and codec information
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats); manifests are expanded into
        # their formats, anything else becomes a single direct format
        type_info = type_info or {}
        full_url = absolute_url(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
                'ext': ext,
            }]
        return is_plain_url, formats

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags have no content, hence the empty 4th element
    media_tags = [(media_tag, media_tag_name, media_type, '')
                  for media_tag, media_tag_name, media_type
                  in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
        # http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(media_attributes.get('src'))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if is_plain_url:
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        s_attr.get(lbl)
                        for lbl in ('label', 'title')
                        if str_or_none(s_attr.get(lbl))
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = (int_or_none(s_attr.get('height'))
                              or int_or_none(s_attr.get('res')))
                    if not width or not height:
                        for lbl in labels:
                            resolution = parse_resolution(lbl)
                            if not resolution:
                                continue
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                    # First label that parses as a bitrate wins
                    for lbl in labels:
                        tbr = parse_bitrate(lbl)
                        if tbr:
                            break
                    else:
                        tbr = None
                    f.update({
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                    })
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        for f in media_info['formats']:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
3324
f6a1d69a
F
def _extract_akamai_formats(self, *args, **kwargs):
    """Return only the formats of an Akamai manifest.

    All arguments are forwarded unchanged to
    ``_extract_akamai_formats_and_subtitles``.  If that call also yields
    subtitles they are dropped, after notifying through
    ``_report_ignoring_subs('akamai')``.
    """
    formats, subtitles = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('akamai')
    return formats
3330
def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
    """Extract HDS, HLS and (when possible) progressive HTTP formats from Akamai.

    *manifest_url* may be either the ``/z/.../manifest.f4m`` or the
    ``/i/.../master.m3u8`` flavour; the other one is derived from it.
    *hosts* may map 'hds', 'hls' and 'http' to replacement host names.
    Direct http/https formats are only derived for unsigned URLs (no
    ``hdnea=`` token) when an 'http' host is given and the HLS URL has the
    ``.csmil`` multi-bitrate form.

    Returns a ``(formats, subtitles)`` tuple.
    """
    def replace_host(url, host):
        # A callable replacement is used because r'\1' + host would fuse
        # with a host that starts with a digit (e.g. an IP address) into a
        # different escape sequence
        return re.sub(r'(https?://)[^/]+', lambda m: m.group(1) + host, url)

    signed = 'hdnea=' in manifest_url
    if not signed:
        # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
        manifest_url = re.sub(
            r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
            '', manifest_url).strip('?')

    formats = []
    subtitles = {}

    hdcore_sign = 'hdcore=3.7.0'
    f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
    hds_host = hosts.get('hds')
    if hds_host:
        f4m_url = replace_host(f4m_url, hds_host)
    if 'hdcore=' not in f4m_url:
        f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
    f4m_formats = self._extract_f4m_formats(
        f4m_url, video_id, f4m_id='hds', fatal=False)
    for entry in f4m_formats:
        entry.update({'extra_param_to_segment_url': hdcore_sign})
    formats.extend(f4m_formats)

    m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
    hls_host = hosts.get('hls')
    if hls_host:
        m3u8_url = replace_host(m3u8_url, hls_host)
    m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
        m3u8_url, video_id, 'mp4', 'm3u8_native',
        m3u8_id='hls', fatal=False)
    formats.extend(m3u8_formats)
    subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

    http_host = hosts.get('http')
    if http_host and m3u8_formats and not signed:
        REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
        # Not every HLS URL is a multi-bitrate .csmil one; without a match
        # there is nothing to derive the progressive URLs from
        mobj = re.match(REPL_REGEX, m3u8_url)
        qualities = mobj.group(2).split(',') if mobj else []
        qualities_length = len(qualities)
        if qualities and len(m3u8_formats) in (qualities_length, qualities_length + 1):
            # Video formats are paired with the qualities in order; zip()
            # stops at the shorter side instead of overrunning the list
            video_formats = (f for f in m3u8_formats if f.get('vcodec') != 'none')
            for quality, f in zip(qualities, video_formats):
                for protocol in ('http', 'https'):
                    http_f = f.copy()
                    http_f.pop('manifest_url', None)
                    http_url = re.sub(
                        REPL_REGEX, protocol + fr'://{http_host}/\g<1>{quality}\3', f['url'])
                    http_f.update({
                        'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                        'url': http_url,
                        'protocol': protocol,
                    })
                    formats.append(http_f)

    return formats, subtitles
c7c43a93 3388
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
    """Extract formats from a Wowza streaming URL.

    Any trailing manifest name is stripped from *url* and the HLS, HDS and
    DASH manifests are probed on the resulting base URL (the original query
    string is re-appended).  For SMIL based URLs, RTMP formats come from
    ``jwplayer.smil`` together with RTSP twins; otherwise plain rtmp/rtsp
    URLs are added.  Protocols named in *skip_protocols* ('m3u8', 'f4m',
    'dash', 'smil', 'rtmp', 'rtsp') are left out.

    Returns a list of formats; it is empty when *url* has no ``//host/path``
    part to work with.
    """
    query = urllib.parse.urlparse(url).query
    url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
    mobj = re.search(
        r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
    formats = []
    if not mobj:
        # Nothing that looks like a network location to derive manifests from
        return formats
    url_base = mobj.group('url')
    http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

    def manifest_url(manifest):
        m_url = f'{http_base_url}/{manifest}'
        if query:
            m_url += '?%s' % query
        return m_url

    if 'm3u8' not in skip_protocols:
        formats.extend(self._extract_m3u8_formats(
            manifest_url('playlist.m3u8'), video_id, 'mp4',
            m3u8_entry_protocol, m3u8_id='hls', fatal=False))
    if 'f4m' not in skip_protocols:
        formats.extend(self._extract_f4m_formats(
            manifest_url('manifest.f4m'),
            video_id, f4m_id='hds', fatal=False))
    if 'dash' not in skip_protocols:
        formats.extend(self._extract_mpd_formats(
            manifest_url('manifest.mpd'),
            video_id, mpd_id='dash', fatal=False))
    if re.search(r'(?:/smil:|\.smil)', url_base):
        if 'smil' not in skip_protocols:
            rtmp_formats = self._extract_smil_formats(
                manifest_url('jwplayer.smil'),
                video_id, fatal=False)
            for rtmp_format in rtmp_formats:
                if 'play_path' not in rtmp_format:
                    # No play path to build an RTSP twin from; keep the
                    # format as it is
                    formats.append(rtmp_format)
                    continue
                rtsp_format = rtmp_format.copy()
                rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                del rtsp_format['play_path']
                rtsp_format.pop('ext', None)
                rtsp_format.update({
                    'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                    'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    'protocol': 'rtsp',
                })
                formats.extend([rtmp_format, rtsp_format])
    else:
        for protocol in ('rtmp', 'rtsp'):
            if protocol not in skip_protocols:
                formats.append({
                    'url': f'{protocol}:{url_base}',
                    'format_id': protocol,
                    'protocol': protocol,
                })
    return formats
3441
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate the options passed to ``jwplayer(...).setup(...)`` in *webpage*.

    The captured options are parsed with ``_parse_json`` using
    *transform_source*.  Returns the parsed dict, or None when no setup
    call is found, parsing raises ExtractorError, or the parsed value is
    not a dict.
    """
    mobj = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if not mobj:
        return None
    try:
        jwplayer_data = self._parse_json(
            mobj.group('options'), video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        # Best effort: unparseable options are treated like "not found"
        return None
    return jwplayer_data if isinstance(jwplayer_data, dict) else None
a4a554a7
YCH
3456
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in *webpage* and parse it.

    The data is located with ``_find_jwplayer_data`` (using ``js_to_json``
    as source transform); the extra positional and keyword arguments are
    forwarded to ``_parse_jwplayer_data``, whose result is returned.
    """
    jwplayer_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(
        jwplayer_data, video_id, *args, **kwargs)
3462
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Build info dict(s) from JWPlayer setup data.

    Returns the single entry when the playlist has exactly one item,
    otherwise ``self.playlist_result(entries)``.  With *require_title* a
    missing 'title' raises KeyError; a missing 'mediaid' does the same when
    no *video_id* is given.  The passed *jwplayer_data* is not modified.
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    entries = []

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    playlist = jwplayer_data['playlist']
    if not isinstance(playlist, list):
        playlist = [playlist]

    for video_data in playlist:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        # (kept in a local instead of writing a self-reference into the
        # caller's dict)
        sources = video_data['sources'] if 'sources' in video_data else [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            sources, video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                if not isinstance(track, dict):
                    continue
                track_kind = track.get('kind')
                if not track_kind or not isinstance(track_kind, str):
                    continue
                if track_kind.lower() not in ('captions', 'subtitles'):
                    continue
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                })

        entry = {
            'id': this_video_id,
            'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            self._sort_formats(formats)
            entry['formats'] = formats
        entries.append(entry)
    if len(entries) == 1:
        return entries[0]
    else:
        return self.playlist_result(entries)
3530
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer 'sources' items into a list of formats.

    Non-dict items, items without a usable 'file' URL and repeated URLs are
    skipped.  HLS, DASH and SMIL sources are expanded through the respective
    manifest extractors (non-fatally); audio sources and everything else
    become single formats.  For rtmp URLs the play path is split off and
    *rtmp_params*, if given, are merged into the format.
    """
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in seen_urls:
            continue
        seen_urls.add(source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
        elif source_type == 'dash' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        elif source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
        else:
            height = int_or_none(source.get('height'))
            if height is None:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = int_or_none(self._search_regex(
                    r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
                    'height', default=None))
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate')),
                'ext': ext,
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                # (maxsplit is passed by keyword: the positional form is
                # deprecated since Python 3.13)
                rtmp_url_parts = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                if len(rtmp_url_parts) == 3:
                    rtmp_url, prefix, play_path = rtmp_url_parts
                    a_format.update({
                        'url': rtmp_url,
                        'play_path': prefix + play_path,
                    })
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
    return formats
3594
def _live_title(self, name):
    """Deprecated: emit a deprecation warning and return *name* unchanged."""
    self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
    return name
f4b1c7ad 3598
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert *v* with ``int_or_none`` and report when that fails.

    Extra keyword arguments are passed on to ``int_or_none``.  When the
    conversion yields None, ExtractorError is raised if *fatal* is set;
    otherwise a warning is reported and None is returned.  *name* is only
    used in the message.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = f'Failed to extract {name}: Could not parse value {v!r}'
        if fatal:
            raise ExtractorError(msg)
        self.report_warning(msg)
    return res
3608
def _float(self, v, name, fatal=False, **kwargs):
    """Convert *v* with ``float_or_none`` and report when that fails.

    Extra keyword arguments are passed on to ``float_or_none``.  When the
    conversion yields None, ExtractorError is raised if *fatal* is set;
    otherwise a warning is reported and None is returned.  *name* is only
    used in the message.
    """
    res = float_or_none(v, **kwargs)
    if res is None:
        msg = f'Failed to extract {name}: Could not parse value {v!r}'
        if fatal:
            raise ExtractorError(msg)
        self.report_warning(msg)
    return res
3618
40e41780
TF
3619 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3620 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3621 cookie = http.cookiejar.Cookie(
4ed2d7b7 3622 0, name, value, port, port is not None, domain, True,
40e41780
TF
3623 domain.startswith('.'), path, True, secure, expire_time,
3624 discard, None, None, rest)
9809740b 3625 self.cookiejar.set_cookie(cookie)
42939b61 3626
799207e8 3627 def _get_cookies(self, url):
ac668111 3628 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3629 return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
799207e8 3630
e3c1266f 3631 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3632 """
3633 Apply first Set-Cookie header instead of the last. Experimental.
3634
3635 Some sites (e.g. [1-3]) may serve two cookies under the same name
3636 in Set-Cookie header and expect the first (old) one to be set rather
3637 than second (new). However, as of RFC6265 the newer one cookie
3638 should be set into cookie store what actually happens.
3639 We will workaround this issue by resetting the cookie to
3640 the first one manually.
3641 1. https://new.vk.com/
3642 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3643 3. https://learning.oreilly.com/
3644 """
e3c1266f
S
3645 for header, cookies in url_handle.headers.items():
3646 if header.lower() != 'set-cookie':
3647 continue
cfb0511d 3648 cookies = cookies.encode('iso-8859-1').decode('utf-8')
e3c1266f
S
3649 cookie_value = re.search(
3650 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3651 if cookie_value:
3652 value, domain = cookie_value.groups()
3653 self._set_cookie(domain, cookie, value)
3654 break
3655
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the extractor's testcases, each tagged with the extractor key.

    A class may define either `_TEST` (a single testcase) or `_TESTS`
    (a list), not both. Testcases flagged `only_matching` are skipped unless
    `include_onlymatching` is true. Each yielded dict is updated in place
    with a 'name' entry.
    """
    single = getattr(cls, '_TEST', None)
    if single:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        candidates = [single]
    else:
        candidates = getattr(cls, '_TESTS', [])

    for testcase in candidates:
        if include_onlymatching or not testcase.get('only_matching', False):
            testcase['name'] = cls.ie_key()
            yield testcase
3669
@classmethod
def get_webpage_testcases(cls):
    """Return `_WEBPAGE_TESTS` (or []), tagging each entry in place with the extractor key."""
    webpage_tests = getattr(cls, '_WEBPAGE_TESTS', [])
    for testcase in webpage_tests:
        testcase['name'] = cls.ie_key()
    return webpage_tests
3676
@classproperty
def age_limit(cls):
    """Get age limit from the testcases"""
    # Regular testcases (excluding only_matching ones) plus webpage testcases
    all_testcases = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # For every testcase, look at its own info_dict and at that of the
    # first 'playlist' entry
    limit_path = (..., (('playlist', 0), None), 'info_dict', 'age_limit')
    declared_limits = traverse_obj(all_testcases, limit_path)
    # Fall back to 0 when no testcase declares an age limit
    return max(declared_limits or [0])
3683
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
05900629 3688
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor

    @param markdown         Emit markdown/HTML decorated text instead of plain text
    @param search_examples  Optional sequence of example queries; one is picked
                            at random for extractors that have a SEARCH_KEY
    @returns                "<name>:<details>", or just the name when there
                            are no details
    """
    details = []
    if cls._NETRC_MACHINE:
        details.append(
            f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' if markdown
            else f' [{cls._NETRC_MACHINE}]')

    if cls.IE_DESC is False:
        details.append(' [HIDDEN]')
    elif cls.IE_DESC:
        details.append(f' {cls.IE_DESC}')

    if cls.SEARCH_KEY:
        details.append(f'; "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            details.append(
                f' (Example: "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')

    if not cls.working():
        details.append(' (**Currently broken**)' if markdown else ' (Currently broken)')

    desc = ''.join(details)
    label = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
    return f'{label}:{desc}' if desc else label
3712
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(...) when subtitles were requested, else {}.

    Subtitles count as requested when either the 'writesubtitles' or the
    'listsubtitles' param is truthy.
    """
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3718
3719 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3720 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3721
def extract_comments(self, *args, **kwargs):
    """Return a deferred comment extractor, or None when comments are not requested.

    When the 'getcomments' param is truthy, self._get_comments(...) is called
    immediately and a zero-argument callable is returned. Calling it drains
    the iterator and returns {'comments': [...], 'comment_count': N}.
    'comment_count' is None when extraction stopped early (user interrupt,
    or an error while the 'ignoreerrors' param is True).
    """
    if not self.get_param('getcomments'):
        return None
    comment_iter = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        completed = False
        try:
            while True:
                collected.append(next(comment_iter))
        except StopIteration:
            # Iterator ran dry: everything was extracted
            completed = True
        except KeyboardInterrupt:
            self.to_screen('Interrupted by user')
        except Exception as err:
            # Errors are only swallowed (and reported) with ignoreerrors=True
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(err)
        total = len(collected)
        self.to_screen(f'Extracted {total} comments')
        return {
            'comments': collected,
            'comment_count': total if completed else None,
        }
    return extractor
3748
3749 def _get_comments(self, *args, **kwargs):
3750 raise NotImplementedError('This method must be implemented by subclasses')
3751
912e0b7e
YCH
3752 @staticmethod
3753 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
a825ffbf 3754 """ Merge subtitle items for one language. Items with duplicated URLs/data
912e0b7e 3755 will be dropped. """
86e5f3ed 3756 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
912e0b7e 3757 ret = list(subtitle_list1)
a44ca5a4 3758 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
912e0b7e
YCH
3759 return ret
3760
3761 @classmethod
46890374 3762 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3763 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3764 if target is None:
3765 target = {}
3766 for d in dicts:
3767 for lang, subs in d.items():
3768 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3769 return target
912e0b7e 3770
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(...) when requested, else {}.

    Automatic captions count as requested when either the
    'writeautomaticsub' or the 'listsubtitles' param is truthy.
    """
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3776
3777 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3778 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3779
2762dbb1 3780 @functools.cached_property
24146491 3781 def _cookies_passed(self):
3782 """Whether cookies have been passed to YoutubeDL"""
3783 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3784
def mark_watched(self, *args, **kwargs):
    """Call self._mark_watched(...) when enabled and credentials are available.

    Does nothing unless the 'mark_watched' param is truthy. Credentials are
    considered available when the extractor supports login and a username
    is configured, or when cookies were passed.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3790
3791 def _mark_watched(self, *args, **kwargs):
3792 raise NotImplementedError('This method must be implemented by subclasses')
3793
38cce791
YCH
def geo_verification_headers(self):
    """Return {'Ytdl-request-proxy': <proxy>} when the 'geo_verification_proxy' param is set, else {}."""
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3800
8f97a15d 3801 @staticmethod
3802 def _generic_id(url):
14f25df2 3803 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
98763ee3 3804
@staticmethod
def _generic_title(url):
    """Derive a title from the URL: url_basename(url) without extension, percent-decoded."""
    stem, _ext = os.path.splitext(url_basename(url))
    return urllib.parse.unquote(stem)
98763ee3 3808
c224251a 3809 @staticmethod
b0089e89 3810 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3811 all_known = all(map(
3812 lambda x: x is not None,
3813 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3814 return (
3815 'private' if is_private
3816 else 'premium_only' if needs_premium
3817 else 'subscriber_only' if needs_subscription
3818 else 'needs_auth' if needs_auth
3819 else 'unlisted' if is_unlisted
3820 else 'public' if all_known
3821 else None)
3822
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Look up the arguments of this extractor key instead of
                        self.ie_key(); compared in lower case
    @param casesense    When false, the values are converted to lower case
    '''
    extractor_name = (ie_key or self.ie_key()).lower()
    values = traverse_obj(self._downloader.params, ('extractor_args', extractor_name, key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
5d3a0e79 3835
f40ee5e9 3836 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3837 if not playlist_id or not video_id:
3838 return not video_id
3839
3840 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3841 if no_playlist is not None:
3842 return not no_playlist
3843
3844 video_id = '' if video_id is True else f' {video_id}'
3845 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3846 if self.get_param('noplaylist'):
3847 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3848 return False
3849 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3850 return True
3851
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Report `err` through the utility RetryManager.report_retry.

    When `_count` is falsy, int(fatal) is passed as the count instead.
    Messages go to self.to_screen / self.report_warning, and the sleep
    function is the 'extractor' entry of the 'retry_sleep_functions' param.
    """
    sleep_functions = self.get_param('retry_sleep_functions', {})
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen,
        warn=self.report_warning,
        sleep_func=sleep_functions.get('extractor'))
3855
def RetryManager(self, **kwargs):
    """Create a utility RetryManager configured for this extractor.

    The retry count is the 'extractor_retries' param (default 3) and errors
    are reported through self._error_or_warning. This is a method: inside
    its body the name RetryManager refers to the module-level import from
    ..utils, not to this method.
    """
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3858
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of _extract_from_webpage after adding default extra info.

    When `_extract_from_webpage` is a bound method on the class it is called
    on the class itself; otherwise an extractor instance is obtained from `ydl`.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())

    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
8f97a15d 3867
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for this extractor for each entry of orderedSet(embed urls)."""
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        yield cls.url_result(embed_url, cls)
3873
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    # Compile the class's own _EMBED_REGEX once and cache the result on this
    # class (checked via __dict__, so an inherited cache is not reused)
    if '_EMBED_URL_RE' not in cls.__dict__:
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(pattern) for pattern in cls._EMBED_REGEX)

    for compiled in cls._EMBED_URL_RE:
        for match in compiled.finditer(webpage):
            # Resolve the (HTML-unescaped) match against the page URL
            candidate = urllib.parse.urljoin(url, unescapeHTML(match.group('url')))
            if cls._VALID_URL is not False and not cls.suitable(candidate):
                continue
            yield candidate
3889
class StopExtraction(Exception):
    """Exception type with no behavior of its own beyond Exception."""
3892
bfd973ec 3893 @classmethod
3894 def _extract_url(cls, webpage): # TODO: Remove
3895 """Only for compatibility with some older extractors"""
3896 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3897
8dbe9899 3898
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Upper bound on the number of results; unlimited unless overridden
    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        # Prefix is empty, a positive number without leading zeros, or "all"
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the prefix of the search URL and fetch that many results.

        An empty prefix requests 1 result and "all" requests _MAX_RESULTS.
        A numeric prefix above _MAX_RESULTS is capped with a warning.
        """
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        if requested > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # islice needs None, not infinity, to mean "no limit"
        limit = None if n == float('inf') else n
        entries = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(entries, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @classproperty
    def SEARCH_KEY(cls):
        """Public read-only view of _SEARCH_KEY"""
        return cls._SEARCH_KEY