]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
Remove Python 3.6 support
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
d6983cb4 1import base64
234416e4 2import collections
ac668111 3import getpass
3ec05685 4import hashlib
54007a45 5import http.client
6import http.cookiejar
7import http.cookies
cc16383f 8import itertools
3d3538e4 9import json
f8271158 10import math
4094b6e3 11import netrc
d6983cb4 12import os
773f291d 13import random
6929b41a 14import re
d6983cb4 15import sys
4094b6e3 16import time
14f25df2 17import urllib.parse
ac668111 18import urllib.request
f8271158 19import xml.etree.ElementTree
d6983cb4 20
6929b41a 21from ..compat import functools # isort: split
14f25df2 22from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
eb8a4433 23from ..downloader import FileDownloader
f8271158 24from ..downloader.f4m import get_base_url, remove_encrypted_media
8c25f81b 25from ..utils import (
f8271158 26 JSON_LD_RE,
27 NO_DEFAULT,
28 ExtractorError,
29 GeoRestrictedError,
30 GeoUtils,
b7c47b74 31 LenientJSONDecoder,
f8271158 32 RegexNotFoundError,
33 UnsupportedError,
05900629 34 age_restricted,
02dc0a36 35 base_url,
08f2a92c 36 bug_reports_message,
82d02080 37 classproperty,
d6983cb4 38 clean_html,
70f0f5a8 39 determine_ext,
46b18f23 40 determine_protocol,
d493f15c 41 dict_get,
42676437 42 encode_data_uri,
9b9c5355 43 error_to_compat_str,
46b18f23 44 extract_attributes,
90137ca4 45 filter_dict,
97f4aecf 46 fix_xml_ampersands,
b14f3a4c 47 float_or_none,
b868936c 48 format_field,
31bb8d3f 49 int_or_none,
34921b43 50 join_nonempty,
a4a554a7 51 js_to_json,
46b18f23 52 mimetype2ext,
3158150c 53 network_exceptions,
46b18f23 54 orderedSet,
d493f15c 55 parse_bitrate,
46b18f23
JH
56 parse_codecs,
57 parse_duration,
4ca2a3cf 58 parse_iso8601,
46b18f23 59 parse_m3u8_attributes,
d493f15c 60 parse_resolution,
46b18f23 61 sanitize_filename,
b868936c 62 sanitized_Request,
d493f15c 63 str_or_none,
ce5b9040 64 str_to_int,
f856816b 65 strip_or_none,
5d3a0e79 66 traverse_obj,
47046464 67 try_call,
ffa89477 68 try_get,
f38de77f 69 unescapeHTML,
647eab45 70 unified_strdate,
6b3a3098 71 unified_timestamp,
46b18f23 72 update_Request,
09d02ea4 73 update_url_query,
a107193e 74 url_basename,
bebef109 75 url_or_none,
b868936c 76 urljoin,
6606817a 77 variadic,
a6571f10 78 xpath_element,
8d6765cf
S
79 xpath_text,
80 xpath_with_ns,
d6983cb4 81)
c342041f 82
d6983cb4 83
86e5f3ed 84class InfoExtractor:
d6983cb4
PH
85 """Information Extractor class.
86
87 Information extractors are the classes that, given a URL, extract
88 information about the video (or videos) the URL refers to. This
89 information includes the real video URL, the video title, author and
90 others. The information is stored in a dictionary which is then
5d380852 91 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
92 information possibly downloading the video to the file system, among
93 other possible outcomes.
94
cf0649f8 95 The type field determines the type of the result.
fed5d032
PH
96 By far the most common value (and the default if _type is missing) is
97 "video", which indicates a single video.
98
99 For a video, the dictionaries must include the following fields:
d6983cb4
PH
100
101 id: Video identifier.
d4736fdb 102 title: Video title, unescaped. Set to an empty string if video has
103 no title as opposed to "None" which signifies that the
104 extractor failed to obtain a title
d67b0b15 105
f49d89ee 106 Additionally, it must contain either a formats entry or a url one:
d67b0b15 107
f49d89ee
PH
108 formats: A list of dictionaries for each format available, ordered
109 from worst to best quality.
110
111 Potential fields:
c790e93a
S
112 * url The mandatory URL representing the media:
113 for plain file media - HTTP URL of this file,
114 for RTMP - RTMP URL,
115 for HLS - URL of the M3U8 media playlist,
116 for HDS - URL of the F4M manifest,
79d2077e
S
117 for DASH
118 - HTTP URL to plain file media (in case of
119 unfragmented media)
120 - URL of the MPD manifest or base URL
121 representing the media if MPD manifest
8ed7a233 122 is parsed from a string (in case of
79d2077e 123 fragmented media)
c790e93a 124 for MSS - URL of the ISM manifest.
86f4d14f
S
125 * manifest_url
126 The URL of the manifest file in case of
c790e93a
S
127 fragmented media:
128 for HLS - URL of the M3U8 master playlist,
129 for HDS - URL of the F4M manifest,
130 for DASH - URL of the MPD manifest,
131 for MSS - URL of the ISM manifest.
a44ca5a4 132 * manifest_stream_number (For internal use only)
133 The index of the stream in the manifest file
10952eb2 134 * ext Will be calculated from URL if missing
d67b0b15
PH
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height,
138 and format_note fields if missing.
139 * format_id A short description of the format
5d4f3985
PH
140 ("mp4_h264_opus" or "19").
141 Technically optional, but strongly recommended.
d67b0b15
PH
142 * format_note Additional info about the format
143 ("3D" or "DASH video")
144 * width Width of the video, if known
145 * height Height of the video, if known
f49d89ee 146 * resolution Textual description of width and height
176f1866 147 * dynamic_range The dynamic range of the video. One of:
148 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 149 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
150 * abr Average audio bitrate in KBit/s
151 * acodec Name of the audio codec in use
dd27fd17 152 * asr Audio sampling rate in Hertz
d67b0b15 153 * vbr Average video bitrate in KBit/s
fbb21cf5 154 * fps Frame rate
d67b0b15 155 * vcodec Name of the video codec in use
1394ce65 156 * container Name of the container format
d67b0b15 157 * filesize The number of bytes, if known in advance
9732d77e 158 * filesize_approx An estimate for the number of bytes
d67b0b15 159 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c 160 * protocol The protocol that will be used for the actual
adbc4ec4
THD
161 download, lower-case. One of "http", "https" or
162 one of the protocols defined in downloader.PROTOCOL_MAP
c58c2d63
S
163 * fragment_base_url
164 Base URL for fragments. Each fragment's path
165 value (if present) will be relative to
166 this URL.
167 * fragments A list of fragments of a fragmented media.
168 Each fragment entry must contain either an url
169 or a path. If an url is present it should be
170 considered by a client. Otherwise both path and
171 fragment_base_url must be present. Here is
172 the list of all potential fields:
173 * "url" - fragment's URL
174 * "path" - fragment's path relative to
175 fragment_base_url
a0d5077c
S
176 * "duration" (optional, int or float)
177 * "filesize" (optional, int)
adbc4ec4
THD
178 * is_from_start Is a live format that can be downloaded
179 from the start. Boolean
f49d89ee 180 * preference Order number of this format. If this field is
08d13955 181 present and not None, the formats get sorted
38d63d84 182 by this field, regardless of all other values.
f49d89ee
PH
183 -1 for default (order by other properties),
184 -2 or smaller for less than default.
e65566a9
PH
185 < -1000 to hide the format (if there is
186 another one which is strictly better)
32f90364
PH
187 * language Language code, e.g. "de" or "en-US".
188 * language_preference Is this in the language mentioned in
189 the URL?
aff2f4f4
PH
190 10 if it's what the URL is about,
191 -1 for default (don't know),
192 -10 otherwise, other values reserved for now.
5d73273f
PH
193 * quality Order number of the video quality of this
194 format, irrespective of the file format.
195 -1 for default (order by other properties),
196 -2 or smaller for less than default.
c64ed2a3
PH
197 * source_preference Order number for this video source
198 (quality takes higher priority)
199 -1 for default (order by other properties),
200 -2 or smaller for less than default.
d769be6c
PH
201 * http_headers A dictionary of additional HTTP headers
202 to add to the request.
6271f1ca 203 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
204 video's pixels are not square.
205 width : height ratio as float.
206 * no_resume The server does not support resuming the
207 (HTTP or RTMP) download. Boolean.
88acdbc2 208 * has_drm The format has DRM and cannot be downloaded. Boolean
0a5a191a 209 * downloader_options A dictionary of downloader options
210 (For internal use only)
211 * http_chunk_size Chunk size for HTTP downloads
212 * ffmpeg_args Extra arguments for ffmpeg downloader
3b1fe47d 213 RTMP formats can also have the additional fields: page_url,
214 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
215 rtmp_protocol, rtmp_real_time
3dee7826 216
c0ba0f48 217 url: Final video URL.
d6983cb4 218 ext: Video filename extension.
d67b0b15
PH
219 format: The video format, defaults to ext (used for --get-format)
220 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 221
d6983cb4
PH
222 The following fields are optional:
223
08d30158 224 direct: True if a direct video file was given (must only be set by GenericIE)
f5e43bc6 225 alt_title: A secondary title of the video.
0afef30b
PH
226 display_id: An alternative identifier for the video, not necessarily
227 unique, but available before title. Typically, id is
228 something like "4234987", title "Dancing naked mole rats",
229 and display_id "dancing-naked-mole-rats"
d5519808 230 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 231 * "id" (optional, string) - Thumbnail format ID
d5519808 232 * "url"
cfb56d1a 233 * "preference" (optional, int) - quality of the image
d5519808
PH
234 * "width" (optional, int)
235 * "height" (optional, int)
5e1c39ac 236 * "resolution" (optional, string "{width}x{height}",
d5519808 237 deprecated)
2de624fd 238 * "filesize" (optional, int)
297e9952 239 * "http_headers" (dict) - HTTP headers for the request
d6983cb4 240 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 241 description: Full video description.
d6983cb4 242 uploader: Full name of the video uploader.
2bc0c46f 243 license: License name the video is licensed under.
8a92e51c 244 creator: The creator of the video.
10db0d2f 245 timestamp: UNIX timestamp of the moment the video was uploaded
ae6a1b95 246 upload_date: Video upload date in UTC (YYYYMMDD).
f0d785d3 247 If not explicitly set, calculated from timestamp
248 release_timestamp: UNIX timestamp of the moment the video was released.
249 If it is not clear whether to use timestamp or this, use the former
ae6a1b95 250 release_date: The date (YYYYMMDD) when the video was released in UTC.
f0d785d3 251 If not explicitly set, calculated from release_timestamp
252 modified_timestamp: UNIX timestamp of the moment the video was last modified.
ae6a1b95 253 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
f0d785d3 254 If not explicitly set, calculated from modified_timestamp
d6983cb4 255 uploader_id: Nickname or id of the video uploader.
7bcd2830 256 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 257 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 258 Note that channel fields may or may not repeat uploader
6f1f59f3
S
259 fields. This depends on a particular extractor.
260 channel_id: Id of the channel.
261 channel_url: Full URL to a channel webpage.
6c73052c 262 channel_follower_count: Number of followers of the channel.
da9ec3b9 263 location: Physical location where the video was filmed.
a504ced0 264 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
265 {tag: subformats}. "tag" is usually a language code, and
266 "subformats" is a list sorted from lower to higher
267 preference, each element is a dictionary with the "ext"
268 entry and one of:
a504ced0 269 * "data": The subtitles file contents
10952eb2 270 * "url": A URL pointing to the subtitles file
2412044c 271 It can optionally also have:
272 * "name": Name or description of the subtitles
08d30158 273 * "http_headers": A dictionary of additional HTTP headers
297e9952 274 to add to the request.
4bba3716 275 "ext" will be calculated from URL if missing
e167860c 276 automatic_captions: Like 'subtitles'; contains automatically generated
277 captions instead of normal subtitles
62d231c0 278 duration: Length of the video in seconds, as an integer or float.
f3d29461 279 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
280 like_count: Number of positive ratings of the video
281 dislike_count: Number of negative ratings of the video
02835c6b 282 repost_count: Number of reposts of the video
2d30521a 283 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 284 comment_count: Number of comments on the video
dd622d7c
PH
285 comments: A list of comments, each with one or more of the following
286 properties (all but one of text or html optional):
287 * "author" - human-readable name of the comment author
288 * "author_id" - user ID of the comment author
a1c5d2ca 289 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
290 * "id" - Comment ID
291 * "html" - Comment as HTML
292 * "text" - Plain text of the comment
293 * "timestamp" - UNIX timestamp of comment
294 * "parent" - ID of the comment this one is replying to.
295 Set to "root" to indicate that this is a
296 comment to the original video.
a1c5d2ca
M
297 * "like_count" - Number of positive ratings of the comment
298 * "dislike_count" - Number of negative ratings of the comment
299 * "is_favorited" - Whether the comment is marked as
300 favorite by the video uploader
301 * "author_is_uploader" - Whether the comment is made by
302 the video uploader
8dbe9899 303 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 304 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
305 should allow to get the same result again. (It will be set
306 by YoutubeDL if it's missing)
ad3bc6ac
PH
307 categories: A list of categories that the video falls in, for example
308 ["Sports", "Berlin"]
864f24bd 309 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 310 cast: A list of the video cast
7267bd53
PH
311 is_live: True, False, or None (=unknown). Whether this video is a
312 live stream that goes on instead of a fixed-length video.
f76ede8e 313 was_live: True, False, or None (=unknown). Whether this video was
314 originally a live stream.
3dbb2a9d 315 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
ae30b840 316 If absent, automatically set from is_live, was_live
7c80519c 317 start_time: Time in seconds where the reproduction should start, as
10952eb2 318 specified in the URL.
297a564b 319 end_time: Time in seconds where the reproduction should end, as
10952eb2 320 specified in the URL.
55949fed 321 chapters: A list of dictionaries, with the following entries:
322 * "start_time" - The start time of the chapter in seconds
323 * "end_time" - The end time of the chapter in seconds
324 * "title" (optional, string)
6cfda058 325 playable_in_embed: Whether this video is allowed to play in embedded
326 players on other sites. Can be True (=always allowed),
327 False (=never allowed), None (=unknown), or a string
c224251a
M
328 specifying the criteria for embedability (Eg: 'whitelist')
329 availability: Under what condition the video is available. One of
330 'private', 'premium_only', 'subscriber_only', 'needs_auth',
331 'unlisted' or 'public'. Use 'InfoExtractor._availability'
332 to set it
277d6ff5 333 __post_extractor: A function to be called just before the metadata is
334 written to either disk, logger or console. The function
335 must return a dict which will be added to the info_dict.
336 This is useful for additional information that is
337 time-consuming to extract. Note that the fields thus
338 extracted will not be available to output template and
339 match_filter. So, only "comments" and "comment_count" are
340 currently allowed to be extracted via this method.
d6983cb4 341
7109903e
S
342 The following fields should only be used when the video belongs to some logical
343 chapter or section:
344
345 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
346 chapter_number: Number of the chapter the video belongs to, as an integer.
347 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
348
349 The following fields should only be used when the video is an episode of some
8d76bdf1 350 series, programme or podcast:
7109903e
S
351
352 series: Title of the series or programme the video episode belongs to.
9ac24e23 353 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 354 season: Title of the season the video episode belongs to.
27bfd4e5
S
355 season_number: Number of the season the video episode belongs to, as an integer.
356 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
357 episode: Title of the video episode. Unlike mandatory video title field,
358 this field should denote the exact title of the video episode
359 without any kind of decoration.
27bfd4e5
S
360 episode_number: Number of the video episode within a season, as an integer.
361 episode_id: Id of the video episode, as a unicode string.
7109903e 362
7a93ab5f
S
363 The following fields should only be used when the media is a track or a part of
364 a music album:
365
366 track: Title of the track.
367 track_number: Number of the track within an album or a disc, as an integer.
368 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
369 as a unicode string.
370 artist: Artist(s) of the track.
371 genre: Genre(s) of the track.
372 album: Title of the album the track belongs to.
373 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
374 album_artist: List of all artists appeared on the album (e.g.
375 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
376 and compilations).
377 disc_number: Number of the disc or other physical medium the track belongs to,
378 as an integer.
379 release_year: Year (YYYY) when the album was released.
8bcd4048 380 composer: Composer of the piece
7a93ab5f 381
3975b4d2 382 The following fields should only be set for clips that should be cut from the original video:
383
384 section_start: Start time of the section in seconds
385 section_end: End time of the section in seconds
386
45e8a04e 387 The following fields should only be set for storyboards:
388 rows: Number of rows in each storyboard fragment, as an integer
389 columns: Number of columns in each storyboard fragment, as an integer
390
deefc05b 391 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 392
d838b1bd
PH
393 Unless mentioned otherwise, None is equivalent to absence of information.
394
fed5d032
PH
395
396 _type "playlist" indicates multiple videos.
b82f815f
PH
397 There must be a key "entries", which is a list, an iterable, or a PagedList
398 object, each element of which is a valid dictionary by this specification.
fed5d032 399
962ffcf8 400 Additionally, playlists can have "id", "title", and any other relevant
b60419c5 401 attributes with the same semantics as videos (see above).
fed5d032 402
f0d785d3 403 It can also have the following optional fields:
404
405 playlist_count: The total number of videos in a playlist. If not given,
406 YoutubeDL tries to calculate it from "entries"
407
fed5d032
PH
408
409 _type "multi_video" indicates that there are multiple videos that
410 form a single show, for example multiple acts of an opera or TV episode.
411 It must have an entries key like a playlist and contain all the keys
412 required for a video at the same time.
413
414
415 _type "url" indicates that the video must be extracted from another
416 location, possibly by a different extractor. Its only required key is:
417 "url" - the next URL to extract.
f58766ce
PH
418 The key "ie_key" can be set to the class name (minus the trailing "IE",
419 e.g. "Youtube") if the extractor class is known in advance.
420 Additionally, the dictionary may have any properties of the resolved entity
421 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
422 known ahead of time.
423
424
425 _type "url_transparent" entities have the same specification as "url", but
426 indicate that the given additional information is more precise than the one
427 associated with the resolved URL.
428 This is useful when a site employs a video service that hosts the video and
429 its technical metadata, but that video service does not embed a useful
430 title, description etc.
431
432
08d30158 433 Subclasses of this should define a _VALID_URL regexp and, re-define the
434 _real_extract() and (optionally) _real_initialize() methods.
d6983cb4
PH
435 Probably, they should also be added to the list of extractors.
436
e6f21b3d 437 Subclasses may also override suitable() if necessary, but ensure the function
438 signature is preserved and that this function imports everything it needs
52efa4b3 439 (except other extractors), so that lazy_extractors works correctly.
440
441 To support username + password (or netrc) login, the extractor must define a
442 _NETRC_MACHINE and re-define _perform_login(username, password) and
443 (optionally) _initialize_pre_login() methods. The _perform_login method will
444 be called between _initialize_pre_login and _real_initialize if credentials
445 are passed by the user. In cases where it is necessary to have the login
446 process as part of the extraction rather than initialization, _perform_login
447 can be left undefined.
e6f21b3d 448
4248dad9 449 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
450 geo restriction bypass mechanisms for a particular extractor.
451 Though it won't disable explicit geo restriction bypass based on
504f20dd 452 country code provided with geo_bypass_country.
4248dad9
S
453
454 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
455 countries for this extractor. One of these countries will be used by
456 geo restriction bypass mechanism right away in order to bypass
504f20dd 457 geo restriction, of course, if the mechanism is not disabled.
773f291d 458
5f95927a
S
459 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
460 IP blocks in CIDR notation for this extractor. One of these IP blocks
461 will be used by geo restriction bypass mechanism similarly
504f20dd 462 to _GEO_COUNTRIES.
3ccdde8c 463
e6f21b3d 464 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
465 in order to warn the users and skip the tests.
466 """
467
# Set to True by initialize() once the one-time setup steps have run
_ready = False
# YoutubeDL instance this extractor works for; assigned by set_downloader()
_downloader = None
# Fake IP used as X-Forwarded-For once geo bypass has picked one
_x_forwarded_for_ip = None
# Set to False to disable the geo restriction bypass for an extractor
_GEO_BYPASS = True
# Optional list of presumably geo unrestricted countries
_GEO_COUNTRIES = None
# Optional list of presumably geo unrestricted IP blocks in CIDR notation
_GEO_IP_BLOCKS = None
# Set to False for broken extractors, to warn the users and skip the tests
_WORKING = True
# A truthy value makes supports_login() report True
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None
d6983cb4 478
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a user-facing hint describing how to authenticate.

    method picks the wording: None yields an empty string, 'any' mentions
    cookies as well as credentials, 'password' only credentials and
    'cookies' only cookies. When left as NO_DEFAULT, 'any' is used if
    supports_login() is true and 'cookies' otherwise.
    netrc replaces _NETRC_MACHINE as the machine name shown for --netrc.
    """
    machine = netrc or self._NETRC_MACHINE
    password_hint = f'--username and --password, or --netrc ({machine}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
    }
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]
9d5d4d64 489
d6983cb4 490 def __init__(self, downloader=None):
49a57e70 491 """Constructor. Receives an optional downloader (a YoutubeDL instance).
492 If a downloader is not passed during initialization,
493 it must be set using "set_downloader()" before "extract()" is called"""
d6983cb4 494 self._ready = False
773f291d 495 self._x_forwarded_for_ip = None
28f436ba 496 self._printed_messages = set()
d6983cb4
PH
497 self.set_downloader(downloader)
498
499 @classmethod
5ad28e7f 500 def _match_valid_url(cls, url):
79cb2577
PH
501 # This does not use has/getattr intentionally - we want to know whether
502 # we have cached the regexp for *this* class, whereas getattr would also
503 # match the superclass
504 if '_VALID_URL_RE' not in cls.__dict__:
2c4aaadd 505 if '_VALID_URL' not in cls.__dict__:
506 cls._VALID_URL = cls._make_valid_url()
79cb2577 507 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 508 return cls._VALID_URL_RE.match(url)
509
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True or False)."""
    # This function must import everything it needs (except other
    # extractors), so that lazy_extractors works correctly
    matched = cls._match_valid_url(url)
    return matched is not None
d6983cb4 516
ed9266db
PH
517 @classmethod
518 def _match_id(cls, url):
5ad28e7f 519 return cls._match_valid_url(url).group('id')
ed9266db 520
@classmethod
def get_temp_id(cls, url):
    """Return the id matched in url, or None when it cannot be determined."""
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        # No match at all, or the pattern has no 'id' group
        temp_id = None
    return temp_id
527
d6983cb4
PH
@classmethod
def working(cls):
    """Return the value of the class's _WORKING attribute."""
    is_working = cls._WORKING
    return is_working
532
@classmethod
def supports_login(cls):
    """Return True if the class has a truthy _NETRC_MACHINE, else False."""
    return True if cls._NETRC_MACHINE else False
536
d6983cb4
PH
def initialize(self):
    """Initializes an instance (authentication, etc).

    The printed-message set is reset and geo bypass is (re)initialized on
    every call. The remaining steps - pre-login setup, login and
    _real_initialize() - run only while the instance is not marked ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)
    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # Credentials were supplied but this extractor cannot use them
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True
554
5f95927a 555 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
556 """
557 Initialize geo restriction bypass mechanism.
558
559 This method is used to initialize geo bypass mechanism based on faking
560 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 561 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
562 IP will be passed as X-Forwarded-For HTTP header in all subsequent
563 HTTP requests.
e39b5d4a
S
564
565 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
566 during the instance initialization with _GEO_COUNTRIES and
567 _GEO_IP_BLOCKS.
e39b5d4a 568
5f95927a 569 You may also manually call it from extractor's code if geo bypass
e39b5d4a 570 information is not available beforehand (e.g. obtained during
5f95927a
S
571 extraction) or due to some other reason. In this case you should pass
572 this information in geo bypass context passed as first argument. It may
573 contain following fields:
574
575 countries: List of geo unrestricted countries (similar
576 to _GEO_COUNTRIES)
577 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
578 (similar to _GEO_IP_BLOCKS)
579
e39b5d4a 580 """
773f291d 581 if not self._x_forwarded_for_ip:
5f95927a
S
582
583 # Geo bypass mechanism is explicitly disabled by user
a06916d9 584 if not self.get_param('geo_bypass', True):
5f95927a
S
585 return
586
587 if not geo_bypass_context:
588 geo_bypass_context = {}
589
590 # Backward compatibility: previously _initialize_geo_bypass
591 # expected a list of countries, some 3rd party code may still use
592 # it this way
593 if isinstance(geo_bypass_context, (list, tuple)):
594 geo_bypass_context = {
595 'countries': geo_bypass_context,
596 }
597
598 # The whole point of geo bypass mechanism is to fake IP
599 # as X-Forwarded-For HTTP header based on some IP block or
600 # country code.
601
602 # Path 1: bypassing based on IP block in CIDR notation
603
604 # Explicit IP block specified by user, use it right away
605 # regardless of whether extractor is geo bypassable or not
a06916d9 606 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
607
608 # Otherwise use random IP block from geo bypass context but only
609 # if extractor is known as geo bypassable
610 if not ip_block:
611 ip_blocks = geo_bypass_context.get('ip_blocks')
612 if self._GEO_BYPASS and ip_blocks:
613 ip_block = random.choice(ip_blocks)
614
615 if ip_block:
616 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
8a82af35 617 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
5f95927a
S
618 return
619
620 # Path 2: bypassing based on country code
621
622 # Explicit country code specified by user, use it right away
623 # regardless of whether extractor is geo bypassable or not
a06916d9 624 country = self.get_param('geo_bypass_country', None)
5f95927a
S
625
626 # Otherwise use random country code from geo bypass context but
627 # only if extractor is known as geo bypassable
628 if not country:
629 countries = geo_bypass_context.get('countries')
630 if self._GEO_BYPASS and countries:
631 country = random.choice(countries)
632
633 if country:
634 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 635 self._downloader.write_debug(
86e5f3ed 636 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
d6983cb4
PH
637
def extract(self, url):
    """Extract the information for url and return the result dictionary.

    Returns whatever _real_extract() produced (None is passed through).
    A non-None result gets '__x_forwarded_for_ip' added when a fake IP is
    in use, and its 'live_chat' subtitles are removed when the
    'no-live-chat' compat option is set.

    A GeoRestrictedError triggers one retry if a fake IP could be set up.
    Other ExtractorErrors are re-raised as the same type with video id,
    extractor name and traceback filled in; UnsupportedError is re-raised
    untouched. IncompleteRead, KeyError and StopIteration are wrapped in
    ExtractorError.
    """
    try:
        # At most two attempts: the second only after a geo restriction
        # made us pick a fake IP
        for _ in range(2):
            try:
                self.initialize()
                self.write_debug('Extracting URL: %s' % url)
                info = self._real_extract(url)
                if info is None:
                    return None
                if self._x_forwarded_for_ip:
                    info['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subs = info.get('subtitles')
                if subs and 'live_chat' in subs and 'no-live-chat' in self.get_param('compat_opts', []):
                    del subs['live_chat']
                return info
            except GeoRestrictedError as e:
                if not self.__maybe_fake_ip_and_retry(e.countries):
                    raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        kwargs = dict(
            video_id=e.video_id or self.get_temp_id(url),
            ie=self.IE_NAME,
            tb=e.traceback or sys.exc_info()[2],
            expected=e.expected,
            cause=e.cause,
        )
        if hasattr(e, 'countries'):
            kwargs['countries'] = e.countries
        raise type(e)(e.orig_msg, **kwargs)
    except http.client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 676
def __maybe_fake_ip_and_retry(self, countries):
    """Pick a fake IP from countries if allowed; return True to retry.

    Nothing is done (and False is returned) when the user gave an explicit
    geo_bypass_country, when geo bypass is disabled for the extractor or by
    the user, when a fake IP is already in use, or when countries is empty.
    """
    if self.get_param('geo_bypass_country', None):
        return False
    if not self._GEO_BYPASS:
        return False
    if not self.get_param('geo_bypass', True):
        return False
    if self._x_forwarded_for_ip or not countries:
        return False

    country_code = random.choice(countries)
    self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    if not self._x_forwarded_for_ip:
        return False
    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (self._x_forwarded_for_ip, country_code.upper()))
    return True
691
def set_downloader(self, downloader):
    """Attach the given YoutubeDL instance to this IE as its downloader."""
    setattr(self, '_downloader', downloader)
695
@property
def cache(self):
    """The `cache` attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cache
699
@property
def cookiejar(self):
    """The `cookiejar` attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cookiejar
703
52efa4b3 704 def _initialize_pre_login(self):
962ffcf8 705 """ Initialization before login. Redefine in subclasses."""
52efa4b3 706 pass
707
708 def _perform_login(self, username, password):
709 """ Login with username and password. Redefine in subclasses."""
710 pass
711
d6983cb4
PH
712 def _real_initialize(self):
713 """Real initialization process. Redefine in subclasses."""
714 pass
715
716 def _real_extract(self, url):
717 """Real extraction process. Redefine in subclasses."""
08d30158 718 raise NotImplementedError('This method must be implemented by subclasses')
d6983cb4 719
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # The key is the class name without its last two characters
    class_name = cls.__name__
    return class_name[:-2]
56c73665 724
@classproperty
def IE_NAME(cls):
    """Class name without its last two characters."""
    class_name = cls.__name__
    return class_name[:-2]
d6983cb4 728
d391b7e2
S
729 @staticmethod
730 def __can_accept_status_code(err, expected_status):
ac668111 731 assert isinstance(err, urllib.error.HTTPError)
d391b7e2
S
732 if expected_status is None:
733 return False
d391b7e2
S
734 elif callable(expected_status):
735 return expected_status(err.code) is True
736 else:
6606817a 737 return err.code in variadic(expected_status)
d391b7e2 738
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """
    Build the request object for a URL or an existing request.

    An existing urllib.request.Request is updated with data/headers/query;
    for anything else the query (if any) is merged into the URL and a new
    sanitized request is created.
    """
    if not isinstance(url_or_request, urllib.request.Request):
        target_url = update_url_query(url_or_request, query) if query else url_or_request
        return sanitized_Request(target_url, data, headers or {})
    return update_Request(url_or_request, data=data, headers=headers, query=query)
f95b9dee 745
c043c246 746 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
d391b7e2
S
747 """
748 Return the response handle.
749
750 See _download_webpage docstring for arguments specification.
751 """
1cf376f5 752 if not self._downloader._first_webpage_request:
49a57e70 753 sleep_interval = self.get_param('sleep_interval_requests') or 0
1cf376f5 754 if sleep_interval > 0:
5ef7d9bd 755 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 756 time.sleep(sleep_interval)
757 else:
758 self._downloader._first_webpage_request = False
759
d6983cb4
PH
760 if note is None:
761 self.report_download_webpage(video_id)
762 elif note is not False:
7cc3570e 763 if video_id is None:
86e5f3ed 764 self.to_screen(str(note))
7cc3570e 765 else:
86e5f3ed 766 self.to_screen(f'{video_id}: {note}')
2132edaa
S
767
768 # Some sites check X-Forwarded-For HTTP header in order to figure out
769 # the origin of the client behind proxy. This allows bypassing geo
770 # restriction by faking this header's value to IP that belongs to some
771 # geo unrestricted country. We will do so once we encounter any
772 # geo restriction error.
773 if self._x_forwarded_for_ip:
c043c246 774 headers = (headers or {}).copy()
775 headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)
2132edaa 776
d6983cb4 777 try:
f95b9dee 778 return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
3158150c 779 except network_exceptions as err:
ac668111 780 if isinstance(err, urllib.error.HTTPError):
d391b7e2 781 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
782 # Retain reference to error to prevent file object from
783 # being closed before it can be read. Works around the
784 # effects of <https://bugs.python.org/issue15002>
785 # introduced in Python 3.4.1.
786 err.fp._error = err
d391b7e2
S
787 return err.fp
788
aa94a6d3
PH
789 if errnote is False:
790 return False
d6983cb4 791 if errnote is None:
f1a9d64e 792 errnote = 'Unable to download webpage'
7f8b2714 793
86e5f3ed 794 errmsg = f'{errnote}: {error_to_compat_str(err)}'
7cc3570e 795 if fatal:
497d2fab 796 raise ExtractorError(errmsg, cause=err)
7cc3570e 797 else:
6a39ee13 798 self.report_warning(errmsg)
7cc3570e 799 return False
d6983cb4 800
1890fc63 801 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
802 encoding=None, data=None, headers={}, query={}, expected_status=None):
d391b7e2
S
803 """
804 Return a tuple (page content as string, URL handle).
805
617f658b 806 Arguments:
807 url_or_request -- plain text URL as a string or
ac668111 808 a urllib.request.Request object
617f658b 809 video_id -- Video/playlist/item identifier (string)
810
811 Keyword arguments:
812 note -- note printed before downloading (string)
813 errnote -- note printed in case of an error (string)
814 fatal -- flag denoting whether error should be considered fatal,
815 i.e. whether it should cause ExtractionError to be raised,
816 otherwise a warning will be reported and extraction continued
817 encoding -- encoding for a page content decoding, guessed automatically
818 when not explicitly specified
819 data -- POST data (bytes)
820 headers -- HTTP headers (dict)
821 query -- URL query (dict)
822 expected_status -- allows to accept failed HTTP requests (non 2xx
823 status code) by explicitly specifying a set of accepted status
824 codes. Can be any of the following entities:
825 - an integer type specifying an exact failed status code to
826 accept
827 - a list or a tuple of integer types specifying a list of
828 failed status codes to accept
829 - a callable accepting an actual failed status code and
830 returning True if it should be accepted
831 Note that this argument does not affect success status codes (2xx)
832 which are always accepted.
d391b7e2 833 """
617f658b 834
b9d3e163 835 # Strip hashes from the URL (#1038)
14f25df2 836 if isinstance(url_or_request, str):
b9d3e163
PH
837 url_or_request = url_or_request.partition('#')[0]
838
d391b7e2 839 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
840 if urlh is False:
841 assert not fatal
842 return False
c9a77969 843 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
844 return (content, urlh)
845
c9a77969
YCH
846 @staticmethod
847 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
848 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
849 if m:
850 encoding = m.group(1)
851 else:
0d75ae2c 852 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
853 webpage_bytes[:1024])
854 if m:
855 encoding = m.group(1).decode('ascii')
b60016e8
PH
856 elif webpage_bytes.startswith(b'\xff\xfe'):
857 encoding = 'utf-16'
f143d86a
PH
858 else:
859 encoding = 'utf-8'
c9a77969
YCH
860
861 return encoding
862
4457823d
S
863 def __check_blocked(self, content):
864 first_block = content[:512]
3089bc74
S
865 if ('<title>Access to this site is blocked</title>' in content
866 and 'Websense' in first_block):
4457823d
S
867 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
868 blocked_iframe = self._html_search_regex(
869 r'<iframe src="([^"]+)"', content,
870 'Websense information URL', default=None)
871 if blocked_iframe:
872 msg += ' Visit %s for more details' % blocked_iframe
873 raise ExtractorError(msg, expected=True)
874 if '<title>The URL you requested has been blocked</title>' in first_block:
875 msg = (
876 'Access to this webpage has been blocked by Indian censorship. '
877 'Use a VPN or proxy server (with --proxy) to route around it.')
878 block_msg = self._html_search_regex(
879 r'</h1><p>(.*?)</p>',
880 content, 'block message', default=None)
881 if block_msg:
882 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
883 raise ExtractorError(msg, expected=True)
3089bc74
S
884 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
885 and 'blocklist.rkn.gov.ru' in content):
4457823d
S
886 raise ExtractorError(
887 'Access to this webpage has been blocked by decision of the Russian government. '
888 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
889 expected=True)
890
def _request_dump_filename(self, url, video_id):
    """
    Build the name of the dump file for a request from video_id and url.

    Names longer than the 'trim_file_name' param (240 when unset) are cut
    and suffixed with an MD5 digest of the untrimmed name.
    """
    stem = f'{video_id}_{url}'
    max_length = self.get_param('trim_file_name') or 240
    if len(stem) > max_length:
        digest_suffix = '___' + hashlib.md5(stem.encode('utf-8')).hexdigest()
        stem = stem[:max_length - len(digest_suffix)] + digest_suffix
    dump_name = sanitize_filename(f'{stem}.dump', restricted=True)
    if compat_os_name != 'nt':
        return dump_name
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absolute_path = os.path.abspath(dump_name)
    if len(absolute_path) > 259:
        dump_name = '\\\\?\\' + absolute_path
    return dump_name
905
906 def __decode_webpage(self, webpage_bytes, encoding, headers):
907 if not encoding:
908 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
909 try:
910 return webpage_bytes.decode(encoding, 'replace')
911 except LookupError:
912 return webpage_bytes.decode('utf-8', 'replace')
913
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the response body from `urlh` and return it decoded as a string.

    `prefix` (bytes), when given, is prepended to the body. Depending on the
    'dump_intermediate_pages' and 'write_pages' params the raw bytes are also
    printed base64-encoded and/or saved to a dump file. The decoded content
    is checked for known block pages before being returned.
    """
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))

    if self.get_param('write_pages'):
        dump_path = self._request_dump_filename(urlh.geturl(), video_id)
        self.to_screen(f'Saving request to {dump_path}')
        with open(dump_path, 'wb') as dump_file:
            dump_file.write(raw)

    page = self.__decode_webpage(raw, encoding, urlh.headers)
    self.__check_blocked(page)

    return page
d6983cb4 932
6edf2808 933 def __print_error(self, errnote, fatal, video_id, err):
934 if fatal:
c6e07cf1 935 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
6edf2808 936 elif errnote:
c6e07cf1 937 self.report_warning(f'{video_id}: {errnote}: {err}')
6edf2808 938
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """
    Parse an XML string (after applying transform_source, if any) and return
    the parsed document. Parse errors are handed to the error reporter with
    the note 'Failed to parse XML' unless errnote is given.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as parse_error:
        note = errnote if errnote is not None else 'Failed to parse XML'
        self.__print_error(note, fatal, video_id, parse_error)
267ed0c5 946
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """
    Parse a JSON string with LenientJSONDecoder (non-strict) and return the
    result. Parse errors are handed to the error reporter with the note
    'Failed to parse JSON' unless errnote is given.
    """
    decoder_options = dict(
        cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
    try:
        return json.loads(json_string, **decoder_options)
    except ValueError as parse_error:
        note = errnote if errnote is not None else 'Failed to parse JSON'
        self.__print_error(note, fatal, video_id, parse_error)
3d3538e4 953
6edf2808 954 def _parse_socket_response_as_json(self, data, *args, **kwargs):
955 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
adddc50c 956
617f658b 957 def __create_download_methods(name, parser, note, errnote, return_value):
958
6edf2808 959 def parse(ie, content, *args, errnote=errnote, **kwargs):
617f658b 960 if parser is None:
961 return content
6edf2808 962 if errnote is False:
963 kwargs['errnote'] = errnote
617f658b 964 # parser is fetched by name so subclasses can override it
965 return getattr(ie, parser)(content, *args, **kwargs)
966
c4910024 967 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
968 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
969 res = self._download_webpage_handle(
970 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
971 data=data, headers=headers, query=query, expected_status=expected_status)
617f658b 972 if res is False:
973 return res
974 content, urlh = res
6edf2808 975 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
617f658b 976
f95b9dee 977 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
c4910024 978 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
f95b9dee 979 if self.get_param('load_pages'):
980 url_or_request = self._create_request(url_or_request, data, headers, query)
981 filename = self._request_dump_filename(url_or_request.full_url, video_id)
982 self.to_screen(f'Loading request from {filename}')
983 try:
984 with open(filename, 'rb') as dumpf:
985 webpage_bytes = dumpf.read()
986 except OSError as e:
987 self.report_warning(f'Unable to load request from disk: {e}')
988 else:
989 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
6edf2808 990 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
c4910024 991 kwargs = {
992 'note': note,
993 'errnote': errnote,
994 'transform_source': transform_source,
995 'fatal': fatal,
996 'encoding': encoding,
997 'data': data,
998 'headers': headers,
999 'query': query,
1000 'expected_status': expected_status,
1001 }
617f658b 1002 if parser is None:
c4910024 1003 kwargs.pop('transform_source')
617f658b 1004 # The method is fetched by name so subclasses can override _download_..._handle
c4910024 1005 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
617f658b 1006 return res if res is False else res[0]
1007
1008 def impersonate(func, name, return_value):
1009 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1010 func.__doc__ = f'''
1011 @param transform_source Apply this transformation before parsing
1012 @returns {return_value}
1013
1014 See _download_webpage_handle docstring for other arguments specification
1015 '''
1016
1017 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1018 impersonate(download_content, f'_download_{name}', f'{return_value}')
1019 return download_handle, download_content
1020
# Generated download helpers. Each pair is (handle variant, content-only variant)
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml',
    'Downloading XML', 'Unable to download XML',
    'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json',
    'Downloading JSON metadata', 'Unable to download JSON metadata',
    'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json',
    'Polling socket', 'Unable to poll socket',
    'JSON object as a dict')
# Only the content-only variant is kept for plain webpages (no parser)
_, __download_webpage = __create_download_methods(
    'webpage', None, None, None, 'data of the page as a string')
del _
adddc50c 1028
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    The download is retried when the response is cut short
    (http.client.IncompleteRead) until `tries` attempts have been made; the
    last such error is re-raised.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not given)

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries/timeout are unused by most callers; should they be deprecated?
    # The fallback must be applied regardless of that decision: without it
    # the NO_DEFAULT sentinel itself would be passed on to self._sleep
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except http.client.IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
adddc50c 1060
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """
    Report a warning through the downloader, prefixed with '[ie_name]' and
    the formatted video_id. With only_once=True a message that was already
    recorded in self._printed_messages is not reported again.
    """
    id_prefix = format_field(video_id, None, '%s: ')
    full_msg = f'[{self.IE_NAME}] {id_prefix}{msg}'
    if only_once:
        seen_key = f'WARNING: {full_msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(full_msg, *args, **kwargs)
f45f96f8 1069
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen via the downloader, prefixing it with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.to_screen(prefixed, *args, **kwargs)
a06916d9 1073
def write_debug(self, msg, *args, **kwargs):
    """Pass msg to the downloader's write_debug, prefixing it with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.write_debug(prefixed, *args, **kwargs)
a06916d9 1076
def get_param(self, name, default=None, *args, **kwargs):
    """
    Look up `name` in the params of the attached downloader.
    `default` is returned when there is no downloader (or the key is missing).
    """
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
d6983cb4 1081
def report_drm(self, video_id, partial=False):
    """Report via raise_no_formats (as an expected error) that the video is DRM protected"""
    drm_message = 'This video is DRM protected'
    self.raise_no_formats(drm_message, expected=True, video_id=video_id)
1084
def report_extraction(self, id_or_name):
    """Print the 'Extracting information' notice for the given id or name."""
    notice = '%s: Extracting information' % id_or_name
    self.to_screen(notice)
d6983cb4
PH
1088
def report_download_webpage(self, video_id):
    """Print the 'Downloading webpage' notice for the given video id."""
    notice = '%s: Downloading webpage' % video_id
    self.to_screen(notice)
d6983cb4
PH
1092
def report_age_confirmation(self):
    """Print the notice for an attempt to confirm age."""
    notice = 'Confirming age'
    self.to_screen(notice)
d6983cb4 1096
def report_login(self):
    """Print the notice for an attempt to log in."""
    notice = 'Logging in'
    self.to_screen(notice)
fc79158d 1100
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """
    Raise an expected ExtractorError saying that login is required, with the
    login hint for `method` appended to the message.

    When metadata_available is set and either the 'ignore_no_formats_error'
    or the 'wait_for_video' param is enabled, only a warning is reported.
    """
    if metadata_available:
        tolerate_missing_formats = (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
        if tolerate_missing_formats:
            self.report_warning(msg)
            return
    login_hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + login_hint, expected=True)
43e7d3c9 1110
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """
    Raise GeoRestrictedError for the given countries.

    When metadata_available is set and either the 'ignore_no_formats_error'
    or the 'wait_for_video' param is enabled, only a warning is reported.
    """
    if metadata_available:
        tolerate_missing_formats = (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
        if tolerate_missing_formats:
            self.report_warning(msg)
            return
    raise GeoRestrictedError(msg, countries=countries)
1119
def raise_no_formats(self, msg, expected=False, video_id=None):
    """
    Raise an ExtractorError for `msg` (which is raised as is when it already
    is an ExtractorError).

    When the error is expected and either the 'ignore_no_formats_error' or
    the 'wait_for_video' param is enabled, only a warning is reported.
    """
    if expected:
        tolerate_missing_formats = (
            self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
        if tolerate_missing_formats:
            self.report_warning(msg, video_id)
            return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1128
5f6a1245 1129 # Methods for following #608
c0d0b01f 1130 @staticmethod
311b6615 1131 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
10952eb2 1132 """Returns a URL that points to a page that should be processed"""
311b6615 1133 if ie is not None:
1134 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
7012b23c 1135 if video_id is not None:
311b6615 1136 kwargs['id'] = video_id
830d53bf 1137 if video_title is not None:
311b6615 1138 kwargs['title'] = video_title
1139 return {
1140 **kwargs,
1141 '_type': 'url_transparent' if url_transparent else 'url',
1142 'url': url,
1143 }
1144
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
    """
    Build a playlist from `matches`: each unique match (mapped through
    `getter` when given) becomes a url_result entry for extractor `ie`;
    video_kwargs are passed to every entry, kwargs to the playlist itself.
    """
    unique_matches = orderedSet(map(getter, matches) if getter else matches)

    def entries():
        # Entries are created lazily, as the playlist is consumed
        for match in unique_matches:
            yield self.url_result(self._proto_relative_url(match), ie, **(video_kwargs or {}))

    return self.playlist_result(entries(), playlist_id, playlist_title, **kwargs)
46b18f23 1149
c0d0b01f 1150 @staticmethod
311b6615 1151 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
d6983cb4 1152 """Returns a playlist"""
d6983cb4 1153 if playlist_id:
311b6615 1154 kwargs['id'] = playlist_id
d6983cb4 1155 if playlist_title:
311b6615 1156 kwargs['title'] = playlist_title
ecc97af3 1157 if playlist_description is not None:
311b6615 1158 kwargs['description'] = playlist_description
1159 return {
1160 **kwargs,
1161 '_type': 'multi_video' if multi_video else 'playlist',
1162 'entries': entries,
1163 }
d6983cb4 1164
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    `group` selects what is returned on a match: None returns the first
    group that is not None, a list/tuple returns a tuple of those groups and
    anything else is passed to the match's group().
    """
    # Start from "no match" so that a None string and an empty collection of
    # patterns are both treated as a plain failure to match
    mobj = None
    if string is not None:
        patterns = [pattern] if isinstance(pattern, (str, re.Pattern)) else pattern
        for p in patterns:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default

    # The styled field name is only needed for the error/warning messages
    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None
1199
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
    """
    Searches string for the JSON object specified by start_pattern

    The object found after start_pattern is parsed with _parse_json
    (ignore_extra=True; kwargs are passed along). Passing `default` makes the
    search non-fatal and silent; without it an empty dict is returned on
    non-fatal failure.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    json_re = rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}'
    json_string = self._search_regex(
        json_re, string, name, group='json', fatal=fatal,
        default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
b7c47b74 1226
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but a truthy result is passed through clean_html and
    stripped. Falsy results are returned unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
1236
2118fdd1
RA
1237 def _get_netrc_login_info(self, netrc_machine=None):
1238 username = None
1239 password = None
1240 netrc_machine = netrc_machine or self._NETRC_MACHINE
1241
a06916d9 1242 if self.get_param('usenetrc', False):
2118fdd1 1243 try:
0001fcb5 1244 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1245 if os.path.isdir(netrc_file):
1246 netrc_file = os.path.join(netrc_file, '.netrc')
1247 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
2118fdd1
RA
1248 if info is not None:
1249 username = info[0]
1250 password = info[2]
1251 else:
dcce092e
S
1252 raise netrc.NetrcParseError(
1253 'No authenticators for %s' % netrc_machine)
86e5f3ed 1254 except (OSError, netrc.NetrcParseError) as err:
6a39ee13 1255 self.report_warning(
dcce092e 1256 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1257
dcce092e 1258 return username, password
2118fdd1 1259
1b6712ab 1260 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1261 """
cf0649f8 1262 Get the login info as (username, password)
32443dd3
S
1263 First look for the manually specified credentials using username_option
1264 and password_option as keys in params dictionary. If no such credentials
1265 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1266 value.
fc79158d
JMF
1267 If there's no info available, return (None, None)
1268 """
fc79158d
JMF
1269
1270 # Attempt to use provided username and password or .netrc data
a06916d9 1271 username = self.get_param(username_option)
1272 if username is not None:
1273 password = self.get_param(password_option)
2118fdd1 1274 else:
1b6712ab 1275 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1276
2133565c 1277 return username, password
fc79158d 1278
e64b7569 1279 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1280 """
1281 Get the two-factor authentication info
1282 TODO - asking the user will be required for sms/phone verify
1283 currently just uses the command line option
1284 If there's no info available, return None
1285 """
83317f69 1286
a06916d9 1287 tfa = self.get_param('twofactor')
1288 if tfa is not None:
1289 return tfa
83317f69 1290
ac668111 1291 return getpass.getpass('Type %s and press [Return]: ' % note)
83317f69 1292
46720279
JMF
1293 # Helper functions for extracting OpenGraph info
1294 @staticmethod
ab2d5247 1295 def _og_regexes(prop):
448ef1f3 1296 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
fbfde1c3
F
1297 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1298 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
78fb87b2 1299 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1300 return [
78fb87b2
JMF
1301 template % (property_re, content_re),
1302 template % (content_re, property_re),
ab2d5247 1303 ]
46720279 1304
864f24bd
S
1305 @staticmethod
1306 def _meta_regex(prop):
1307 return r'''(?isx)<meta
8b9848ac 1308 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1309 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1310
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search `html` for one or more OpenGraph properties and return the first
    value found with HTML entities unescaped (None if the search gave None).
    `name` defaults to 'OpenGraph <first property>'; kargs are passed to
    _search_regex.
    """
    props = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % props[0]
    candidate_regexes = [regex for p in props for regex in self._og_regexes(p)]
    raw_value = self._search_regex(candidate_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if raw_value is None else unescapeHTML(raw_value)
46720279
JMF
1322
1323 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1324 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1325
1326 def _og_search_description(self, html, **kargs):
1327 return self._og_search_property('description', html, fatal=False, **kargs)
1328
04f3fd2c 1329 def _og_search_title(self, html, *, fatal=False, **kargs):
1330 return self._og_search_property('title', html, fatal=fatal, **kargs)
46720279 1331
8ffa13e0 1332 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1333 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1334 if secure:
1335 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1336 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1337
78338f71
JMF
1338 def _og_search_url(self, html, **kargs):
1339 return self._og_search_property('url', html, **kargs)
1340
04f3fd2c 1341 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
21633673 1342 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
77cc7c6e 1343
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Search `html` for the content of the <meta> tag identified by one or
    more names. display_name defaults to the first name; kwargs are passed
    to _html_search_regex.
    """
    names = variadic(name)
    if display_name is None:
        display_name = names[0]
    meta_regexes = list(map(self._meta_regex, names))
    return self._html_search_regex(
        meta_regexes, html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1351
1352 def _dc_search_uploader(self, html):
1353 return self._html_search_meta('dc.creator', html, 'uploader')
1354
8dbe9899
PH
1355 def _rta_search(self, html):
1356 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1357 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1358 r' content="RTA-5042-1996-1400-1577-RTA"',
1359 html):
1360 return 18
1361 return 0
1362
59040888
PH
1363 def _media_rating_search(self, html):
1364 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1365 rating = self._html_search_meta('rating', html)
1366
1367 if not rating:
1368 return None
1369
1370 RATING_TABLE = {
1371 'safe for kids': 0,
1372 'general': 8,
1373 '14 years': 14,
1374 'mature': 17,
1375 'restricted': 19,
1376 }
d800609c 1377 return RATING_TABLE.get(rating.lower())
59040888 1378
69319969 1379 def _family_friendly_search(self, html):
6ca7732d 1380 # See http://schema.org/VideoObject
ac8491fc
S
1381 family_friendly = self._html_search_meta(
1382 'isFamilyFriendly', html, default=None)
69319969
NJ
1383
1384 if not family_friendly:
1385 return None
1386
1387 RATING_TABLE = {
1388 '1': 0,
1389 'true': 0,
1390 '0': 18,
1391 'false': 18,
1392 }
d800609c 1393 return RATING_TABLE.get(family_friendly.lower())
69319969 1394
0c708f11
JMF
1395 def _twitter_search_player(self, html):
1396 return self._html_search_meta('twitter:player', html,
9e1a5b84 1397 'twitter card player')
0c708f11 1398
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """
    Yield all json ld objects in the html

    Every JSON_LD_RE match is parsed and the dicts found (directly or as
    items of the parsed value) are yielded. Passing `default` makes parsing
    non-fatal.
    """
    if default is not NO_DEFAULT:
        fatal = False
    for ld_match in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(ld_match.group('json_ld'), video_id, fatal=fatal)
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1408
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """
    Search for a video in any json ld in the html

    Returns the info extracted by _json_ld. When nothing was extracted:
    `default` is returned if given, RegexNotFoundError is raised if fatal,
    and otherwise a warning is reported and an empty dict returned.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False

    json_ld_objects = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(json_ld_objects, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
4ca2a3cf 1425
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """Build an info dict from JSON-LD data.

    @param json_ld        A JSON string (parsed here with _parse_json), a single
                          dict, or a list/tuple of JSON-LD objects
    @param video_id       Passed through to _parse_json
    @param fatal          Passed through to _parse_json
    @param expected_type  If given, top-level objects whose '@type' does not
                          include it are skipped, and traversal stops after the
                          first object that was processed
    @returns              The collected fields passed through filter_dict;
                          {} when json_ld is empty or of an unsupported type
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}
    if not isinstance(json_ld, (list, tuple, dict)):
        return info
    # Normalize a single object to a one-element list
    if isinstance(json_ld, dict):
        json_ld = [json_ld]

    # Last path component of an interactionType -> prefix of the '<kind>_count' info key
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # '@type' may be a single value or a collection; True if any expected type is present
        type = variadic(traverse_obj(e, '@type'))
        return any(x in type for x in expected_types)

    def extract_interaction_type(e):
        # interactionType may be given directly or as an object carrying '@type'
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Fills info['<kind>_count'] from InteractionCounter entries;
        # a count that is already set is never overwritten
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            # Only the part after the last '/' is looked up (the type may be a URL)
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # Builds info['chapters'] from 'hasPart' entries whose '@type' is 'Clip'
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
        # Fill missing boundaries from the neighbouring chapters. The zip stops at
        # the shortest input (chapters[1:]), so the last chapter is not visited here
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # The last chapter falls back to the video duration for its end time
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        # Copies the VideoObject's fields into info (overwriting these keys),
        # then adds interaction counts and chapters
        assert is_type(e, 'VideoObject')
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
        })
        extract_interaction_statistic(e)
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in json_ld:
            # Top-level objects must declare '@context'
            if at_top_level and '@context' not in e:
                continue
            # A pure {'@context', '@graph'} wrapper: process its members instead,
            # then stop iterating the current level
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                break
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                # The episode name only serves as title when no title was found yet
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                # Use the first embedded video ('video' is tried before 'subjectOf')
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject'):
                extract_video_object(e)
                # Without an expected type keep merging further objects;
                # with one, the first processed object ends the traversal
                if expected_type is None:
                    continue
                else:
                    break
            # Non-VideoObject entries may still carry a single nested 'video' object
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break
    traverse_json_ld(json_ld)

    # NOTE(review): filter_dict presumably drops the None-valued entries - confirm in utils
    return filter_dict(info)
4ca2a3cf 1590
135dfa2c 1591 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
f98709af
LL
1592 return self._parse_json(
1593 self._search_regex(
1594 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
135dfa2c 1595 webpage, 'next.js data', fatal=fatal, **kw),
1596 video_id, transform_source=transform_source, fatal=fatal)
f98709af 1597
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    @param webpage       HTML to search
    @param video_id      Passed through to _parse_json
    @param context_name  Name of the JS context; regex-escaped before use
    @param fatal         Passed to both _search_regex and _parse_json
    @param traverse      Path applied to the parsed data with traverse_obj
    @returns             The traversed data, or {} when nothing was found
    """
    rectx = re.escape(context_name)
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    groups = self._search_regex(
        (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)
    if not groups:
        # Non-fatal miss: there is nothing to unpack, so return the same
        # empty result as for unparsable data
        return {}
    js, arg_keys, arg_vals = groups

    # Pair the function's parameter names with the values it is invoked with;
    # JS "undefined" has no JSON counterpart, so it is mapped to null
    args = {
        key: 'null' if val in ('undefined', 'void 0') else val
        for key, val in zip(arg_keys.split(','), arg_vals.split(','))
    }

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
    return traverse_obj(ret, traverse) or {}
66f4c04e 1614
@staticmethod
def _hidden_inputs(html):
    """Collect the values of hidden and submit <input> elements found in html.

    @param html  HTML to scan; HTML comments are removed before scanning
    @returns     dict mapping each input's name (or its id when it has no
                 name) to its value; inputs without a value attribute or
                 without name/id are skipped, later duplicates overwrite
                 earlier ones
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        # Skip tags whose attributes could not be parsed (the matched tag
        # itself is never empty, so it is the attributes that are checked)
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        # An empty value is still kept; only a missing one is skipped
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
27713812 1630
cf61d96d
S
1631 def _form_hidden_inputs(self, form_id, html):
1632 form = self._search_regex(
73eb13df 1633 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1634 html, '%s form' % form_id, group='form')
1635 return self._hidden_inputs(form)
1636
class FormatSort:
    """Computes the sort key used to order formats.

    An instance is built from an extractor and its field preference, resolves
    the effective sort order once (evaluate_params) and then maps each format
    dict to a tuple via calculate_preference.

    NOTE: `settings` is defined on the class and is modified through
    `self.settings` (see _get_field_setting, _resolve_field_value and
    evaluate_params), so those modifications are shared by all instances.
    """

    # One sort item: optional '+' (reverse), a field name, and optionally a
    # separator ('~' = closest to the limit, ':' = limit) followed by the limit
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Order appended after the user's and the extractor's items
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
               'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Not referenced inside this class
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Keys read by this class:
    #   type:        'ordered', 'extractor', 'boolean', 'multiple', 'combined', 'alias'
    #                (missing -> 'field')
    #   field:       key(s) of the format dict to read, or the alias target
    #   convert:     how a limit given as text is converted (see _resolve_field_value)
    #   order / order_free / regex: ranking list for 'ordered' fields
    #   forced / priority: placement ahead of the user's items (see evaluate_params)
    #   visible:     whether print_verbose_info lists the field
    #   default, max, in_list, not_in_list, function, same_limit, deprecated
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
        'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # For compatibility with youtube-dl
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ie, field_preference):
        """
        @param ie                The extractor; its _downloader supplies params and output
        @param field_preference  Sort items requested by the extractor
        """
        self._order = []
        self.ydl = ie._downloader
        self.evaluate_params(self.ydl.params, field_preference)
        if ie.get_param('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return settings[field][key], computing and storing a default when it is missing"""
        if field not in self.settings:
            # Probing an unknown field for these flags must neither warn nor register it
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecation_warning(
                f'Using arbitrary fields ({field}) for format sorting is deprecated '
                'and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # The computed default is cached in the settings entry
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a textual value (e.g. a limit) according to the field's 'convert' setting

        A None value is returned as None unless convertNone is set.
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return FileDownloader.parse_bytes(value)
        elif conversion == 'order':
            # Rank within the order list: earlier entries give larger results
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not found in the list rank at the position of the '' entry
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric value switches the field to string conversion from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Resolve the final field order from the params, the extractor's items and the defaults"""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            # The first occurrence of a field wins; later ones are ignored
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Forced fields first, then priority fields (unless format_sort_force is set),
        # then the user's items, the extractor's items and finally the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
                else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecation_warning(
                        f'Format sorting alias {alias} is deprecated '
                        f'and may be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A combined field expands to its member fields; its limit is either
            # split on ':' (one per member) or shared by all members
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Report the user's items, the extractor's items and the visible final order via write_debug"""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's raw value into a tuple usable as part of the sort key"""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the field's value from the format dict and convert it to its sort tuple"""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            # The field's 'function' reduces the member values to a single value
            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key of a format dict

        Missing 'protocol', 'ext', 'audio_ext'/'video_ext' and bitrate values
        are filled in on the given dict before the key is computed.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        # Determine missing bitrates
        if format.get('tbr') is None:
            if format.get('vbr') is not None and format.get('abr') is not None:
                format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
        else:
            if format.get('vcodec') != 'none' and format.get('vbr') is None:
                format['vbr'] = format.get('tbr') - format.get('abr', 0)
            if format.get('acodec') != 'none' and format.get('abr') is None:
                format['abr'] = format.get('tbr') - format.get('vbr', 0)

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
1921
1922 def _sort_formats(self, formats, field_preference=[]):
1923 if not formats:
88acdbc2 1924 return
1d485a1a 1925 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
59040888 1926
96a53167
S
1927 def _check_formats(self, formats, video_id):
1928 if formats:
1929 formats[:] = filter(
1930 lambda f: self._is_valid_url(
1931 f['url'], video_id,
1932 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1933 formats)
1934
f5bdb444
S
1935 @staticmethod
1936 def _remove_duplicate_formats(formats):
1937 format_urls = set()
1938 unique_formats = []
1939 for f in formats:
1940 if f['url'] not in format_urls:
1941 format_urls.add(f['url'])
1942 unique_formats.append(f)
1943 formats[:] = unique_formats
1944
45024183 1945 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1946 url = self._proto_relative_url(url, scheme='http:')
1947 # For now assume non HTTP(S) URLs always valid
1948 if not (url.startswith('http://') or url.startswith('https://')):
1949 return True
96a53167 1950 try:
45024183 1951 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1952 return True
8bdd16b4 1953 except ExtractorError as e:
25e911a9 1954 self.to_screen(
8bdd16b4 1955 '%s: %s URL is invalid, skipping: %s'
1956 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1957 return False
96a53167 1958
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1965
57c7411f
PH
1966 def _proto_relative_url(self, url, scheme=None):
1967 if url is None:
1968 return url
1969 if url.startswith('//'):
1970 if scheme is None:
1971 scheme = self.http_scheme()
1972 return scheme + url
1973 else:
1974 return url
1975
4094b6e3
PH
1976 def _sleep(self, timeout, video_id, msg_template=None):
1977 if msg_template is None:
f1a9d64e 1978 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1979 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1980 self.to_screen(msg)
1981 time.sleep(timeout)
1982
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None, data=None, headers={}, query={}):
    """Download an f4m manifest and return the formats parsed from it

    Returns [] when the download result is False.
    """
    # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
    # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
    download = self._download_xml_handle(
        manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest',
        transform_source=transform_source,
        fatal=fatal, data=data, headers=headers, query=query)
    if download is False:
        return []

    manifest, handle = download
    # Parse relative to the URL reported by the response handle
    return self._parse_f4m_formats(
        manifest, handle.geturl(), video_id, preference=preference, quality=quality, f4m_id=f4m_id,
        transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 2002
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Build the list of formats described by a parsed f4m manifest

    @param manifest          The manifest's root XML element
    @param manifest_url      URL of the manifest; base for relative media URLs
                             when the manifest declares no base URL
    @param video_id          Passed to nested manifest extraction
    @param preference        Stored as each format's 'preference'
    @param quality           Stored as each format's 'quality'
    @param f4m_id            Prefix of the generated format ids
    @param transform_source  Passed to nested f4m extraction
    @param fatal             Passed to nested f4m/m3u8 extraction
    @param m3u8_id           Passed to nested m3u8 extraction
    @returns                 List of format dicts (possibly empty)
    """
    # A non-element manifest is tolerated only in non-fatal mode
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    if akamai_pv is not None and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    # Look for 1.0 media nodes first and fall back to the 2.0 namespace
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    # An audio mime type marks every format of this manifest as having no video
    vcodec = None
    # NOTE(review): the label 'base URL' is passed although the element looked up is mimeType
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        # The media node's index is used when there is no bitrate
        format_id = join_nonempty(f4m_id, tbr or i)
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            # NOTE(review): manifest_url is rebound here, so for later media nodes the
            # fallback base below is derived from the previous node's URL - confirm intended
            manifest_url = (
                media_url if media_url.startswith('http://') or media_url.startswith('https://')
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    manifest_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': manifest_url,
            'manifest_url': manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
2104
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """Build the placeholder format dict that points at an m3u8 URL itself.

    The entry is ranked 100 below the given preference; a missing (or
    zero) preference yields -100.
    """
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100

    meta_format = {}
    meta_format['format_id'] = join_nonempty(m3u8_id, 'meta')
    meta_format['url'] = m3u8_url
    meta_format['ext'] = ext
    meta_format['protocol'] = 'm3u8'
    meta_format['preference'] = meta_preference
    meta_format['quality'] = quality
    meta_format['resolution'] = 'multiple'
    meta_format['format_note'] = 'Quality selection URL'
    return meta_format
2116
def _report_ignoring_subs(self, name):
    """Warn (once) that subtitle tracks from the `name` manifest are dropped."""
    warning = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(warning, only_once=True)
2122
a0c3b2d5
F
2123 def _extract_m3u8_formats(self, *args, **kwargs):
2124 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2125 if subs:
b5ae35ee 2126 self._report_ignoring_subs('HLS')
a0c3b2d5
F
2127 return fmts
2128
2129 def _extract_m3u8_formats_and_subtitles(
177877c5 2130 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2131 preference=None, quality=None, m3u8_id=None, note=None,
2132 errnote=None, fatal=True, live=False, data=None, headers={},
2133 query={}):
2134
dbd82a1d 2135 res = self._download_webpage_handle(
81515ad9 2136 m3u8_url, video_id,
37a3bb66 2137 note='Downloading m3u8 information' if note is None else note,
2138 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 2139 fatal=fatal, data=data, headers=headers, query=query)
cb252080 2140
dbd82a1d 2141 if res is False:
a0c3b2d5 2142 return [], {}
cb252080 2143
dbd82a1d 2144 m3u8_doc, urlh = res
37113045 2145 m3u8_url = urlh.geturl()
9cdffeeb 2146
a0c3b2d5 2147 return self._parse_m3u8_formats_and_subtitles(
cb252080 2148 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 2149 preference=preference, quality=quality, m3u8_id=m3u8_id,
2150 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2151 headers=headers, query=query, video_id=video_id)
cb252080 2152
def _parse_m3u8_formats_and_subtitles(
        self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, live=False, note=None,
        errnote=None, fatal=True, data=None, headers={}, query={},
        video_id=None):
    """Parse the text of an HLS playlist into (formats, subtitles).

    A media playlist (one containing #EXT-X-TARGETDURATION) is returned
    as a single format per playlist index. A master playlist yields one
    format per EXT-X-MEDIA audio/video rendition that has a URI and one
    per variant stream, plus subtitle entries for EXT-X-MEDIA tags of
    type SUBTITLES.

    When the 'hls_split_discontinuity' param is set, every playlist is
    split at #EXT-X-DISCONTINUITY tags, which may require downloading
    the referenced media playlists (using video_id, fatal, data and
    headers).

    @param m3u8_doc     Playlist contents as a string
    @param m3u8_url     URL the playlist came from; relative URIs are
                        resolved against it
    @param live         If set, the bandwidth/stream name is left out of
                        the generated format_id
    @returns            Tuple (formats list, subtitles dict keyed by language)
    """
    formats, subtitles = [], {}

    # Coerced to a real boolean: the value is stored in the format dict
    # under 'has_drm', which must not carry a regex match object.
    has_drm = bool(re.search('|'.join([
        r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
        r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
    ]), m3u8_doc))

    def format_url(url):
        return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

    if self.get_param('hls_split_discontinuity', False):
        def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
            if not m3u8_doc:
                if not manifest_url:
                    return []
                m3u8_doc = self._download_webpage(
                    manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                    note=False, errnote='Failed to download m3u8 playlist information')
                if m3u8_doc is False:
                    return []
            # One index per section delimited by discontinuity tags
            return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

    else:
        def _extract_m3u8_playlist_indices(*args, **kwargs):
            return [None]

    # References:
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
    # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
    # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

    # We should try extracting formats only from master playlists [1, 4.3.4],
    # i.e. playlists that describe available qualities. On the other hand
    # media playlists [1, 4.3.3] should be returned as is since they contain
    # just the media without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
    # master playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
    # media playlist and MUST NOT appear in master playlist thus we can
    # clearly detect media playlist with this criterion.

    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        formats = [{
            'format_id': join_nonempty(m3u8_id, idx),
            'format_index': idx,
            'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
            'quality': quality,
            'has_drm': has_drm,
        } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

        return formats, subtitles

    groups = {}
    last_stream_inf = {}

    def extract_media(x_media_line):
        media = parse_m3u8_attributes(x_media_line)
        # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
        media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
        if not (media_type and group_id and name):
            return
        groups.setdefault(group_id, []).append(media)
        # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
        if media_type == 'SUBTITLES':
            # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
            # EXT-X-MEDIA tag if the media type is SUBTITLES.
            # However, lack of URI has been spotted in the wild.
            # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
            if not media.get('URI'):
                return
            url = format_url(media['URI'])
            sub_info = {
                'url': url,
                'ext': determine_ext(url),
            }
            if sub_info['ext'] == 'm3u8':
                # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                # files may contain is WebVTT:
                # <https://tools.ietf.org/html/rfc8216#section-3.1>
                sub_info['ext'] = 'vtt'
                sub_info['protocol'] = 'm3u8_native'
            lang = media.get('LANGUAGE') or 'und'
            subtitles.setdefault(lang, []).append(sub_info)
        if media_type not in ('VIDEO', 'AUDIO'):
            return
        media_url = media.get('URI')
        if media_url:
            manifest_url = format_url(media_url)
            formats.extend({
                'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                'format_note': name,
                'format_index': idx,
                'url': manifest_url,
                'manifest_url': m3u8_url,
                'language': media.get('LANGUAGE'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'vcodec': 'none' if media_type == 'AUDIO' else None,
            } for idx in _extract_m3u8_playlist_indices(manifest_url))

    def build_stream_name():
        # Despite specification does not mention NAME attribute for
        # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
        # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
        # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
        stream_name = last_stream_inf.get('NAME')
        if stream_name:
            return stream_name
        # If there is no NAME in EXT-X-STREAM-INF it will be obtained
        # from corresponding rendition group
        stream_group_id = last_stream_inf.get('VIDEO')
        if not stream_group_id:
            return
        stream_group = groups.get(stream_group_id)
        if not stream_group:
            return stream_group_id
        rendition = stream_group[0]
        return rendition.get('NAME') or stream_group_id

    # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
    # chance to detect video only formats when EXT-X-STREAM-INF tags
    # precede EXT-X-MEDIA tags in HLS manifest such as [3].
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)

    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_stream_inf = parse_m3u8_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            tbr = float_or_none(
                last_stream_inf.get('AVERAGE-BANDWIDTH')
                or last_stream_inf.get('BANDWIDTH'), scale=1000)
            manifest_url = format_url(line.strip())

            for idx in _extract_m3u8_playlist_indices(manifest_url):
                format_id = [m3u8_id, None, idx]
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    stream_name = build_stream_name()
                    format_id[1] = stream_name or '%d' % (tbr or len(formats))
                f = {
                    'format_id': join_nonempty(*format_id),
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                if not f.get('ext'):
                    f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

            # Stream attributes apply only to the URI line that follows them
            last_stream_inf = {}
    return formats, subtitles
704df56d 2373
3cf4b91d
C
2374 def _extract_m3u8_vod_duration(
2375 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2376
2377 m3u8_vod = self._download_webpage(
2378 m3u8_vod_url, video_id,
2379 note='Downloading m3u8 VOD manifest' if note is None else note,
2380 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2381 fatal=False, data=data, headers=headers, query=query)
2382
2383 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2384
2385 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2386 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2387 return None
2388
2389 return int(sum(
2390 float(line[len('#EXTINF:'):].split(',')[0])
2391 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2392
a107193e
S
2393 @staticmethod
2394 def _xpath_ns(path, namespace=None):
2395 if not namespace:
2396 return path
2397 out = []
2398 for c in path.split('/'):
2399 if not c or c == '.':
2400 out.append(c)
2401 else:
2402 out.append('{%s}%s' % (namespace, c))
2403 return '/'.join(out)
2404
da1c94ee 2405 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
a076c1f9
E
2406 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2407 if res is False:
995029a1 2408 assert not fatal
774a46c5 2409 return [], {}
e89a2aab 2410
a076c1f9
E
2411 smil, urlh = res
2412 smil_url = urlh.geturl()
2413
17712eeb 2414 namespace = self._parse_smil_namespace(smil)
a107193e 2415
da1c94ee 2416 fmts = self._parse_smil_formats(
a107193e 2417 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2418 subs = self._parse_smil_subtitles(
2419 smil, namespace=namespace)
2420
2421 return fmts, subs
2422
2423 def _extract_smil_formats(self, *args, **kwargs):
2424 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2425 if subs:
b5ae35ee 2426 self._report_ignoring_subs('SMIL')
da1c94ee 2427 return fmts
a107193e
S
2428
2429 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
a076c1f9
E
2430 res = self._download_smil(smil_url, video_id, fatal=fatal)
2431 if res is False:
a107193e 2432 return {}
a076c1f9
E
2433
2434 smil, urlh = res
2435 smil_url = urlh.geturl()
2436
a107193e
S
2437 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2438
09f572fb 2439 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a076c1f9 2440 return self._download_xml_handle(
a107193e 2441 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2442 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2443
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict (metadata, thumbnails, formats, subtitles) from SMIL.

    The passed video_id is only used while extracting formats; the id of
    the returned dict is the extension-less basename of smil_url.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id, _ = os.path.splitext(url_basename(smil_url))

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        # The first non-empty value found for each field wins
        if name == 'title':
            if not title:
                title = content
        elif name in ('description', 'abstract'):
            if not description:
                description = content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2483
17712eeb
S
2484 def _parse_smil_namespace(self, smil):
2485 return self._search_regex(
2486 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2487
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Extract formats from the <video>, <audio> and <imagestream> elements of a SMIL document.

    @param smil                 Parsed SMIL document (XML element)
    @param smil_url             URL of the document; default base for relative sources
    @param video_id             Video id passed on to nested manifest extraction
    @param namespace            XML namespace of the document, if any
    @param f4m_params           Query parameters appended to f4m manifest URLs
                                (hdcore/plugin defaults are used when empty)
    @param transform_rtmp_url   Optional callable (streamer, src) -> (streamer, src)
                                applied to RTMP entries
    @returns                    List of format dicts
    """
    base = smil_url
    # The first <meta> carrying base/httpBase overrides the document URL
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    # Sources already handled, so that duplicates are skipped
    srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A lone HLS format takes its metadata from the SMIL element
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # Validate the resolved URL that is actually stored in the format,
        # not the raw (possibly relative or unstripped) src attribute
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats
2601
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitles from the <textstream> elements of a SMIL document.

    Returns a dict mapping language to a list of {'url', 'ext'} entries;
    `subtitles_lang` is used when an element declares no language.
    Elements without a source, or repeating one already seen, are skipped.
    """
    seen_sources = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_sources:
            continue
        seen_sources.add(src)

        ext = (
            textstream.get('ext')
            or mimetype2ext(textstream.get('type'))
            or determine_ext(src))
        lang = (
            textstream.get('systemLanguage')
            or textstream.get('systemLanguageName')
            or textstream.get('lang')
            or subtitles_lang)
        track = {
            'url': src,
            'ext': ext,
        }
        subtitles.setdefault(lang, []).append(track)
    return subtitles
63757032 2617
47a5cb77 2618 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
a076c1f9 2619 res = self._download_xml_handle(
47a5cb77 2620 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5 2621 'Unable to download xspf manifest', fatal=fatal)
a076c1f9 2622 if res is False:
942acef5 2623 return []
a076c1f9
E
2624
2625 xspf, urlh = res
2626 xspf_url = urlh.geturl()
2627
47a5cb77
S
2628 return self._parse_xspf(
2629 xspf, playlist_id, xspf_url=xspf_url,
2630 xspf_base_url=base_url(xspf_url))
8d6765cf 2631
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn the tracks of a parsed XSPF document into a list of entry dicts.

    Every entry uses playlist_id as its id; each <location> of a track
    becomes one format, with its URL joined onto xspf_base_url.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def qualified(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(qualified('./xspf:trackList/xspf:track')):
        title = xpath_text(track, qualified('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, qualified('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, qualified('./xspf:image'), 'thumbnail')
        duration_text = xpath_text(track, qualified('./xspf:duration'), 'duration')
        duration = float_or_none(duration_text, 1000)

        formats = []
        for location in track.findall(qualified('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if not format_url:
                continue
            track_format = {
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(qualified('s1:label')),
                'width': int_or_none(location.get(qualified('s1:width'))),
                'height': int_or_none(location.get(qualified('s1:height'))),
            }
            formats.append(track_format)
        self._sort_formats(formats)

        entry = {
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
        entries.append(entry)
    return entries
2672
171e59ed
F
2673 def _extract_mpd_formats(self, *args, **kwargs):
2674 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2675 if subs:
b5ae35ee 2676 self._report_ignoring_subs('DASH')
171e59ed
F
2677 return fmts
2678
2679 def _extract_mpd_formats_and_subtitles(
2680 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2681 fatal=True, data=None, headers={}, query={}):
47a5cb77 2682 res = self._download_xml_handle(
1bac3455 2683 mpd_url, video_id,
37a3bb66 2684 note='Downloading MPD manifest' if note is None else note,
2685 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2686 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2687 if res is False:
171e59ed 2688 return [], {}
47a5cb77 2689 mpd_doc, urlh = res
c25720ef 2690 if mpd_doc is None:
171e59ed 2691 return [], {}
779da8e3
E
2692
2693 # We could have been redirected to a new url when we retrieved our mpd file.
2694 mpd_url = urlh.geturl()
2695 mpd_base_url = base_url(mpd_url)
1bac3455 2696
171e59ed 2697 return self._parse_mpd_formats_and_subtitles(
545cc85d 2698 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2699
171e59ed
F
2700 def _parse_mpd_formats(self, *args, **kwargs):
2701 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2702 if subs:
b5ae35ee 2703 self._report_ignoring_subs('DASH')
171e59ed
F
2704 return fmts
2705
2706 def _parse_mpd_formats_and_subtitles(
2707 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2708 """
2709 Parse formats from MPD manifest.
2710 References:
2711 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2712 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2713 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2714 """
a06916d9 2715 if not self.get_param('dynamic_mpd', True):
78895bd3 2716 if mpd_doc.get('type') == 'dynamic':
171e59ed 2717 return [], {}
2d2fa82d 2718
91cb6b50 2719 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2720
2721 def _add_ns(path):
2722 return self._xpath_ns(path, namespace)
2723
675d0016 2724 def is_drm_protected(element):
2725 return element.find(_add_ns('ContentProtection')) is not None
2726
1bac3455 2727 def extract_multisegment_info(element, ms_parent_info):
2728 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2729
2730 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2731 # common attributes and elements. We will only extract relevant
2732 # for us.
2733 def extract_common(source):
2734 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2735 if segment_timeline is not None:
2736 s_e = segment_timeline.findall(_add_ns('S'))
2737 if s_e:
2738 ms_info['total_number'] = 0
2739 ms_info['s'] = []
2740 for s in s_e:
2741 r = int(s.get('r', 0))
2742 ms_info['total_number'] += 1 + r
2743 ms_info['s'].append({
2744 't': int(s.get('t', 0)),
2745 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2746 'd': int(s.attrib['d']),
2747 'r': r,
2748 })
2749 start_number = source.get('startNumber')
2750 if start_number:
2751 ms_info['start_number'] = int(start_number)
2752 timescale = source.get('timescale')
2753 if timescale:
2754 ms_info['timescale'] = int(timescale)
2755 segment_duration = source.get('duration')
2756 if segment_duration:
48504785 2757 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2758
2759 def extract_Initialization(source):
2760 initialization = source.find(_add_ns('Initialization'))
2761 if initialization is not None:
2762 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2763
f14be228 2764 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2765 if segment_list is not None:
b4c1d6e8
S
2766 extract_common(segment_list)
2767 extract_Initialization(segment_list)
f14be228 2768 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2769 if segment_urls_e:
2770 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2771 else:
f14be228 2772 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2773 if segment_template is not None:
b4c1d6e8 2774 extract_common(segment_template)
e228616c
S
2775 media = segment_template.get('media')
2776 if media:
2777 ms_info['media'] = media
1bac3455 2778 initialization = segment_template.get('initialization')
2779 if initialization:
e228616c 2780 ms_info['initialization'] = initialization
1bac3455 2781 else:
b4c1d6e8 2782 extract_Initialization(segment_template)
1bac3455 2783 return ms_info
b323e170 2784
1bac3455 2785 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
6251555f 2786 formats, subtitles = [], {}
234416e4 2787 stream_numbers = collections.defaultdict(int)
f14be228 2788 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2789 period_duration = parse_duration(period.get('duration')) or mpd_duration
2790 period_ms_info = extract_multisegment_info(period, {
2791 'start_number': 1,
2792 'timescale': 1,
2793 })
f14be228 2794 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2795 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2796 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2797 representation_attrib = adaptation_set.attrib.copy()
2798 representation_attrib.update(representation.attrib)
f0948348 2799 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2800 mime_type = representation_attrib['mimeType']
171e59ed
F
2801 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2802
21633673 2803 codec_str = representation_attrib.get('codecs', '')
2804 # Some kind of binary subtitle found in some youtube livestreams
2805 if mime_type == 'application/x-rawcc':
2806 codecs = {'scodec': codec_str}
2807 else:
2808 codecs = parse_codecs(codec_str)
be2fc5b2 2809 if content_type not in ('video', 'audio', 'text'):
2810 if mime_type == 'image/jpeg':
a8731fcc 2811 content_type = mime_type
21633673 2812 elif codecs.get('vcodec', 'none') != 'none':
4afa3ec4 2813 content_type = 'video'
21633673 2814 elif codecs.get('acodec', 'none') != 'none':
4afa3ec4 2815 content_type = 'audio'
3fe75fdc 2816 elif codecs.get('scodec', 'none') != 'none':
be2fc5b2 2817 content_type = 'text'
6993f78d 2818 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2819 content_type = 'text'
cdb19aa4 2820 else:
be2fc5b2 2821 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2822 continue
2823
2824 base_url = ''
2825 for element in (representation, adaptation_set, period, mpd_doc):
2826 base_url_e = element.find(_add_ns('BaseURL'))
47046464 2827 if try_call(lambda: base_url_e.text) is not None:
be2fc5b2 2828 base_url = base_url_e.text + base_url
2829 if re.match(r'^https?://', base_url):
2830 break
f9cc0161 2831 if mpd_base_url and base_url.startswith('/'):
14f25df2 2832 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
f9cc0161
D
2833 elif mpd_base_url and not re.match(r'^https?://', base_url):
2834 if not mpd_base_url.endswith('/'):
be2fc5b2 2835 mpd_base_url += '/'
2836 base_url = mpd_base_url + base_url
2837 representation_id = representation_attrib.get('id')
2838 lang = representation_attrib.get('lang')
2839 url_el = representation.find(_add_ns('BaseURL'))
2840 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2841 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2842 if representation_id is not None:
2843 format_id = representation_id
2844 else:
2845 format_id = content_type
2846 if mpd_id:
2847 format_id = mpd_id + '-' + format_id
2848 if content_type in ('video', 'audio'):
2849 f = {
2850 'format_id': format_id,
2851 'manifest_url': mpd_url,
2852 'ext': mimetype2ext(mime_type),
2853 'width': int_or_none(representation_attrib.get('width')),
2854 'height': int_or_none(representation_attrib.get('height')),
2855 'tbr': float_or_none(bandwidth, 1000),
2856 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2857 'fps': int_or_none(representation_attrib.get('frameRate')),
2858 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2859 'format_note': 'DASH %s' % content_type,
2860 'filesize': filesize,
2861 'container': mimetype2ext(mime_type) + '_dash',
4afa3ec4 2862 **codecs
be2fc5b2 2863 }
be2fc5b2 2864 elif content_type == 'text':
2865 f = {
2866 'ext': mimetype2ext(mime_type),
2867 'manifest_url': mpd_url,
2868 'filesize': filesize,
2869 }
2870 elif content_type == 'image/jpeg':
2871 # See test case in VikiIE
2872 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2873 f = {
2874 'format_id': format_id,
2875 'ext': 'mhtml',
2876 'manifest_url': mpd_url,
2877 'format_note': 'DASH storyboards (jpeg)',
2878 'acodec': 'none',
2879 'vcodec': 'none',
2880 }
88acdbc2 2881 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2882 f['has_drm'] = True
be2fc5b2 2883 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2884
def prepare_template(template_name, identifiers):
    """Translate a DASH segment template into a %-style format string.

    Reads the raw template from ``representation_ms_info[template_name]``
    (a variable of the enclosing scope) and returns a string suitable for
    ``template % {'Number': ..., 'Bandwidth': ..., 'Time': ...}``.

    ``identifiers`` is an iterable of the ``$...$`` identifier names that
    may be substituted later (e.g. ``('Number', 'Bandwidth', 'Time')``).
    """
    tmpl = representation_ms_info[template_name]
    # First of, % characters outside $...$ templates
    # must be escaped by doubling for proper processing
    # by % operator string formatting used further (see
    # https://github.com/ytdl-org/youtube-dl/issues/16867).
    pieces = []
    in_template = False
    for c in tmpl:
        pieces.append(c)
        if c == '$':
            in_template = not in_template
        elif c == '%' and not in_template:
            pieces.append(c)
    t = ''.join(pieces)
    # Next, $...$ templates are translated to their
    # %(...) counterparts to be used with % operator
    if representation_id is not None:
        t = t.replace('$RepresentationID$', representation_id)
    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
    # '$$' is the escape sequence for a literal '$'. str.replace returns a
    # new string, so the result has to be used (it was discarded before).
    return t.replace('$$', '$')
2907
2908 # @initialization is a regular template like @media one
2909 # so it should be handled just the same way (see
2910 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2911 if 'initialization' in representation_ms_info:
2912 initialization_template = prepare_template(
2913 'initialization',
2914 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2915 # $Time$ shall not be included for @initialization thus
2916 # only $Bandwidth$ remains
2917 ('Bandwidth', ))
2918 representation_ms_info['initialization_url'] = initialization_template % {
2919 'Bandwidth': bandwidth,
2920 }
2921
2922 def location_key(location):
2923 return 'url' if re.match(r'^https?://', location) else 'path'
2924
2925 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2926
2927 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2928 media_location_key = location_key(media_template)
2929
2930 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2931 # can't be used at the same time
2932 if '%(Number' in media_template and 's' not in representation_ms_info:
2933 segment_duration = None
2934 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2935 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
ffa89477 2936 representation_ms_info['total_number'] = int(math.ceil(
2937 float_or_none(period_duration, segment_duration, default=0)))
be2fc5b2 2938 representation_ms_info['fragments'] = [{
2939 media_location_key: media_template % {
2940 'Number': segment_number,
2941 'Bandwidth': bandwidth,
2942 },
2943 'duration': segment_duration,
2944 } for segment_number in range(
2945 representation_ms_info['start_number'],
2946 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2947 else:
2948 # $Number*$ or $Time$ in media template with S list available
2949 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2950 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2951 representation_ms_info['fragments'] = []
2952 segment_time = 0
2953 segment_d = None
2954 segment_number = representation_ms_info['start_number']
2955
2956 def add_segment_url():
2957 segment_url = media_template % {
2958 'Time': segment_time,
2959 'Bandwidth': bandwidth,
2960 'Number': segment_number,
2961 }
2962 representation_ms_info['fragments'].append({
2963 media_location_key: segment_url,
2964 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2965 })
2966
2967 for num, s in enumerate(representation_ms_info['s']):
2968 segment_time = s.get('t') or segment_time
2969 segment_d = s['d']
2970 add_segment_url()
2971 segment_number += 1
2972 for r in range(s.get('r', 0)):
2973 segment_time += segment_d
f0948348 2974 add_segment_url()
b4c1d6e8 2975 segment_number += 1
be2fc5b2 2976 segment_time += segment_d
2977 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2978 # No media template
2979 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2980 # or any YouTube dashsegments video
2981 fragments = []
2982 segment_index = 0
2983 timescale = representation_ms_info['timescale']
2984 for s in representation_ms_info['s']:
2985 duration = float_or_none(s['d'], timescale)
2986 for r in range(s.get('r', 0) + 1):
2987 segment_uri = representation_ms_info['segment_urls'][segment_index]
2988 fragments.append({
2989 location_key(segment_uri): segment_uri,
2990 'duration': duration,
2991 })
2992 segment_index += 1
2993 representation_ms_info['fragments'] = fragments
2994 elif 'segment_urls' in representation_ms_info:
2995 # Segment URLs with no SegmentTimeline
2996 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2997 # https://github.com/ytdl-org/youtube-dl/pull/14844
2998 fragments = []
2999 segment_duration = float_or_none(
3000 representation_ms_info['segment_duration'],
3001 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
3002 for segment_url in representation_ms_info['segment_urls']:
3003 fragment = {
3004 location_key(segment_url): segment_url,
3005 }
3006 if segment_duration:
3007 fragment['duration'] = segment_duration
3008 fragments.append(fragment)
3009 representation_ms_info['fragments'] = fragments
3010 # If there is a fragments key available then we correctly recognized fragmented media.
3011 # Otherwise we will assume unfragmented media with direct access. Technically, such
3012 # assumption is not necessarily correct since we may simply have no support for
3013 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3014 if 'fragments' in representation_ms_info:
3015 f.update({
3016 # NB: mpd_url may be empty when MPD manifest is parsed from a string
3017 'url': mpd_url or base_url,
3018 'fragment_base_url': base_url,
3019 'fragments': [],
3020 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
3021 })
3022 if 'initialization_url' in representation_ms_info:
3023 initialization_url = representation_ms_info['initialization_url']
3024 if not f.get('url'):
3025 f['url'] = initialization_url
3026 f['fragments'].append({location_key(initialization_url): initialization_url})
3027 f['fragments'].extend(representation_ms_info['fragments'])
ffa89477 3028 if not period_duration:
3029 period_duration = try_get(
3030 representation_ms_info,
3031 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
17b598d3 3032 else:
be2fc5b2 3033 # Assuming direct URL to unfragmented media.
3034 f['url'] = base_url
234416e4 3035 if content_type in ('video', 'audio', 'image/jpeg'):
3036 f['manifest_stream_number'] = stream_numbers[f['url']]
3037 stream_numbers[f['url']] += 1
be2fc5b2 3038 formats.append(f)
3039 elif content_type == 'text':
3040 subtitles.setdefault(lang or 'und', []).append(f)
3041
171e59ed 3042 return formats, subtitles
17b598d3 3043
fd76a142
F
3044 def _extract_ism_formats(self, *args, **kwargs):
3045 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
3046 if subs:
b5ae35ee 3047 self._report_ignoring_subs('ISM')
fd76a142
F
3048 return fmts
3049
3050 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 3051 res = self._download_xml_handle(
b2758123 3052 ism_url, video_id,
37a3bb66 3053 note='Downloading ISM manifest' if note is None else note,
3054 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 3055 fatal=fatal, data=data, headers=headers, query=query)
b2758123 3056 if res is False:
fd76a142 3057 return [], {}
47a5cb77 3058 ism_doc, urlh = res
13b08034 3059 if ism_doc is None:
fd76a142 3060 return [], {}
b2758123 3061
fd76a142 3062 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 3063
fd76a142 3064 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
3065 """
3066 Parse formats from ISM manifest.
3067 References:
3068 1. [MS-SSTR]: Smooth Streaming Protocol,
3069 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3070 """
06869367 3071 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 3072 return [], {}
b2758123 3073
b2758123
RA
3074 duration = int(ism_doc.attrib['Duration'])
3075 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
3076
3077 formats = []
fd76a142 3078 subtitles = {}
b2758123
RA
3079 for stream in ism_doc.findall('StreamIndex'):
3080 stream_type = stream.get('Type')
fd76a142 3081 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
3082 continue
3083 url_pattern = stream.attrib['Url']
3084 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3085 stream_name = stream.get('Name')
fd76a142 3086 stream_language = stream.get('Language', 'und')
b2758123 3087 for track in stream.findall('QualityLevel'):
e2efe599 3088 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
b2758123 3089 # TODO: add support for WVC1 and WMAP
66a1b864 3090 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
b2758123
RA
3091 self.report_warning('%s is not a supported codec' % fourcc)
3092 continue
3093 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
3094 # [1] does not mention Width and Height attributes. However,
3095 # they're often present while MaxWidth and MaxHeight are
3096 # missing, so should be used as fallbacks
3097 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3098 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
3099 sampling_rate = int_or_none(track.get('SamplingRate'))
3100
3101 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
14f25df2 3102 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
b2758123
RA
3103
3104 fragments = []
3105 fragment_ctx = {
3106 'time': 0,
3107 }
3108 stream_fragments = stream.findall('c')
3109 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3110 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3111 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3112 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3113 if not fragment_ctx['duration']:
3114 try:
3115 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
3116 except IndexError:
3117 next_fragment_time = duration
1616f9b4 3118 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
3119 for _ in range(fragment_repeat):
3120 fragments.append({
14f25df2 3121 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
3122 'duration': fragment_ctx['duration'] / stream_timescale,
3123 })
3124 fragment_ctx['time'] += fragment_ctx['duration']
3125
fd76a142
F
3126 if stream_type == 'text':
3127 subtitles.setdefault(stream_language, []).append({
3128 'ext': 'ismt',
3129 'protocol': 'ism',
3130 'url': ism_url,
3131 'manifest_url': ism_url,
3132 'fragments': fragments,
3133 '_download_params': {
3134 'stream_type': stream_type,
3135 'duration': duration,
3136 'timescale': stream_timescale,
3137 'fourcc': fourcc,
3138 'language': stream_language,
3139 'codec_private_data': track.get('CodecPrivateData'),
3140 }
3141 })
3142 elif stream_type in ('video', 'audio'):
3143 formats.append({
34921b43 3144 'format_id': join_nonempty(ism_id, stream_name, tbr),
fd76a142
F
3145 'url': ism_url,
3146 'manifest_url': ism_url,
3147 'ext': 'ismv' if stream_type == 'video' else 'isma',
3148 'width': width,
3149 'height': height,
3150 'tbr': tbr,
3151 'asr': sampling_rate,
3152 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3153 'acodec': 'none' if stream_type == 'video' else fourcc,
3154 'protocol': 'ism',
3155 'fragments': fragments,
88acdbc2 3156 'has_drm': ism_doc.find('Protection') is not None,
fd76a142
F
3157 '_download_params': {
3158 'stream_type': stream_type,
3159 'duration': duration,
3160 'timescale': stream_timescale,
3161 'width': width or 0,
3162 'height': height or 0,
3163 'fourcc': fourcc,
3164 'language': stream_language,
3165 'codec_private_data': track.get('CodecPrivateData'),
3166 'sampling_rate': sampling_rate,
3167 'channels': int_or_none(track.get('Channels', 2)),
3168 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3169 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3170 },
3171 })
3172 return formats, subtitles
b2758123 3173
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect media entries from <video>/<audio>-like tags in a webpage.

    Every matched media tag (including the amp-* and dl8-* variants) yields
    a dict with 'formats', 'subtitles' and 'thumbnail'; tags that provide
    neither formats nor subtitles are dropped. Relative URLs are resolved
    against base_url, which is also set as the Referer header of each format.
    Returns the list of such dicts.
    """
    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Split a "type" attribute into extension and codec information
        ctr = content_type and re.search(
            r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not ctr:
            return {}
        mimetype, codecs = ctr.groups()
        info = parse_codecs(codecs)
        info['ext'] = mimetype2ext(mimetype)
        return info

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats); manifests are expanded,
        # anything else is treated as a direct media URL
        full_url = absolute_url(src)
        ext = (type_info or {}).get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            return False, self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        if ext == 'mpd':
            return False, self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        return True, [{
            'url': full_url,
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            'ext': ext,
        }]

    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    tag_name_re = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags carry no content, so pad them with an empty one
    media_tags = [
        (*self_closing, '')
        for self_closing in re.findall(r'(?s)(<(%s)[^>]*/>)' % tag_name_re, webpage)]
    # We only allow video|audio followed by a whitespace or '>'.
    # Allowing more characters may end up in significant slow down (see
    # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
    # http://www.porntrex.com/maps/videositemap.xml).
    media_tags += re.findall(
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % tag_name_re, webpage)

    entries = []
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(media_attributes.get('src'))
        if src:
            type_info = parse_content_type(media_attributes.get('type'))
            media_info['formats'].extend(_media_formats(src, media_type, type_info)[1])
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                if not src:
                    continue
                type_info = parse_content_type(s_attr.get('type'))
                is_plain_url, source_formats = _media_formats(src, media_type, type_info)
                if not is_plain_url:
                    media_info['formats'].extend(source_formats)
                    continue

                # width, height, res, label and title attributes are
                # all not standard but seen several times in the wild
                labels = [
                    s_attr.get(key) for key in ('label', 'title')
                    if str_or_none(s_attr.get(key))]
                width = int_or_none(s_attr.get('width'))
                height = int_or_none(s_attr.get('height')) or int_or_none(s_attr.get('res'))
                if not (width and height):
                    for label in labels:
                        resolution = parse_resolution(label)
                        if resolution:
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                # First label that yields a bitrate wins
                tbr = next((rate for rate in map(parse_bitrate, labels) if rate), None)
                media_info['formats'].append({
                    **type_info,
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'format_id': s_attr.get('label') or s_attr.get('title'),
                    **source_formats[0],
                })

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if kind and kind not in ('subtitles', 'captions'):
                    continue
                src = strip_or_none(track_attributes.get('src'))
                if not src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                media_info['subtitles'].setdefault(lang, []).append({
                    'url': absolute_url(src),
                })

        for fmt in media_info['formats']:
            fmt.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
3299
f6a1d69a
F
3300 def _extract_akamai_formats(self, *args, **kwargs):
3301 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3302 if subs:
b5ae35ee 3303 self._report_ignoring_subs('akamai')
f6a1d69a
F
3304 return fmts
3305
3306 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3307 signed = 'hdnea=' in manifest_url
3308 if not signed:
3309 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3310 manifest_url = re.sub(
3311 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3312 '', manifest_url).strip('?')
3313
c7c43a93 3314 formats = []
f6a1d69a 3315 subtitles = {}
70c5802b 3316
e71a4509 3317 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3318 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3319 hds_host = hosts.get('hds')
3320 if hds_host:
3321 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3322 if 'hdcore=' not in f4m_url:
3323 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3324 f4m_formats = self._extract_f4m_formats(
3325 f4m_url, video_id, f4m_id='hds', fatal=False)
3326 for entry in f4m_formats:
3327 entry.update({'extra_param_to_segment_url': hdcore_sign})
3328 formats.extend(f4m_formats)
70c5802b 3329
c4251b9a
RA
3330 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3331 hls_host = hosts.get('hls')
3332 if hls_host:
3333 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3334 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3335 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3336 m3u8_id='hls', fatal=False)
3337 formats.extend(m3u8_formats)
f6a1d69a 3338 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3339
3340 http_host = hosts.get('http')
29f7c58a 3341 if http_host and m3u8_formats and not signed:
3342 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3343 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3344 qualities_length = len(qualities)
29f7c58a 3345 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3346 i = 0
29f7c58a 3347 for f in m3u8_formats:
3348 if f['vcodec'] != 'none':
70c5802b 3349 for protocol in ('http', 'https'):
3350 http_f = f.copy()
3351 del http_f['manifest_url']
3352 http_url = re.sub(
86e5f3ed 3353 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
70c5802b 3354 http_f.update({
3355 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3356 'url': http_url,
3357 'protocol': protocol,
3358 })
29f7c58a 3359 formats.append(http_f)
70c5802b 3360 i += 1
70c5802b 3361
f6a1d69a 3362 return formats, subtitles
c7c43a93 3363
6ad02195 3364 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
14f25df2 3365 query = urllib.parse.urlparse(url).query
6ad02195 3366 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3367 mobj = re.search(
3368 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3369 url_base = mobj.group('url')
3370 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3371 formats = []
044eeb14
S
3372
3373 def manifest_url(manifest):
86e5f3ed 3374 m_url = f'{http_base_url}/{manifest}'
044eeb14
S
3375 if query:
3376 m_url += '?%s' % query
3377 return m_url
3378
6ad02195
RA
3379 if 'm3u8' not in skip_protocols:
3380 formats.extend(self._extract_m3u8_formats(
044eeb14 3381 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3382 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3383 if 'f4m' not in skip_protocols:
3384 formats.extend(self._extract_f4m_formats(
044eeb14 3385 manifest_url('manifest.f4m'),
6ad02195 3386 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3387 if 'dash' not in skip_protocols:
3388 formats.extend(self._extract_mpd_formats(
044eeb14 3389 manifest_url('manifest.mpd'),
0384932e 3390 video_id, mpd_id='dash', fatal=False))
6ad02195 3391 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3392 if 'smil' not in skip_protocols:
3393 rtmp_formats = self._extract_smil_formats(
044eeb14 3394 manifest_url('jwplayer.smil'),
6ad02195
RA
3395 video_id, fatal=False)
3396 for rtmp_format in rtmp_formats:
3397 rtsp_format = rtmp_format.copy()
3398 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3399 del rtsp_format['play_path']
3400 del rtsp_format['ext']
3401 rtsp_format.update({
3402 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3403 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3404 'protocol': 'rtsp',
3405 })
3406 formats.extend([rtmp_format, rtsp_format])
3407 else:
3408 for protocol in ('rtmp', 'rtsp'):
3409 if protocol not in skip_protocols:
3410 formats.append({
86e5f3ed 3411 'url': f'{protocol}:{url_base}',
6ad02195
RA
3412 'format_id': protocol,
3413 'protocol': protocol,
3414 })
3415 return formats
3416
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate the options passed to jwplayer(...).setup(...) in a webpage.

    Returns the parsed options if they form a dict. Returns None when no
    setup call is found, when parsing raises ExtractorError or when the
    parsed value is not a dict.
    """
    setup_call = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if not setup_call:
        return None
    try:
        options = self._parse_json(
            setup_call.group('options'), video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        return None
    return options if isinstance(options, dict) else None
a4a554a7
YCH
3431
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in webpage and parse it.

    The data is searched with _find_jwplayer_data (using js_to_json as the
    source transform); the extra arguments are forwarded to
    _parse_jwplayer_data, whose result is returned.
    """
    found = self._find_jwplayer_data(webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(found, video_id, *args, **kwargs)
3437
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer setup data into an info dict or a playlist result.

    Accepts the regular {'playlist': [...]} layout as well as the flattened
    legacy layouts. With require_title, a playlist item without 'title'
    raises KeyError. Returns the single entry when exactly one item was
    parsed, otherwise the result of playlist_result(entries).
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    if not isinstance(jwplayer_data['playlist'], list):
        jwplayer_data['playlist'] = [jwplayer_data['playlist']]

    entries = []
    for video_data in jwplayer_data['playlist']:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        for track in (tracks if tracks and isinstance(tracks, list) else ()):
            if not isinstance(track, dict):
                continue
            kind = track.get('kind')
            if not kind or not isinstance(kind, str) or kind.lower() not in ('captions', 'subtitles'):
                continue
            track_url = urljoin(base_url, track.get('file'))
            if not track_url:
                continue
            subtitles.setdefault(track.get('label') or 'en', []).append({
                'url': self._proto_relative_url(track_url)
            })

        title = video_data['title'] if require_title else video_data.get('title')
        entry = {
            'id': this_video_id,
            'title': unescapeHTML(title),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube:
            entry['_type'] = 'url_transparent'
            entry['url'] = formats[0]['url']
        else:
            self._sort_formats(formats)
            entry['formats'] = formats
        entries.append(entry)

    return entries[0] if len(entries) == 1 else self.playlist_result(entries)
3505
ed0cf9b3
S
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Build a list of formats from JWPlayer "sources" entries.

    Non-dict sources, sources without a usable 'file' URL and repeated
    URLs are skipped. HLS, DASH and SMIL sources are expanded through the
    respective manifest extractors; everything else becomes one format.
    """
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in seen_urls:
            continue
        seen_urls.add(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if source_type == 'dash' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        height = int_or_none(source.get('height'))
        if height is None:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            height = int_or_none(self._search_regex(
                r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
                'height', default=None))
        a_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate')),
            'ext': ext,
        }
        if source_url.startswith('rtmp'):
            a_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            rtmp_url_parts = re.split(
                r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
            if len(rtmp_url_parts) == 3:
                rtmp_url, prefix, play_path = rtmp_url_parts
                a_format['url'] = rtmp_url
                a_format['play_path'] = prefix + play_path
            if rtmp_params:
                a_format.update(rtmp_params)
        formats.append(a_format)
    return formats
3569
f4b1c7ad 3570 def _live_title(self, name):
39ca3b5c 3571 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3572 return name
f4b1c7ad 3573
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(v, **kwargs) and report failures.

    When the conversion yields None, raises ExtractorError if fatal is set,
    otherwise reports a warning and returns None. name is only used in the
    message.
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return parsed
3583
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(v, **kwargs) and report failures.

    When the conversion yields None, raises ExtractorError if fatal is set,
    otherwise reports a warning and returns None. name is only used in the
    message.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(msg)
    self.report_warning(msg)
    return parsed
3593
40e41780
TF
3594 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3595 path='/', secure=False, discard=False, rest={}, **kwargs):
ac668111 3596 cookie = http.cookiejar.Cookie(
4ed2d7b7 3597 0, name, value, port, port is not None, domain, True,
40e41780
TF
3598 domain.startswith('.'), path, True, secure, expire_time,
3599 discard, None, None, rest)
9809740b 3600 self.cookiejar.set_cookie(cookie)
42939b61 3601
799207e8 3602 def _get_cookies(self, url):
ac668111 3603 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3604 return http.cookies.SimpleCookie(self._downloader._calc_cookies(url))
799207e8 3605
def _apply_first_set_cookie_header(self, url_handle, cookie):
    """
    Apply first Set-Cookie header instead of the last. Experimental.

    Some sites (e.g. [1-3]) may serve two cookies under the same name
    in Set-Cookie header and expect the first (old) one to be set rather
    than second (new). However, as of RFC6265 the newer one cookie
    should be set into cookie store what actually happens.
    We will workaround this issue by resetting the cookie to
    the first one manually.
    1. https://new.vk.com/
    2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
    3. https://learning.oreilly.com/

    @param url_handle  Response object; its headers.items() is scanned in order
    @param cookie      Name of the cookie to reset (matched literally)
    """
    # Escape the name so regex metacharacters in it (e.g. '.') cannot match
    # a differently named cookie; the pattern is the same for every header
    cookie_re = r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % re.escape(cookie)
    for header, cookies in url_handle.headers.items():
        if header.lower() != 'set-cookie':
            continue
        # Re-interpret the ISO-8859-1 decoded header value as UTF-8
        cookies = cookies.encode('iso-8859-1').decode('utf-8')
        cookie_value = re.search(cookie_re, cookies)
        if cookie_value:
            value, domain = cookie_value.groups()
            self._set_cookie(domain, cookie, value)
            # Only the first matching header is applied
            break
3630
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """
    Yield the test case dicts declared on the class via _TEST or _TESTS.

    Each yielded dict gets its 'name' key set to cls.ie_key(). Entries with a
    truthy 'only_matching' are skipped unless include_onlymatching is set.
    """
    single = getattr(cls, '_TEST', None)
    if single:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        testcases = [single]
    else:
        testcases = getattr(cls, '_TESTS', [])

    for testcase in testcases:
        only_matching = testcase.get('only_matching', False)
        if only_matching and not include_onlymatching:
            continue
        testcase['name'] = cls.ie_key()
        yield testcase
3644
@classproperty
def age_limit(cls):
    """Get age limit from the testcases"""
    testcases = tuple(cls.get_testcases(include_onlymatching=False))
    # Look at each test's own info_dict and at the first entry of its 'playlist'
    declared_limits = traverse_obj(
        testcases, (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
    # Fall back to 0 when no test case declares an age limit
    return max(declared_limits or [0])
3651
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
05900629 3656
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """
    Description of the extractor

    @param markdown         Use markdown/HTML markup in the result
    @param search_examples  Queries to pick a random search example from;
                            only used when the class has a SEARCH_KEY
    @returns                "<name>:<details>", or just the name when there
                            are no details
    """
    details = []

    netrc_machine = cls._NETRC_MACHINE
    if netrc_machine:
        details.append(
            f' [<abbr title="netrc machine"><em>{netrc_machine}</em></abbr>]' if markdown
            else f' [{netrc_machine}]')

    if cls.IE_DESC is False:
        details.append(' [HIDDEN]')
    elif cls.IE_DESC:
        details.append(f' {cls.IE_DESC}')

    if cls.SEARCH_KEY:
        details.append(f'; "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            # Pick the count prefix first, then the example query
            count = random.choice(('', '5', '10', 'all'))
            example = random.choice(search_examples)
            details.append(f' (Example: "{cls.SEARCH_KEY}{count}:{example}")')

    if not cls.working():
        details.append(' (**Currently broken**)' if markdown else ' (Currently broken)')

    desc = ''.join(details)
    name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
    return f'{name}:{desc}' if desc else name
3680
def extract_subtitles(self, *args, **kwargs):
    """
    Return the result of _get_subtitles(*args, **kwargs) when the
    'writesubtitles' or 'listsubtitles' param is set; otherwise an empty dict.
    """
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
3686
def _get_subtitles(self, *args, **kwargs):
    """Subclass hook called by extract_subtitles; the base version always raises."""
    raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3689
def extract_comments(self, *args, **kwargs):
    """
    Return a callable that drains _get_comments(*args, **kwargs), or None
    when the 'getcomments' param is not set.

    The returned callable collects the yielded comments into a dict with
    'comments' and 'comment_count'. 'comment_count' is None unless the
    iterator was exhausted normally. A KeyboardInterrupt stops collection;
    other exceptions are re-raised unless the 'ignoreerrors' param is True,
    in which case they are reported and the partial result is returned.
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        comments = []
        interrupted = True
        try:
            while True:
                try:
                    comment = next(generator)
                except StopIteration:
                    # Normal exhaustion is the only non-interrupted outcome
                    interrupted = False
                    break
                comments.append(comment)
        except KeyboardInterrupt:
            self.to_screen('Interrupted by user')
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)

        comment_count = len(comments)
        self.to_screen(f'Extracted {comment_count} comments')
        return {
            'comments': comments,
            'comment_count': None if interrupted else comment_count,
        }
    return extractor
3716
def _get_comments(self, *args, **kwargs):
    """Subclass hook called by extract_comments; the base version always raises."""
    raise NotImplementedError('This method must be implemented by subclasses')
3719
912e0b7e
YCH
@staticmethod
def _merge_subtitle_items(subtitle_list1, subtitle_list2):
    """ Merge subtitle items for one language. Items of the second list whose
    (url, data) pair already occurs in the first list are dropped. """
    def identity(item):
        return item.get('url'), item.get('data')

    known = set(map(identity, subtitle_list1))
    merged = list(subtitle_list1)
    for item in subtitle_list2:
        if identity(item) not in known:
            merged.append(item)
    return merged
3728
@classmethod
def _merge_subtitles(cls, *dicts, target=None):
    """ Merge subtitle dictionaries, language by language.

    When `target` is given it is updated in place and returned; otherwise a
    new dict is returned. """
    merged = {} if target is None else target
    for subtitle_dict in dicts:
        for lang, items in subtitle_dict.items():
            existing = merged.get(lang, [])
            merged[lang] = cls._merge_subtitle_items(existing, items)
    return merged
912e0b7e 3738
def extract_automatic_captions(self, *args, **kwargs):
    """
    Return the result of _get_automatic_captions(*args, **kwargs) when the
    'writeautomaticsub' or 'listsubtitles' param is set; otherwise an empty dict.
    """
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
3744
def _get_automatic_captions(self, *args, **kwargs):
    """Subclass hook called by extract_automatic_captions; the base version always raises."""
    raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3747
@functools.cached_property
def _cookies_passed(self):
    """Whether cookies have been passed to YoutubeDL"""
    cookie_params = ('cookiefile', 'cookiesfrombrowser')
    return any(self.get_param(key) is not None for key in cookie_params)
3752
def mark_watched(self, *args, **kwargs):
    """
    Call _mark_watched(*args, **kwargs) when the 'mark_watched' param is set
    and either login info with a non-None first element is available (for
    extractors that support login) or cookies were passed.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3758
def _mark_watched(self, *args, **kwargs):
    """Subclass hook called by mark_watched; the base version always raises."""
    raise NotImplementedError('This method must be implemented by subclasses')
3761
38cce791
YCH
def geo_verification_headers(self):
    """
    Return a headers dict carrying the 'geo_verification_proxy' param as
    'Ytdl-request-proxy', or an empty dict when that param is falsy.
    """
    proxy = self.get_param('geo_verification_proxy')
    return {'Ytdl-request-proxy': proxy} if proxy else {}
3768
def _generic_id(self, url):
    """
    Derive an id from `url`: take the text after the last '/' (ignoring
    trailing slashes), drop its extension and percent-decode the rest.
    """
    last_segment = url.rstrip('/').split('/')[-1]
    stem, _ext = os.path.splitext(last_segment)
    return urllib.parse.unquote(stem)
98763ee3
YCH
3771
def _generic_title(self, url):
    """
    Derive a title from `url`: take url_basename(url), drop its extension
    and percent-decode the rest.
    """
    basename = url_basename(url)
    stem, _ext = os.path.splitext(basename)
    return urllib.parse.unquote(stem)
98763ee3 3774
@staticmethod
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
    """
    Map availability flags to a single label.

    The first truthy flag wins, in the order private, premium_only,
    subscriber_only, needs_auth, unlisted. With no truthy flag the result is
    'public' only when every flag was given (not None); otherwise None.
    """
    labelled_flags = (
        ('private', is_private),
        ('premium_only', needs_premium),
        ('subscriber_only', needs_subscription),
        ('needs_auth', needs_auth),
        ('unlisted', is_unlisted),
    )
    for label, flag in labelled_flags:
        if flag:
            return label
    if all(flag is not None for _, flag in labelled_flags):
        return 'public'
    return None
3788
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Extractor key to look the argument up under (default: self.ie_key())
    @param casesense    When false, the values are converted to lower case
    '''
    section = (ie_key or self.ie_key()).lower()
    values = traverse_obj(self._downloader.params, ('extractor_args', section, key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
5d3a0e79 3801
def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
    """
    Decide whether to process the playlist (True) or just the video (False).

    If either id is missing, the answer is whether the video id is missing.
    Otherwise a non-None 'force_noplaylist' in smuggled_data decides, and
    failing that the 'noplaylist' param does, with a message printed either
    way. An id given as True is left out of the message.
    """
    if not (playlist_id and video_id):
        return not video_id

    forced_noplaylist = (smuggled_data or {}).get('force_noplaylist')
    if forced_noplaylist is not None:
        return not forced_noplaylist

    video_suffix = '' if video_id is True else f' {video_id}'
    playlist_suffix = '' if playlist_id is True else f' {playlist_id}'
    if self.get_param('noplaylist'):
        self.to_screen(f'Downloading just the {video_label}{video_suffix} because of --no-playlist')
        return False
    self.to_screen(f'Downloading {playlist_label}{playlist_suffix} - add --no-playlist to download just the {video_label}{video_suffix}')
    return True
3817
8dbe9899 3818
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        """Build the URL regex: search key, optional count prefix, ':' and the query"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Translate the count prefix into a result count and fetch that many results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            # No count given: a single result
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError(f'invalid download number {requested} for query "{query}"')
        if requested > self._MAX_RESULTS:
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # An infinite count means "no upper bound" for islice
        limit = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, limit), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public accessor for the class's _SEARCH_KEY"""
        return cls._SEARCH_KEY