]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[tiktok] Fix 53dad39e30b007feed4b6d4776bd15d28c27a96c
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
fd475508 1# coding: utf-8
6a3828fd 2from __future__ import unicode_literals
f1a9d64e 3
d6983cb4 4import base64
234416e4 5import collections
3ec05685 6import hashlib
cc16383f 7import itertools
3d3538e4 8import json
4094b6e3 9import netrc
d6983cb4 10import os
773f291d 11import random
d6983cb4 12import re
d6983cb4 13import sys
4094b6e3 14import time
1bac3455 15import math
d6983cb4 16
8c25f81b 17from ..compat import (
6c22cee6 18 compat_cookiejar_Cookie,
f7ad7160 19 compat_cookies_SimpleCookie,
ee0ba927 20 compat_etree_Element,
e9c0cdd3 21 compat_etree_fromstring,
0001fcb5 22 compat_expanduser,
e64b7569 23 compat_getpass,
d6983cb4 24 compat_http_client,
e9c0cdd3
YCH
25 compat_os_name,
26 compat_str,
d6983cb4 27 compat_urllib_error,
98763ee3 28 compat_urllib_parse_unquote,
15707c7e 29 compat_urllib_parse_urlencode,
41d06b04 30 compat_urllib_request,
f0b5d6af 31 compat_urlparse,
e01c3d2e 32 compat_xml_parse_error,
8c25f81b 33)
eb8a4433 34from ..downloader import FileDownloader
48107c19
S
35from ..downloader.f4m import (
36 get_base_url,
37 remove_encrypted_media,
38)
8c25f81b 39from ..utils import (
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
d6983cb4
PH
43 clean_html,
44 compiled_regex_type,
70f0f5a8 45 determine_ext,
46b18f23 46 determine_protocol,
d493f15c 47 dict_get,
9b9c5355 48 error_to_compat_str,
46b18f23 49 extract_attributes,
b868936c 50 ExtractorError,
97f4aecf 51 fix_xml_ampersands,
b14f3a4c 52 float_or_none,
b868936c 53 format_field,
773f291d
S
54 GeoRestrictedError,
55 GeoUtils,
31bb8d3f 56 int_or_none,
34921b43 57 join_nonempty,
a4a554a7 58 js_to_json,
0685d972 59 JSON_LD_RE,
46b18f23 60 mimetype2ext,
3158150c 61 network_exceptions,
b868936c 62 NO_DEFAULT,
46b18f23 63 orderedSet,
d493f15c 64 parse_bitrate,
46b18f23
JH
65 parse_codecs,
66 parse_duration,
4ca2a3cf 67 parse_iso8601,
46b18f23 68 parse_m3u8_attributes,
d493f15c 69 parse_resolution,
55b3e45b 70 RegexNotFoundError,
46b18f23 71 sanitize_filename,
b868936c 72 sanitized_Request,
d493f15c 73 str_or_none,
ce5b9040 74 str_to_int,
f856816b 75 strip_or_none,
5d3a0e79 76 traverse_obj,
f38de77f 77 unescapeHTML,
0db3bae8 78 UnsupportedError,
647eab45 79 unified_strdate,
6b3a3098 80 unified_timestamp,
46b18f23
JH
81 update_Request,
82 update_url_query,
a107193e 83 url_basename,
bebef109 84 url_or_none,
b868936c 85 urljoin,
6606817a 86 variadic,
a6571f10 87 xpath_element,
8d6765cf
S
88 xpath_text,
89 xpath_with_ns,
d6983cb4 90)
c342041f 91
d6983cb4
PH
92
93class InfoExtractor(object):
94 """Information Extractor class.
95
96 Information extractors are the classes that, given a URL, extract
97 information about the video (or videos) the URL refers to. This
98 information includes the real video URL, the video title, author and
99 others. The information is stored in a dictionary which is then
5d380852 100 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
101 information possibly downloading the video to the file system, among
102 other possible outcomes.
103
cf0649f8 104 The type field determines the type of the result.
fed5d032
PH
105 By far the most common value (and the default if _type is missing) is
106 "video", which indicates a single video.
107
108 For a video, the dictionaries must include the following fields:
d6983cb4
PH
109
110 id: Video identifier.
d6983cb4 111 title: Video title, unescaped.
d67b0b15 112
f49d89ee 113 Additionally, it must contain either a formats entry or a url one:
d67b0b15 114
f49d89ee
PH
115 formats: A list of dictionaries for each format available, ordered
116 from worst to best quality.
117
118 Potential fields:
c790e93a
S
119 * url The mandatory URL representing the media:
120 for plain file media - HTTP URL of this file,
121 for RTMP - RTMP URL,
122 for HLS - URL of the M3U8 media playlist,
123 for HDS - URL of the F4M manifest,
79d2077e
S
124 for DASH
125 - HTTP URL to plain file media (in case of
126 unfragmented media)
127 - URL of the MPD manifest or base URL
128 representing the media if MPD manifest
8ed7a233 129 is parsed from a string (in case of
79d2077e 130 fragmented media)
c790e93a 131 for MSS - URL of the ISM manifest.
86f4d14f
S
132 * manifest_url
133 The URL of the manifest file in case of
c790e93a
S
134 fragmented media:
135 for HLS - URL of the M3U8 master playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH - URL of the MPD manifest,
138 for MSS - URL of the ISM manifest.
10952eb2 139 * ext Will be calculated from URL if missing
d67b0b15
PH
140 * format A human-readable description of the format
141 ("mp4 container with h264/opus").
142 Calculated from the format_id, width, height.
143 and format_note fields if missing.
144 * format_id A short description of the format
5d4f3985
PH
145 ("mp4_h264_opus" or "19").
146 Technically optional, but strongly recommended.
d67b0b15
PH
147 * format_note Additional info about the format
148 ("3D" or "DASH video")
149 * width Width of the video, if known
150 * height Height of the video, if known
f49d89ee 151 * resolution Textual description of width and height
176f1866 152 * dynamic_range The dynamic range of the video. One of:
 153 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
7217e148 154 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
155 * abr Average audio bitrate in KBit/s
156 * acodec Name of the audio codec in use
dd27fd17 157 * asr Audio sampling rate in Hertz
d67b0b15 158 * vbr Average video bitrate in KBit/s
fbb21cf5 159 * fps Frame rate
d67b0b15 160 * vcodec Name of the video codec in use
1394ce65 161 * container Name of the container format
d67b0b15 162 * filesize The number of bytes, if known in advance
9732d77e 163 * filesize_approx An estimate for the number of bytes
d67b0b15 164 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
165 * protocol The protocol that will be used for the actual
166 download, lower-case.
0fa9a1e2 167 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
af7d5a63 168 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
169 * fragment_base_url
170 Base URL for fragments. Each fragment's path
171 value (if present) will be relative to
172 this URL.
173 * fragments A list of fragments of a fragmented media.
174 Each fragment entry must contain either an url
175 or a path. If an url is present it should be
176 considered by a client. Otherwise both path and
177 fragment_base_url must be present. Here is
178 the list of all potential fields:
179 * "url" - fragment's URL
180 * "path" - fragment's path relative to
181 fragment_base_url
a0d5077c
S
182 * "duration" (optional, int or float)
183 * "filesize" (optional, int)
f49d89ee 184 * preference Order number of this format. If this field is
08d13955 185 present and not None, the formats get sorted
38d63d84 186 by this field, regardless of all other values.
f49d89ee
PH
187 -1 for default (order by other properties),
188 -2 or smaller for less than default.
e65566a9
PH
189 < -1000 to hide the format (if there is
190 another one which is strictly better)
32f90364
PH
191 * language Language code, e.g. "de" or "en-US".
192 * language_preference Is this in the language mentioned in
193 the URL?
aff2f4f4
PH
194 10 if it's what the URL is about,
195 -1 for default (don't know),
196 -10 otherwise, other values reserved for now.
5d73273f
PH
197 * quality Order number of the video quality of this
198 format, irrespective of the file format.
199 -1 for default (order by other properties),
200 -2 or smaller for less than default.
c64ed2a3
PH
201 * source_preference Order number for this video source
202 (quality takes higher priority)
203 -1 for default (order by other properties),
204 -2 or smaller for less than default.
d769be6c
PH
205 * http_headers A dictionary of additional HTTP headers
206 to add to the request.
6271f1ca 207 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
208 video's pixels are not square.
209 width : height ratio as float.
210 * no_resume The server does not support resuming the
211 (HTTP or RTMP) download. Boolean.
88acdbc2 212 * has_drm The format has DRM and cannot be downloaded. Boolean
00c97e3e
S
213 * downloader_options A dictionary of downloader options as
214 described in FileDownloader
3b1fe47d 215 RTMP formats can also have the additional fields: page_url,
216 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
217 rtmp_protocol, rtmp_real_time
3dee7826 218
c0ba0f48 219 url: Final video URL.
d6983cb4 220 ext: Video filename extension.
d67b0b15
PH
221 format: The video format, defaults to ext (used for --get-format)
222 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 223
d6983cb4
PH
224 The following fields are optional:
225
f5e43bc6 226 alt_title: A secondary title of the video.
0afef30b
PH
227 display_id An alternative identifier for the video, not necessarily
228 unique, but available before title. Typically, id is
229 something like "4234987", title "Dancing naked mole rats",
230 and display_id "dancing-naked-mole-rats"
d5519808 231 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 232 * "id" (optional, string) - Thumbnail format ID
d5519808 233 * "url"
cfb56d1a 234 * "preference" (optional, int) - quality of the image
d5519808
PH
235 * "width" (optional, int)
236 * "height" (optional, int)
5e1c39ac 237 * "resolution" (optional, string "{width}x{height}",
d5519808 238 deprecated)
2de624fd 239 * "filesize" (optional, int)
d6983cb4 240 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 241 description: Full video description.
d6983cb4 242 uploader: Full name of the video uploader.
2bc0c46f 243 license: License name the video is licensed under.
8a92e51c 244 creator: The creator of the video.
10db0d2f 245 release_timestamp: UNIX timestamp of the moment the video was released.
8aab976b 246 release_date: The date (YYYYMMDD) when the video was released.
10db0d2f 247 timestamp: UNIX timestamp of the moment the video was uploaded
d6983cb4 248 upload_date: Video upload date (YYYYMMDD).
955c4514 249 If not explicitly set, calculated from timestamp.
d6983cb4 250 uploader_id: Nickname or id of the video uploader.
7bcd2830 251 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 252 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 253 Note that channel fields may or may not repeat uploader
6f1f59f3
S
254 fields. This depends on a particular extractor.
255 channel_id: Id of the channel.
256 channel_url: Full URL to a channel webpage.
da9ec3b9 257 location: Physical location where the video was filmed.
a504ced0 258 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
259 {tag: subformats}. "tag" is usually a language code, and
260 "subformats" is a list sorted from lower to higher
261 preference, each element is a dictionary with the "ext"
262 entry and one of:
a504ced0 263 * "data": The subtitles file contents
10952eb2 264 * "url": A URL pointing to the subtitles file
2412044c 265 It can optionally also have:
266 * "name": Name or description of the subtitles
4bba3716 267 "ext" will be calculated from URL if missing
e167860c 268 automatic_captions: Like 'subtitles'; contains automatically generated
269 captions instead of normal subtitles
62d231c0 270 duration: Length of the video in seconds, as an integer or float.
f3d29461 271 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
272 like_count: Number of positive ratings of the video
273 dislike_count: Number of negative ratings of the video
02835c6b 274 repost_count: Number of reposts of the video
2d30521a 275 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 276 comment_count: Number of comments on the video
dd622d7c
PH
277 comments: A list of comments, each with one or more of the following
278 properties (all but one of text or html optional):
279 * "author" - human-readable name of the comment author
280 * "author_id" - user ID of the comment author
a1c5d2ca 281 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
282 * "id" - Comment ID
283 * "html" - Comment as HTML
284 * "text" - Plain text of the comment
285 * "timestamp" - UNIX timestamp of comment
286 * "parent" - ID of the comment this one is replying to.
287 Set to "root" to indicate that this is a
288 comment to the original video.
a1c5d2ca
M
289 * "like_count" - Number of positive ratings of the comment
290 * "dislike_count" - Number of negative ratings of the comment
291 * "is_favorited" - Whether the comment is marked as
292 favorite by the video uploader
293 * "author_is_uploader" - Whether the comment is made by
294 the video uploader
8dbe9899 295 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 296 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
297 should allow to get the same result again. (It will be set
298 by YoutubeDL if it's missing)
ad3bc6ac
PH
299 categories: A list of categories that the video falls in, for example
300 ["Sports", "Berlin"]
864f24bd 301 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 302 cast: A list of the video cast
7267bd53
PH
303 is_live: True, False, or None (=unknown). Whether this video is a
304 live stream that goes on instead of a fixed-length video.
f76ede8e 305 was_live: True, False, or None (=unknown). Whether this video was
306 originally a live stream.
3dbb2a9d 307 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
ae30b840 308 If absent, automatically set from is_live, was_live
7c80519c 309 start_time: Time in seconds where the reproduction should start, as
10952eb2 310 specified in the URL.
297a564b 311 end_time: Time in seconds where the reproduction should end, as
10952eb2 312 specified in the URL.
55949fed 313 chapters: A list of dictionaries, with the following entries:
314 * "start_time" - The start time of the chapter in seconds
315 * "end_time" - The end time of the chapter in seconds
316 * "title" (optional, string)
6cfda058 317 playable_in_embed: Whether this video is allowed to play in embedded
318 players on other sites. Can be True (=always allowed),
319 False (=never allowed), None (=unknown), or a string
c224251a
M
320 specifying the criteria for embedability (Eg: 'whitelist')
321 availability: Under what condition the video is available. One of
322 'private', 'premium_only', 'subscriber_only', 'needs_auth',
323 'unlisted' or 'public'. Use 'InfoExtractor._availability'
324 to set it
277d6ff5 325 __post_extractor: A function to be called just before the metadata is
326 written to either disk, logger or console. The function
327 must return a dict which will be added to the info_dict.
 328 This is useful for additional information that is
329 time-consuming to extract. Note that the fields thus
330 extracted will not be available to output template and
331 match_filter. So, only "comments" and "comment_count" are
332 currently allowed to be extracted via this method.
d6983cb4 333
7109903e
S
334 The following fields should only be used when the video belongs to some logical
335 chapter or section:
336
337 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
338 chapter_number: Number of the chapter the video belongs to, as an integer.
339 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
340
341 The following fields should only be used when the video is an episode of some
8d76bdf1 342 series, programme or podcast:
7109903e
S
343
344 series: Title of the series or programme the video episode belongs to.
9ac24e23 345 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
7109903e 346 season: Title of the season the video episode belongs to.
27bfd4e5
S
347 season_number: Number of the season the video episode belongs to, as an integer.
348 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
349 episode: Title of the video episode. Unlike mandatory video title field,
350 this field should denote the exact title of the video episode
351 without any kind of decoration.
27bfd4e5
S
352 episode_number: Number of the video episode within a season, as an integer.
353 episode_id: Id of the video episode, as a unicode string.
7109903e 354
7a93ab5f
S
355 The following fields should only be used when the media is a track or a part of
356 a music album:
357
358 track: Title of the track.
359 track_number: Number of the track within an album or a disc, as an integer.
360 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
361 as a unicode string.
362 artist: Artist(s) of the track.
363 genre: Genre(s) of the track.
364 album: Title of the album the track belongs to.
365 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
366 album_artist: List of all artists appeared on the album (e.g.
367 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
368 and compilations).
369 disc_number: Number of the disc or other physical medium the track belongs to,
370 as an integer.
371 release_year: Year (YYYY) when the album was released.
372
deefc05b 373 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 374
d838b1bd
PH
375 Unless mentioned otherwise, None is equivalent to absence of information.
376
fed5d032
PH
377
378 _type "playlist" indicates multiple videos.
b82f815f
PH
379 There must be a key "entries", which is a list, an iterable, or a PagedList
380 object, each element of which is a valid dictionary by this specification.
fed5d032 381
b60419c5 382 Additionally, playlists can have "id", "title", and any other relevant
383 attributes with the same semantics as videos (see above).
fed5d032
PH
384
385
386 _type "multi_video" indicates that there are multiple videos that
387 form a single show, for examples multiple acts of an opera or TV episode.
388 It must have an entries key like a playlist and contain all the keys
389 required for a video at the same time.
390
391
392 _type "url" indicates that the video must be extracted from another
393 location, possibly by a different extractor. Its only required key is:
394 "url" - the next URL to extract.
f58766ce
PH
395 The key "ie_key" can be set to the class name (minus the trailing "IE",
396 e.g. "Youtube") if the extractor class is known in advance.
397 Additionally, the dictionary may have any properties of the resolved entity
398 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
399 known ahead of time.
400
401
402 _type "url_transparent" entities have the same specification as "url", but
403 indicate that the given additional information is more precise than the one
404 associated with the resolved URL.
405 This is useful when a site employs a video service that hosts the video and
406 its technical metadata, but that video service does not embed a useful
407 title, description etc.
408
409
d6983cb4
PH
410 Subclasses of this one should re-define the _real_initialize() and
411 _real_extract() methods and define a _VALID_URL regexp.
412 Probably, they should also be added to the list of extractors.
413
e6f21b3d 414 Subclasses may also override suitable() if necessary, but ensure the function
415 signature is preserved and that this function imports everything it needs
416 (except other extractors), so that lazy_extractors works correctly
417
4248dad9 418 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
419 geo restriction bypass mechanisms for a particular extractor.
420 Though it won't disable explicit geo restriction bypass based on
504f20dd 421 country code provided with geo_bypass_country.
4248dad9
S
422
423 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
424 countries for this extractor. One of these countries will be used by
425 geo restriction bypass mechanism right away in order to bypass
504f20dd 426 geo restriction, of course, if the mechanism is not disabled.
773f291d 427
5f95927a
S
428 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
429 IP blocks in CIDR notation for this extractor. One of these IP blocks
430 will be used by geo restriction bypass mechanism similarly
504f20dd 431 to _GEO_COUNTRIES.
3ccdde8c 432
e6f21b3d 433 The _WORKING attribute should be set to False for broken IEs
d6983cb4
PH
434 in order to warn the users and skip the tests.
435 """
436
437 _ready = False
438 _downloader = None
773f291d 439 _x_forwarded_for_ip = None
4248dad9
S
440 _GEO_BYPASS = True
441 _GEO_COUNTRIES = None
5f95927a 442 _GEO_IP_BLOCKS = None
d6983cb4
PH
443 _WORKING = True
444
9d5d4d64 445 _LOGIN_HINTS = {
0930b11f 446 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
9d5d4d64 447 'cookies': (
a0c716bb 448 'Use --cookies-from-browser or --cookies for the authentication. '
449 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
0930b11f 450 'password': 'Use --username and --password, or --netrc to provide account credentials',
9d5d4d64 451 }
452
d6983cb4 453 def __init__(self, downloader=None):
49a57e70 454 """Constructor. Receives an optional downloader (a YoutubeDL instance).
455 If a downloader is not passed during initialization,
456 it must be set using "set_downloader()" before "extract()" is called"""
d6983cb4 457 self._ready = False
773f291d 458 self._x_forwarded_for_ip = None
28f436ba 459 self._printed_messages = set()
d6983cb4
PH
460 self.set_downloader(downloader)
461
462 @classmethod
5ad28e7f 463 def _match_valid_url(cls, url):
79cb2577
PH
464 # This does not use has/getattr intentionally - we want to know whether
465 # we have cached the regexp for *this* class, whereas getattr would also
466 # match the superclass
467 if '_VALID_URL_RE' not in cls.__dict__:
2c4aaadd 468 if '_VALID_URL' not in cls.__dict__:
469 cls._VALID_URL = cls._make_valid_url()
79cb2577 470 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 471 return cls._VALID_URL_RE.match(url)
472
473 @classmethod
474 def suitable(cls, url):
475 """Receives a URL and returns True if suitable for this IE."""
3fb4e21b 476 # This function must import everything it needs (except other extractors),
477 # so that lazy_extractors works correctly
5ad28e7f 478 return cls._match_valid_url(url) is not None
d6983cb4 479
ed9266db
PH
480 @classmethod
481 def _match_id(cls, url):
5ad28e7f 482 return cls._match_valid_url(url).group('id')
ed9266db 483
1151c407 484 @classmethod
485 def get_temp_id(cls, url):
486 try:
487 return cls._match_id(url)
488 except (IndexError, AttributeError):
489 return None
490
d6983cb4
PH
491 @classmethod
492 def working(cls):
493 """Getter method for _WORKING."""
494 return cls._WORKING
495
496 def initialize(self):
497 """Initializes an instance (authentication, etc)."""
28f436ba 498 self._printed_messages = set()
5f95927a
S
499 self._initialize_geo_bypass({
500 'countries': self._GEO_COUNTRIES,
501 'ip_blocks': self._GEO_IP_BLOCKS,
502 })
4248dad9
S
503 if not self._ready:
504 self._real_initialize()
505 self._ready = True
506
5f95927a 507 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
508 """
509 Initialize geo restriction bypass mechanism.
510
511 This method is used to initialize geo bypass mechanism based on faking
512 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 513 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
514 IP will be passed as X-Forwarded-For HTTP header in all subsequent
515 HTTP requests.
e39b5d4a
S
516
517 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
518 during the instance initialization with _GEO_COUNTRIES and
519 _GEO_IP_BLOCKS.
e39b5d4a 520
5f95927a 521 You may also manually call it from extractor's code if geo bypass
e39b5d4a 522 information is not available beforehand (e.g. obtained during
5f95927a
S
523 extraction) or due to some other reason. In this case you should pass
524 this information in geo bypass context passed as first argument. It may
525 contain following fields:
526
527 countries: List of geo unrestricted countries (similar
528 to _GEO_COUNTRIES)
529 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
530 (similar to _GEO_IP_BLOCKS)
531
e39b5d4a 532 """
773f291d 533 if not self._x_forwarded_for_ip:
5f95927a
S
534
535 # Geo bypass mechanism is explicitly disabled by user
a06916d9 536 if not self.get_param('geo_bypass', True):
5f95927a
S
537 return
538
539 if not geo_bypass_context:
540 geo_bypass_context = {}
541
542 # Backward compatibility: previously _initialize_geo_bypass
543 # expected a list of countries, some 3rd party code may still use
544 # it this way
545 if isinstance(geo_bypass_context, (list, tuple)):
546 geo_bypass_context = {
547 'countries': geo_bypass_context,
548 }
549
550 # The whole point of geo bypass mechanism is to fake IP
551 # as X-Forwarded-For HTTP header based on some IP block or
552 # country code.
553
554 # Path 1: bypassing based on IP block in CIDR notation
555
556 # Explicit IP block specified by user, use it right away
557 # regardless of whether extractor is geo bypassable or not
a06916d9 558 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
559
560 # Otherwise use random IP block from geo bypass context but only
561 # if extractor is known as geo bypassable
562 if not ip_block:
563 ip_blocks = geo_bypass_context.get('ip_blocks')
564 if self._GEO_BYPASS and ip_blocks:
565 ip_block = random.choice(ip_blocks)
566
567 if ip_block:
568 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
0760b0a7 569 self._downloader.write_debug(
570 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
5f95927a
S
571 return
572
573 # Path 2: bypassing based on country code
574
575 # Explicit country code specified by user, use it right away
576 # regardless of whether extractor is geo bypassable or not
a06916d9 577 country = self.get_param('geo_bypass_country', None)
5f95927a
S
578
579 # Otherwise use random country code from geo bypass context but
580 # only if extractor is known as geo bypassable
581 if not country:
582 countries = geo_bypass_context.get('countries')
583 if self._GEO_BYPASS and countries:
584 country = random.choice(countries)
585
586 if country:
587 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 588 self._downloader.write_debug(
589 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
d6983cb4
PH
590
591 def extract(self, url):
592 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 593 try:
773f291d
S
594 for _ in range(2):
595 try:
596 self.initialize()
a06916d9 597 self.write_debug('Extracting URL: %s' % url)
0016b84e 598 ie_result = self._real_extract(url)
07cce701 599 if ie_result is None:
600 return None
0016b84e
S
601 if self._x_forwarded_for_ip:
602 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
53ed7066 603 subtitles = ie_result.get('subtitles')
604 if (subtitles and 'live_chat' in subtitles
a06916d9 605 and 'no-live-chat' in self.get_param('compat_opts', [])):
53ed7066 606 del subtitles['live_chat']
0016b84e 607 return ie_result
773f291d 608 except GeoRestrictedError as e:
4248dad9
S
609 if self.__maybe_fake_ip_and_retry(e.countries):
610 continue
773f291d 611 raise
0db3bae8 612 except UnsupportedError:
613 raise
1151c407 614 except ExtractorError as e:
0db3bae8 615 kwargs = {
616 'video_id': e.video_id or self.get_temp_id(url),
617 'ie': self.IE_NAME,
618 'tb': e.traceback,
619 'expected': e.expected,
620 'cause': e.cause
621 }
622 if hasattr(e, 'countries'):
623 kwargs['countries'] = e.countries
624 raise type(e)(e.msg, **kwargs)
3a5bcd03 625 except compat_http_client.IncompleteRead as e:
1151c407 626 raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
9650885b 627 except (KeyError, StopIteration) as e:
1151c407 628 raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
d6983cb4 629
4248dad9 630 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 631 if (not self.get_param('geo_bypass_country', None)
3089bc74 632 and self._GEO_BYPASS
a06916d9 633 and self.get_param('geo_bypass', True)
3089bc74
S
634 and not self._x_forwarded_for_ip
635 and countries):
eea0716c
S
636 country_code = random.choice(countries)
637 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
638 if self._x_forwarded_for_ip:
639 self.report_warning(
eea0716c
S
640 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
641 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
642 return True
643 return False
644
d6983cb4
PH
645 def set_downloader(self, downloader):
646 """Sets the downloader for this IE."""
647 self._downloader = downloader
648
649 def _real_initialize(self):
650 """Real initialization process. Redefine in subclasses."""
651 pass
652
653 def _real_extract(self, url):
654 """Real extraction process. Redefine in subclasses."""
655 pass
656
56c73665
JMF
657 @classmethod
658 def ie_key(cls):
659 """A string for getting the InfoExtractor with get_info_extractor"""
3fb4e21b 660 return cls.__name__[:-2]
56c73665 661
d6983cb4
PH
662 @property
663 def IE_NAME(self):
dc519b54 664 return compat_str(type(self).__name__[:-2])
d6983cb4 665
d391b7e2
S
666 @staticmethod
667 def __can_accept_status_code(err, expected_status):
668 assert isinstance(err, compat_urllib_error.HTTPError)
669 if expected_status is None:
670 return False
d391b7e2
S
671 elif callable(expected_status):
672 return expected_status(err.code) is True
673 else:
6606817a 674 return err.code in variadic(expected_status)
d391b7e2
S
675
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        # Politeness delay between requests (--sleep-requests); the very
        # first request of the session is exempt.
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        # note=None -> default "Downloading webpage" message;
        # note=False -> suppress the message entirely.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        # Merge data/headers/query into the request, whether the caller
        # passed a Request object or a plain URL string.
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                # Failed status codes the caller declared acceptable via
                # expected_status still yield a readable response handle.
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            # errnote=False -> fail silently (return False, no message).
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                # NOTE(review): second positional arg looks like the
                # traceback (sys.exc_info()[2]) expected by ExtractorError —
                # confirm against ExtractorError's signature.
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False
d391b7e2
S
739 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
740 """
741 Return a tuple (page content as string, URL handle).
742
743 See _download_webpage docstring for arguments specification.
744 """
b9d3e163
PH
745 # Strip hashes from the URL (#1038)
746 if isinstance(url_or_request, (compat_str, str)):
747 url_or_request = url_or_request.partition('#')[0]
748
d391b7e2 749 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
750 if urlh is False:
751 assert not fatal
752 return False
c9a77969 753 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
754 return (content, urlh)
755
c9a77969
YCH
756 @staticmethod
757 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
758 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
759 if m:
760 encoding = m.group(1)
761 else:
0d75ae2c 762 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
763 webpage_bytes[:1024])
764 if m:
765 encoding = m.group(1).decode('ascii')
b60016e8
PH
766 elif webpage_bytes.startswith(b'\xff\xfe'):
767 encoding = 'utf-16'
f143d86a
PH
768 else:
769 encoding = 'utf-8'
c9a77969
YCH
770
771 return encoding
772
4457823d
S
    def __check_blocked(self, content):
        """Raise ExtractorError (expected=True) if the downloaded page is a
        known censorship/filtering interstitial rather than real content."""
        first_block = content[:512]
        # Websense corporate filtering: title anywhere, vendor name near the top.
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page.
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government blocklist (TTK ISP block page + rkn.gov.ru link).
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
c9a77969
YCH
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the body of an open response handle and decode it to str.

        prefix -- optional bytes prepended to the body before decoding
        encoding -- decoding to use; guessed from headers/content when None
        Also honors the dump_intermediate_pages and write_pages options.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps binary-ish bodies printable on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                # Keep the name unique after truncation by appending an
                # md5 of the full (untruncated) name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name from the page/header — fall back to utf-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
d391b7e2
S
839 def _download_webpage(
840 self, url_or_request, video_id, note=None, errnote=None,
841 fatal=True, tries=1, timeout=5, encoding=None, data=None,
842 headers={}, query={}, expected_status=None):
843 """
844 Return the data of the page as a string.
845
846 Arguments:
847 url_or_request -- plain text URL as a string or
848 a compat_urllib_request.Requestobject
849 video_id -- Video/playlist/item identifier (string)
850
851 Keyword arguments:
852 note -- note printed before downloading (string)
853 errnote -- note printed in case of an error (string)
854 fatal -- flag denoting whether error should be considered fatal,
855 i.e. whether it should cause ExtractionError to be raised,
856 otherwise a warning will be reported and extraction continued
857 tries -- number of tries
858 timeout -- sleep interval between tries
859 encoding -- encoding for a page content decoding, guessed automatically
860 when not explicitly specified
861 data -- POST data (bytes)
862 headers -- HTTP headers (dict)
863 query -- URL query (dict)
864 expected_status -- allows to accept failed HTTP requests (non 2xx
865 status code) by explicitly specifying a set of accepted status
866 codes. Can be any of the following entities:
867 - an integer type specifying an exact failed status code to
868 accept
869 - a list or a tuple of integer types specifying a list of
870 failed status codes to accept
871 - a callable accepting an actual failed status code and
872 returning True if it should be accepted
873 Note that this argument does not affect success status codes (2xx)
874 which are always accepted.
875 """
876
995ad69c
TF
877 success = False
878 try_count = 0
879 while success is False:
880 try:
d391b7e2
S
881 res = self._download_webpage_handle(
882 url_or_request, video_id, note, errnote, fatal,
883 encoding=encoding, data=data, headers=headers, query=query,
884 expected_status=expected_status)
995ad69c
TF
885 success = True
886 except compat_http_client.IncompleteRead as e:
887 try_count += 1
888 if try_count >= tries:
889 raise e
890 self._sleep(timeout, video_id)
7cc3570e
PH
891 if res is False:
892 return res
893 else:
894 content, _ = res
895 return content
d6983cb4 896
e0d198c1
S
897 def _download_xml_handle(
898 self, url_or_request, video_id, note='Downloading XML',
899 errnote='Unable to download XML', transform_source=None,
d391b7e2
S
900 fatal=True, encoding=None, data=None, headers={}, query={},
901 expected_status=None):
902 """
ee0ba927 903 Return a tuple (xml as an compat_etree_Element, URL handle).
d391b7e2
S
904
905 See _download_webpage docstring for arguments specification.
906 """
e0d198c1
S
907 res = self._download_webpage_handle(
908 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
909 encoding=encoding, data=data, headers=headers, query=query,
910 expected_status=expected_status)
e0d198c1
S
911 if res is False:
912 return res
913 xml_string, urlh = res
914 return self._parse_xml(
915 xml_string, video_id, transform_source=transform_source,
916 fatal=fatal), urlh
917
d391b7e2
S
918 def _download_xml(
919 self, url_or_request, video_id,
920 note='Downloading XML', errnote='Unable to download XML',
921 transform_source=None, fatal=True, encoding=None,
922 data=None, headers={}, query={}, expected_status=None):
923 """
ee0ba927 924 Return the xml as an compat_etree_Element.
d391b7e2
S
925
926 See _download_webpage docstring for arguments specification.
927 """
e0d198c1
S
928 res = self._download_xml_handle(
929 url_or_request, video_id, note=note, errnote=errnote,
930 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
931 data=data, headers=headers, query=query,
932 expected_status=expected_status)
e0d198c1 933 return res if res is False else res[0]
e01c3d2e
S
934
935 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
e2b38da9
PH
936 if transform_source:
937 xml_string = transform_source(xml_string)
e01c3d2e
S
938 try:
939 return compat_etree_fromstring(xml_string.encode('utf-8'))
940 except compat_xml_parse_error as ve:
941 errmsg = '%s: Failed to parse XML ' % video_id
942 if fatal:
943 raise ExtractorError(errmsg, cause=ve)
944 else:
945 self.report_warning(errmsg + str(ve))
267ed0c5 946
0fe7783e
S
947 def _download_json_handle(
948 self, url_or_request, video_id, note='Downloading JSON metadata',
949 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
950 fatal=True, encoding=None, data=None, headers={}, query={},
951 expected_status=None):
952 """
953 Return a tuple (JSON object, URL handle).
954
955 See _download_webpage docstring for arguments specification.
956 """
0fe7783e 957 res = self._download_webpage_handle(
c9a77969 958 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
959 encoding=encoding, data=data, headers=headers, query=query,
960 expected_status=expected_status)
0fe7783e
S
961 if res is False:
962 return res
963 json_string, urlh = res
ebb64199 964 return self._parse_json(
0fe7783e
S
965 json_string, video_id, transform_source=transform_source,
966 fatal=fatal), urlh
967
968 def _download_json(
969 self, url_or_request, video_id, note='Downloading JSON metadata',
970 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
971 fatal=True, encoding=None, data=None, headers={}, query={},
972 expected_status=None):
973 """
974 Return the JSON object as a dict.
975
976 See _download_webpage docstring for arguments specification.
977 """
0fe7783e
S
978 res = self._download_json_handle(
979 url_or_request, video_id, note=note, errnote=errnote,
980 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
981 data=data, headers=headers, query=query,
982 expected_status=expected_status)
0fe7783e 983 return res if res is False else res[0]
ebb64199
TF
984
985 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
986 if transform_source:
987 json_string = transform_source(json_string)
3d3538e4
PH
988 try:
989 return json.loads(json_string)
990 except ValueError as ve:
e7b6d122
PH
991 errmsg = '%s: Failed to parse JSON ' % video_id
992 if fatal:
993 raise ExtractorError(errmsg, cause=ve)
994 else:
995 self.report_warning(errmsg + str(ve))
3d3538e4 996
adddc50c 997 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
998 return self._parse_json(
999 data[data.find('{'):data.rfind('}') + 1],
1000 video_id, transform_source, fatal)
1001
1002 def _download_socket_json_handle(
1003 self, url_or_request, video_id, note='Polling socket',
1004 errnote='Unable to poll socket', transform_source=None,
1005 fatal=True, encoding=None, data=None, headers={}, query={},
1006 expected_status=None):
1007 """
1008 Return a tuple (JSON object, URL handle).
1009
1010 See _download_webpage docstring for arguments specification.
1011 """
1012 res = self._download_webpage_handle(
1013 url_or_request, video_id, note, errnote, fatal=fatal,
1014 encoding=encoding, data=data, headers=headers, query=query,
1015 expected_status=expected_status)
1016 if res is False:
1017 return res
1018 webpage, urlh = res
1019 return self._parse_socket_response_as_json(
1020 webpage, video_id, transform_source=transform_source,
1021 fatal=fatal), urlh
1022
1023 def _download_socket_json(
1024 self, url_or_request, video_id, note='Polling socket',
1025 errnote='Unable to poll socket', transform_source=None,
1026 fatal=True, encoding=None, data=None, headers={}, query={},
1027 expected_status=None):
1028 """
1029 Return the JSON object as a dict.
1030
1031 See _download_webpage docstring for arguments specification.
1032 """
1033 res = self._download_socket_json_handle(
1034 url_or_request, video_id, note=note, errnote=errnote,
1035 transform_source=transform_source, fatal=fatal, encoding=encoding,
1036 data=data, headers=headers, query=query,
1037 expected_status=expected_status)
1038 return res if res is False else res[0]
1039
28f436ba 1040 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
b868936c 1041 idstr = format_field(video_id, template='%s: ')
28f436ba 1042 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1043 if only_once:
1044 if f'WARNING: {msg}' in self._printed_messages:
1045 return
1046 self._printed_messages.add(f'WARNING: {msg}')
1047 self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1048
a06916d9 1049 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1050 """Print msg to screen, prefixing it with '[ie_name]'"""
a06916d9 1051 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1052
1053 def write_debug(self, msg, *args, **kwargs):
1054 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1055
1056 def get_param(self, name, default=None, *args, **kwargs):
1057 if self._downloader:
1058 return self._downloader.params.get(name, default, *args, **kwargs)
1059 return default
d6983cb4 1060
    def report_drm(self, video_id, partial=False):
        # Surface a standard "DRM protected" failure via raise_no_formats
        # (expected=True, so it is treated as a known restriction, not a bug).
        # NOTE(review): `partial` is currently unused — presumably reserved
        # for partially DRM-protected content; confirm intended semantics.
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
d6983cb4
PH
1064 def report_extraction(self, id_or_name):
1065 """Report information extraction."""
f1a9d64e 1066 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1067
1068 def report_download_webpage(self, video_id):
1069 """Report webpage download."""
f1a9d64e 1070 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1071
1072 def report_age_confirmation(self):
1073 """Report attempt to confirm age."""
f1a9d64e 1074 self.to_screen('Confirming age')
d6983cb4 1075
fc79158d
JMF
1076 def report_login(self):
1077 """Report attempt to log in."""
f1a9d64e 1078 self.to_screen('Logging in')
fc79158d 1079
b7da73eb 1080 def raise_login_required(
9d5d4d64 1081 self, msg='This video is only available for registered users',
1082 metadata_available=False, method='any'):
f2ebc5c7 1083 if metadata_available and (
1084 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1085 self.report_warning(msg)
46890374 1086 if method is not None:
1087 msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1088 raise ExtractorError(msg, expected=True)
43e7d3c9 1089
b7da73eb 1090 def raise_geo_restricted(
1091 self, msg='This video is not available from your location due to geo restriction',
1092 countries=None, metadata_available=False):
f2ebc5c7 1093 if metadata_available and (
1094 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1095 self.report_warning(msg)
1096 else:
1097 raise GeoRestrictedError(msg, countries=countries)
1098
1099 def raise_no_formats(self, msg, expected=False, video_id=None):
f2ebc5c7 1100 if expected and (
1101 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
b7da73eb 1102 self.report_warning(msg, video_id)
68f5867c
L
1103 elif isinstance(msg, ExtractorError):
1104 raise msg
b7da73eb 1105 else:
1106 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1107
5f6a1245 1108 # Methods for following #608
c0d0b01f 1109 @staticmethod
ec3f6640 1110 def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
10952eb2 1111 """Returns a URL that points to a page that should be processed"""
5f6a1245 1112 # TODO: ie should be the class used for getting the info
d6983cb4
PH
1113 video_info = {'_type': 'url',
1114 'url': url,
1115 'ie_key': ie}
ec3f6640 1116 video_info.update(kwargs)
7012b23c
PH
1117 if video_id is not None:
1118 video_info['id'] = video_id
830d53bf
S
1119 if video_title is not None:
1120 video_info['title'] = video_title
d6983cb4 1121 return video_info
5f6a1245 1122
749ca5ec
S
1123 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1124 urls = orderedSet(
46b18f23
JH
1125 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1126 for m in matches)
1127 return self.playlist_result(
749ca5ec 1128 urls, playlist_id=playlist_id, playlist_title=playlist_title)
46b18f23 1129
c0d0b01f 1130 @staticmethod
b60419c5 1131 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
d6983cb4
PH
1132 """Returns a playlist"""
1133 video_info = {'_type': 'playlist',
1134 'entries': entries}
b60419c5 1135 video_info.update(kwargs)
d6983cb4
PH
1136 if playlist_id:
1137 video_info['id'] = playlist_id
1138 if playlist_title:
1139 video_info['title'] = playlist_title
ecc97af3 1140 if playlist_description is not None:
acf5cbfe 1141 video_info['description'] = playlist_description
d6983cb4
PH
1142 return video_info
1143
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # A single pattern (string or precompiled) is searched directly; a
        # sequence of patterns is tried in order until one matches.
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Styled field name used in warnings/errors below.
        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                # Multiple group names/indices -> tuple of their values.
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
c342041f 1177 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1178 """
1179 Like _search_regex, but strips HTML tags and unescapes entities.
1180 """
711ede6e 1181 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1182 if res:
1183 return clean_html(res).strip()
1184 else:
1185 return res
1186
2118fdd1
RA
    def _get_netrc_login_info(self, netrc_machine=None):
        """Read (username, password) for `netrc_machine` (default:
        self._NETRC_MACHINE) from the user's .netrc file.

        Only consulted when the usenetrc option is set; returns
        (None, None) otherwise or on any parse/read error (a warning is
        reported in the error case).
        """
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                # netrc_location may point at a directory containing .netrc
                # or directly at the file itself.
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    # authenticators() -> (login, account, password)
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password
1b6712ab 1210 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1211 """
cf0649f8 1212 Get the login info as (username, password)
32443dd3
S
1213 First look for the manually specified credentials using username_option
1214 and password_option as keys in params dictionary. If no such credentials
1215 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1216 value.
fc79158d
JMF
1217 If there's no info available, return (None, None)
1218 """
fc79158d
JMF
1219
1220 # Attempt to use provided username and password or .netrc data
a06916d9 1221 username = self.get_param(username_option)
1222 if username is not None:
1223 password = self.get_param(password_option)
2118fdd1 1224 else:
1b6712ab 1225 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1226
2133565c 1227 return username, password
fc79158d 1228
e64b7569 1229 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1230 """
1231 Get the two-factor authentication info
1232 TODO - asking the user will be required for sms/phone verify
1233 currently just uses the command line option
1234 If there's no info available, return None
1235 """
83317f69 1236
a06916d9 1237 tfa = self.get_param('twofactor')
1238 if tfa is not None:
1239 return tfa
83317f69 1240
e64b7569 1241 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1242
46720279
JMF
1243 # Helper functions for extracting OpenGraph info
1244 @staticmethod
ab2d5247 1245 def _og_regexes(prop):
448ef1f3 1246 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
22f5f5c6 1247 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
7a6d76a6 1248 % {'prop': re.escape(prop)})
78fb87b2 1249 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1250 return [
78fb87b2
JMF
1251 template % (property_re, content_re),
1252 template % (content_re, property_re),
ab2d5247 1253 ]
46720279 1254
864f24bd
S
1255 @staticmethod
1256 def _meta_regex(prop):
1257 return r'''(?isx)<meta
8b9848ac 1258 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1259 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1260
3c4e6d83 1261 def _og_search_property(self, prop, html, name=None, **kargs):
6606817a 1262 prop = variadic(prop)
46720279 1263 if name is None:
b070564e
S
1264 name = 'OpenGraph %s' % prop[0]
1265 og_regexes = []
1266 for p in prop:
1267 og_regexes.extend(self._og_regexes(p))
1268 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1269 if escaped is None:
1270 return None
1271 return unescapeHTML(escaped)
46720279
JMF
1272
1273 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1274 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1275
1276 def _og_search_description(self, html, **kargs):
1277 return self._og_search_property('description', html, fatal=False, **kargs)
1278
1279 def _og_search_title(self, html, **kargs):
1280 return self._og_search_property('title', html, **kargs)
1281
8ffa13e0 1282 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1283 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1284 if secure:
1285 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1286 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1287
78338f71
JMF
1288 def _og_search_url(self, html, **kargs):
1289 return self._og_search_property('url', html, **kargs)
1290
40c696e5 1291 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
6606817a 1292 name = variadic(name)
59040888 1293 if display_name is None:
88d9f6c0 1294 display_name = name[0]
59040888 1295 return self._html_search_regex(
88d9f6c0 1296 [self._meta_regex(n) for n in name],
711ede6e 1297 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1298
1299 def _dc_search_uploader(self, html):
1300 return self._html_search_meta('dc.creator', html, 'uploader')
1301
8dbe9899
PH
1302 def _rta_search(self, html):
1303 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1304 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1305 r' content="RTA-5042-1996-1400-1577-RTA"',
1306 html):
1307 return 18
1308 return 0
1309
59040888
PH
1310 def _media_rating_search(self, html):
1311 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1312 rating = self._html_search_meta('rating', html)
1313
1314 if not rating:
1315 return None
1316
1317 RATING_TABLE = {
1318 'safe for kids': 0,
1319 'general': 8,
1320 '14 years': 14,
1321 'mature': 17,
1322 'restricted': 19,
1323 }
d800609c 1324 return RATING_TABLE.get(rating.lower())
59040888 1325
69319969 1326 def _family_friendly_search(self, html):
6ca7732d 1327 # See http://schema.org/VideoObject
ac8491fc
S
1328 family_friendly = self._html_search_meta(
1329 'isFamilyFriendly', html, default=None)
69319969
NJ
1330
1331 if not family_friendly:
1332 return None
1333
1334 RATING_TABLE = {
1335 '1': 0,
1336 'true': 0,
1337 '0': 18,
1338 'false': 18,
1339 }
d800609c 1340 return RATING_TABLE.get(family_friendly.lower())
69319969 1341
0c708f11
JMF
1342 def _twitter_search_player(self, html):
1343 return self._html_search_meta('twitter:player', html,
9e1a5b84 1344 'twitter card player')
0c708f11 1345
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Find all JSON-LD <script> blocks in `html`, parse them and merge
        the metadata via _json_ld.

        Accepts `default` and `fatal` kwargs with _search_regex semantics.
        Returns the merged info dict, the default, or {} (non-fatal miss).
        """
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            # A block may hold a single object or a list of them; flatten.
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert schema.org JSON-LD metadata into an info dict.

        json_ld -- parsed JSON-LD (dict / list / tuple) or its JSON string
        video_id -- identifier used in parse error messages
        fatal -- propagate parse errors when json_ld is a string
        expected_type -- when given, only items of this @type are used
        Returns a dict containing only the non-None extracted fields.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interaction @type (last URL path component) to the
        # info-dict count field it feeds.
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested object.
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fills *_count fields of `info` from InteractionCounter entries.
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # First occurrence wins; later counters don't overwrite.
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Merges the fields of a VideoObject item into `info`.
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only top-level JSON-LD objects (those declaring @context) are
            # considered.
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected_type, keep scanning further items;
                    # with one, the first accepted item wins.
                    if expected_type is None:
                        continue
                    else:
                        break
                # Items of other types may still embed a VideoObject under
                # their 'video' property.
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
f98709af
LL
1508 def _search_nextjs_data(self, webpage, video_id, **kw):
1509 return self._parse_json(
1510 self._search_regex(
1511 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1512 webpage, 'next.js data', **kw),
1513 video_id, **kw)
1514
66f4c04e
THD
1515 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
1516 ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1517 # not all website do this, but it can be changed
1518 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1519 rectx = re.escape(context_name)
1520 js, arg_keys, arg_vals = self._search_regex(
1521 (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx,
1522 r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx),
1523 webpage, context_name, group=['js', 'arg_keys', 'arg_vals'])
1524
1525 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1526
1527 for key, val in args.items():
1528 if val in ('undefined', 'void 0'):
1529 args[key] = 'null'
1530
1531 return self._parse_json(js_to_json(js, args), video_id)['data'][0]
1532
27713812 1533 @staticmethod
f8da79f8 1534 def _hidden_inputs(html):
586f1cc5 1535 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1536 hidden_inputs = {}
c8498368
S
1537 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1538 attrs = extract_attributes(input)
1539 if not input:
201ea3ee 1540 continue
c8498368 1541 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1542 continue
c8498368
S
1543 name = attrs.get('name') or attrs.get('id')
1544 value = attrs.get('value')
1545 if name and value is not None:
1546 hidden_inputs[name] = value
201ea3ee 1547 return hidden_inputs
27713812 1548
cf61d96d
S
1549 def _form_hidden_inputs(self, form_id, html):
1550 form = self._search_regex(
73eb13df 1551 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1552 html, '%s form' % form_id, group='form')
1553 return self._hidden_inputs(form)
1554
    class FormatSort:
        """Computes a sort key for each format dict so that "better" formats
        sort last (``list.sort`` ascending puts the preferred format at the end).

        Sort fields come from three sources merged in ``evaluate_params``:
        forced/priority defaults, the user's ``format_sort`` option, and the
        extractor-supplied preference list.
        """
        # One sort-field token: optional '+' (reverse), field name, and an
        # optional '~limit' (closest-to) or ':limit' (preferred-maximum) suffix
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        # Ordering that mimics youtube-dl's behaviour (used elsewhere for compat)
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field behaviour. 'type' selects the evaluation strategy:
        #   ordered   - rank by position in 'order' (earlier = better);
        #               'regex': True means entries are regex alternatives
        #   boolean   - 0/-1 depending on membership in 'in_list'/'not_in_list'
        #   extractor - value taken from extractor preference, capped by 'max'
        #   combined  - one token expands to several underlying fields
        #   multiple  - several fields folded into one value via 'function'
        #   alias     - token is renamed to 'field' before evaluation
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     # used instead of 'order' when prefer_free_formats is set
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           # 1 if the format has any stream at all, else 0
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    # smallest non-falsy dimension, or 0 when neither is known
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
        }

        def __init__(self, ie, field_preference):
            # ie: the InfoExtractor instance; field_preference: the extractor's
            # sort-field list (may be empty)
            self._order = []
            self.ydl = ie._downloader
            self.evaluate_params(self.ydl.params, field_preference)
            if ie.get_param('verbose'):
                self.print_verbose_info(self.ydl.write_debug)

        def _get_field_setting(self, field, key):
            """Return the per-field setting `key`, synthesizing and caching a
            default when it is not explicitly declared in `settings`.

            Unknown fields get an empty settings entry (with a deprecation
            warning) so they still sort as generic 'field' types.
            """
            if field not in self.settings:
                if key in ('forced', 'priority'):
                    return False
                self.ydl.deprecation_warning(
                    f'Using arbitrary fields ({field}) for format sorting is deprecated '
                    'and may be removed in a future version')
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                # NOTE: `type` intentionally shadows the builtin here
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                # cache the computed default so later lookups are cheap
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Convert a raw field/limit value to a comparable number (or string)
            according to the field's 'convert' setting.

            With convertNone=True, None is pushed through the conversion instead
            of short-circuiting (used for 'ordered' fields where None may rank).
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                # earlier position in the order list => larger score
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # 'float_string': numeric-looking strings become floats; anything
                # else demotes the whole field to string comparison from now on
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Build self._order (the effective list of sort fields) by parsing
            forced defaults, user preferences, extractor preferences and the
            remaining defaults — first occurrence of a field wins.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Register one resolved sort field; duplicates are ignored so
                # earlier (higher-priority) sources take precedence
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # forced fields, then (unless format_sort_force) priority fields,
            # then user prefs, extractor prefs and finally the defaults
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    if alias not in ('format_id', 'preference', 'language_preference'):
                        self.ydl.deprecation_warning(
                            f'Format sorting alias {alias} is deprecated '
                            f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # 'combined' tokens expand to several concrete fields; limits
                # may be per-field (colon-separated) or shared ('same_limit')
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Log the resolved sort order (only fields marked 'visible')."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Turn one field's value into a comparison tuple.

            The leading int groups values (missing < string < comparable); the
            remaining components implement reverse/limit/closest semantics.
            """
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Fetch the value(s) for `field` from the format dict and score them."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the full sort key for one format dict.

            NOTE: mutates `format` in place, filling in missing 'protocol',
            'ext', audio/video ext and derivable bitrates before scoring.
            """
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != 'none' and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
1839
1840 def _sort_formats(self, formats, field_preference=[]):
1841 if not formats:
88acdbc2 1842 return
f304da8a 1843 format_sort = self.FormatSort(self, field_preference)
eb8a4433 1844 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1845
96a53167
S
1846 def _check_formats(self, formats, video_id):
1847 if formats:
1848 formats[:] = filter(
1849 lambda f: self._is_valid_url(
1850 f['url'], video_id,
1851 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1852 formats)
1853
f5bdb444
S
1854 @staticmethod
1855 def _remove_duplicate_formats(formats):
1856 format_urls = set()
1857 unique_formats = []
1858 for f in formats:
1859 if f['url'] not in format_urls:
1860 format_urls.add(f['url'])
1861 unique_formats.append(f)
1862 formats[:] = unique_formats
1863
45024183 1864 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1865 url = self._proto_relative_url(url, scheme='http:')
1866 # For now assume non HTTP(S) URLs always valid
1867 if not (url.startswith('http://') or url.startswith('https://')):
1868 return True
96a53167 1869 try:
45024183 1870 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1871 return True
8bdd16b4 1872 except ExtractorError as e:
25e911a9 1873 self.to_screen(
8bdd16b4 1874 '%s: %s URL is invalid, skipping: %s'
1875 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1876 return False
96a53167 1877
20991253 1878 def http_scheme(self):
1ede5b24 1879 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1880 return (
1881 'http:'
a06916d9 1882 if self.get_param('prefer_insecure', False)
20991253
PH
1883 else 'https:')
1884
57c7411f
PH
1885 def _proto_relative_url(self, url, scheme=None):
1886 if url is None:
1887 return url
1888 if url.startswith('//'):
1889 if scheme is None:
1890 scheme = self.http_scheme()
1891 return scheme + url
1892 else:
1893 return url
1894
4094b6e3
PH
1895 def _sleep(self, timeout, video_id, msg_template=None):
1896 if msg_template is None:
f1a9d64e 1897 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1898 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1899 self.to_screen(msg)
1900 time.sleep(timeout)
1901
f983b875 1902 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1903 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1904 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
f036a632
JMF
1905 manifest = self._download_xml(
1906 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1907 'Unable to download f4m manifest',
1908 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1909 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1910 transform_source=transform_source,
7360c06f 1911 fatal=fatal, data=data, headers=headers, query=query)
4de61310
S
1912
1913 if manifest is False:
8d29e47f 1914 return []
31bb8d3f 1915
0fdbb332 1916 return self._parse_f4m_formats(
f983b875 1917 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1918 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1919
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an already-downloaded f4m (Flash Media Manifest) XML element
        into a list of format dicts. Recurses into nested f4m/m3u8 manifests
        referenced by set-level manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # fall back to the 2.0 namespace when no 1.0 media entries exist
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # audio-only manifests (by mimeType) get vcodec='none'
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # fall back to the node index when no bitrate is available
            format_id = join_nonempty(f4m_id, tbr or i)
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # NOTE: rebinds `manifest_url` to the child manifest's URL;
                # relative URLs resolve against the base URL (or parent's dir)
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
2021
f983b875 2022 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 2023 return {
34921b43 2024 'format_id': join_nonempty(m3u8_id, 'meta'),
704df56d
PH
2025 'url': m3u8_url,
2026 'ext': ext,
2027 'protocol': 'm3u8',
37768f92 2028 'preference': preference - 100 if preference else -100,
f983b875 2029 'quality': quality,
704df56d
PH
2030 'resolution': 'multiple',
2031 'format_note': 'Quality selection URL',
16da9bbc
YCH
2032 }
2033
b5ae35ee 2034 def _report_ignoring_subs(self, name):
2035 self.report_warning(bug_reports_message(
2036 f'Ignoring subtitle tracks found in the {name} manifest; '
2037 'if any subtitle tracks are missing,'
2038 ), only_once=True)
2039
a0c3b2d5
F
2040 def _extract_m3u8_formats(self, *args, **kwargs):
2041 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2042 if subs:
b5ae35ee 2043 self._report_ignoring_subs('HLS')
a0c3b2d5
F
2044 return fmts
2045
2046 def _extract_m3u8_formats_and_subtitles(
177877c5 2047 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2048 preference=None, quality=None, m3u8_id=None, note=None,
2049 errnote=None, fatal=True, live=False, data=None, headers={},
2050 query={}):
2051
dbd82a1d 2052 res = self._download_webpage_handle(
81515ad9 2053 m3u8_url, video_id,
37a3bb66 2054 note='Downloading m3u8 information' if note is None else note,
2055 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 2056 fatal=fatal, data=data, headers=headers, query=query)
cb252080 2057
dbd82a1d 2058 if res is False:
a0c3b2d5 2059 return [], {}
cb252080 2060
dbd82a1d 2061 m3u8_doc, urlh = res
37113045 2062 m3u8_url = urlh.geturl()
9cdffeeb 2063
a0c3b2d5 2064 return self._parse_m3u8_formats_and_subtitles(
cb252080 2065 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 2066 preference=preference, quality=quality, m3u8_id=m3u8_id,
2067 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2068 headers=headers, query=query, video_id=video_id)
cb252080 2069
a0c3b2d5 2070 def _parse_m3u8_formats_and_subtitles(
177877c5 2071 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2072 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2073 errnote=None, fatal=True, data=None, headers={}, query={},
2074 video_id=None):
60755938 2075 formats, subtitles = [], {}
a0c3b2d5 2076
6b993ca7 2077 has_drm = re.search('|'.join([
2078 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2079 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2080 ]), m3u8_doc)
a0c3b2d5 2081
60755938 2082 def format_url(url):
2083 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2084
2085 if self.get_param('hls_split_discontinuity', False):
2086 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2087 if not m3u8_doc:
2088 if not manifest_url:
2089 return []
2090 m3u8_doc = self._download_webpage(
2091 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2092 note=False, errnote='Failed to download m3u8 playlist information')
2093 if m3u8_doc is False:
2094 return []
2095 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 2096
60755938 2097 else:
2098 def _extract_m3u8_playlist_indices(*args, **kwargs):
2099 return [None]
310c2ed2 2100
cb252080
S
2101 # References:
2102 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2103 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2104 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2105
2106 # We should try extracting formats only from master playlists [1, 4.3.4],
2107 # i.e. playlists that describe available qualities. On the other hand
2108 # media playlists [1, 4.3.3] should be returned as is since they contain
2109 # just the media without qualities renditions.
9cdffeeb 2110 # Fortunately, master playlist can be easily distinguished from media
cb252080 2111 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
a0566bbf 2112 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2113 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2114 # media playlist and MUST NOT appear in master playlist thus we can
2115 # clearly detect media playlist with this criterion.
2116
9cdffeeb 2117 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2118 formats = [{
34921b43 2119 'format_id': join_nonempty(m3u8_id, idx),
60755938 2120 'format_index': idx,
2121 'url': m3u8_url,
2122 'ext': ext,
2123 'protocol': entry_protocol,
2124 'preference': preference,
2125 'quality': quality,
88acdbc2 2126 'has_drm': has_drm,
60755938 2127 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2128
a0c3b2d5 2129 return formats, subtitles
cb252080
S
2130
2131 groups = {}
2132 last_stream_inf = {}
2133
2134 def extract_media(x_media_line):
2135 media = parse_m3u8_attributes(x_media_line)
2136 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2137 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2138 if not (media_type and group_id and name):
2139 return
2140 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2141 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2142 if media_type == 'SUBTITLES':
3907333c 2143 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2144 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2145 # However, lack of URI has been spotted in the wild.
2146 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2147 if not media.get('URI'):
2148 return
a0c3b2d5
F
2149 url = format_url(media['URI'])
2150 sub_info = {
2151 'url': url,
2152 'ext': determine_ext(url),
2153 }
4a2f19ab
F
2154 if sub_info['ext'] == 'm3u8':
2155 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2156 # files may contain is WebVTT:
2157 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2158 sub_info['ext'] = 'vtt'
2159 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2160 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2161 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2162 if media_type not in ('VIDEO', 'AUDIO'):
2163 return
2164 media_url = media.get('URI')
2165 if media_url:
310c2ed2 2166 manifest_url = format_url(media_url)
60755938 2167 formats.extend({
34921b43 2168 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
60755938 2169 'format_note': name,
2170 'format_index': idx,
2171 'url': manifest_url,
2172 'manifest_url': m3u8_url,
2173 'language': media.get('LANGUAGE'),
2174 'ext': ext,
2175 'protocol': entry_protocol,
2176 'preference': preference,
2177 'quality': quality,
2178 'vcodec': 'none' if media_type == 'AUDIO' else None,
2179 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2180
2181 def build_stream_name():
2182 # Despite specification does not mention NAME attribute for
3019cb0c
S
2183 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2184 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2185 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2186 stream_name = last_stream_inf.get('NAME')
2187 if stream_name:
2188 return stream_name
2189 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2190 # from corresponding rendition group
2191 stream_group_id = last_stream_inf.get('VIDEO')
2192 if not stream_group_id:
2193 return
2194 stream_group = groups.get(stream_group_id)
2195 if not stream_group:
2196 return stream_group_id
2197 rendition = stream_group[0]
2198 return rendition.get('NAME') or stream_group_id
2199
379306ef 2200 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2201 # chance to detect video only formats when EXT-X-STREAM-INF tags
2202 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2203 for line in m3u8_doc.splitlines():
2204 if line.startswith('#EXT-X-MEDIA:'):
2205 extract_media(line)
2206
704df56d
PH
2207 for line in m3u8_doc.splitlines():
2208 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2209 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2210 elif line.startswith('#') or not line.strip():
2211 continue
2212 else:
9c99bef7 2213 tbr = float_or_none(
3089bc74
S
2214 last_stream_inf.get('AVERAGE-BANDWIDTH')
2215 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2216 manifest_url = format_url(line.strip())
5ef62fc4 2217
60755938 2218 for idx in _extract_m3u8_playlist_indices(manifest_url):
2219 format_id = [m3u8_id, None, idx]
310c2ed2 2220 # Bandwidth of live streams may differ over time thus making
2221 # format_id unpredictable. So it's better to keep provided
2222 # format_id intact.
2223 if not live:
60755938 2224 stream_name = build_stream_name()
34921b43 2225 format_id[1] = stream_name or '%d' % (tbr or len(formats))
310c2ed2 2226 f = {
34921b43 2227 'format_id': join_nonempty(*format_id),
60755938 2228 'format_index': idx,
310c2ed2 2229 'url': manifest_url,
2230 'manifest_url': m3u8_url,
2231 'tbr': tbr,
2232 'ext': ext,
2233 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2234 'protocol': entry_protocol,
2235 'preference': preference,
2236 'quality': quality,
2237 }
2238 resolution = last_stream_inf.get('RESOLUTION')
2239 if resolution:
2240 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2241 if mobj:
2242 f['width'] = int(mobj.group('width'))
2243 f['height'] = int(mobj.group('height'))
2244 # Unified Streaming Platform
2245 mobj = re.search(
2246 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2247 if mobj:
2248 abr, vbr = mobj.groups()
2249 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2250 f.update({
2251 'vbr': vbr,
2252 'abr': abr,
2253 })
2254 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2255 f.update(codecs)
2256 audio_group_id = last_stream_inf.get('AUDIO')
2257 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2258 # references a rendition group MUST have a CODECS attribute.
2259 # However, this is not always respected, for example, [2]
2260 # contains EXT-X-STREAM-INF tag which references AUDIO
2261 # rendition group but does not have CODECS and despite
2262 # referencing an audio group it represents a complete
2263 # (with audio and video) format. So, for such cases we will
2264 # ignore references to rendition groups and treat them
2265 # as complete formats.
2266 if audio_group_id and codecs and f.get('vcodec') != 'none':
2267 audio_group = groups.get(audio_group_id)
2268 if audio_group and audio_group[0].get('URI'):
2269 # TODO: update acodec for audio only formats with
2270 # the same GROUP-ID
2271 f['acodec'] = 'none'
fc21af50 2272 if not f.get('ext'):
2273 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2274 formats.append(f)
2275
2276 # for DailyMotion
2277 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2278 if progressive_uri:
2279 http_f = f.copy()
2280 del http_f['manifest_url']
2281 http_f.update({
2282 'format_id': f['format_id'].replace('hls-', 'http-'),
2283 'protocol': 'http',
2284 'url': progressive_uri,
2285 })
2286 formats.append(http_f)
5ef62fc4 2287
cb252080 2288 last_stream_inf = {}
a0c3b2d5 2289 return formats, subtitles
704df56d 2290
3cf4b91d
C
2291 def _extract_m3u8_vod_duration(
2292 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2293
2294 m3u8_vod = self._download_webpage(
2295 m3u8_vod_url, video_id,
2296 note='Downloading m3u8 VOD manifest' if note is None else note,
2297 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2298 fatal=False, data=data, headers=headers, query=query)
2299
2300 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2301
2302 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2303 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2304 return None
2305
2306 return int(sum(
2307 float(line[len('#EXTINF:'):].split(',')[0])
2308 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2309
a107193e
S
2310 @staticmethod
2311 def _xpath_ns(path, namespace=None):
2312 if not namespace:
2313 return path
2314 out = []
2315 for c in path.split('/'):
2316 if not c or c == '.':
2317 out.append(c)
2318 else:
2319 out.append('{%s}%s' % (namespace, c))
2320 return '/'.join(out)
2321
da1c94ee 2322 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
09f572fb 2323 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2324
995029a1
PH
2325 if smil is False:
2326 assert not fatal
2327 return []
e89a2aab 2328
17712eeb 2329 namespace = self._parse_smil_namespace(smil)
a107193e 2330
da1c94ee 2331 fmts = self._parse_smil_formats(
a107193e 2332 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2333 subs = self._parse_smil_subtitles(
2334 smil, namespace=namespace)
2335
2336 return fmts, subs
2337
2338 def _extract_smil_formats(self, *args, **kwargs):
2339 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2340 if subs:
b5ae35ee 2341 self._report_ignoring_subs('SMIL')
da1c94ee 2342 return fmts
a107193e
S
2343
2344 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2345 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2346 if smil is False:
2347 return {}
2348 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2349
09f572fb 2350 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2351 return self._download_xml(
2352 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2353 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2354
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (id, title, description, upload_date, thumbnails,
        formats, subtitles) from an already-parsed SMIL document."""
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        # NOTE(review): the video_id argument is discarded here and replaced
        # by the basename of the SMIL URL — confirm this is intentional for
        # all callers.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # Scan <head><meta> pairs; only the FIRST occurrence of each of
        # title / description (or abstract) / date is kept.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        # Collect every <image> element that carries a src as a thumbnail.
        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
2394
17712eeb
S
2395 def _parse_smil_namespace(self, smil):
2396 return self._search_regex(
2397 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2398
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from the <video>/<audio>/<imagestream> elements of a
        parsed SMIL document, dispatching on protocol/extension (RTMP, HLS,
        HDS/f4m, DASH/mpd, ISM, plain HTTP, storyboard image streams)."""
        # Resolve the base URL for relative srcs: a head/meta base/httpBase
        # attribute wins over the SMIL URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
        imgs_count = 0

        # Deduplicate by src across both media and imagestream passes.
        srcs = set()
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                # Caller-provided hook to rewrite streamer/play_path pairs
                # (used by extractors with non-standard RTMP URL layouts).
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-variant playlist inherits this medium's metadata.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default Adobe HDS query parameters.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        # Storyboard/thumbnail tracks: no audio or video codec.
        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            imgs_count += 1
            formats.append({
                'format_id': 'imagestream-%d' % (imgs_count),
                'url': src,
                'ext': mimetype2ext(medium.get('type')),
                'acodec': 'none',
                'vcodec': 'none',
                'width': int_or_none(medium.get('width')),
                'height': int_or_none(medium.get('height')),
                'format_note': 'SMIL storyboards',
            })

        return formats
2512
ce00af87 2513 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2514 urls = []
a107193e
S
2515 subtitles = {}
2516 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2517 src = textstream.get('src')
d413095f 2518 if not src or src in urls:
a107193e 2519 continue
d413095f 2520 urls.append(src)
df634be2 2521 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2522 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2523 subtitles.setdefault(lang, []).append({
2524 'url': src,
2525 'ext': ext,
2526 })
2527 return subtitles
63757032 2528
47a5cb77 2529 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2530 xspf = self._download_xml(
47a5cb77 2531 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2532 'Unable to download xspf manifest', fatal=fatal)
2533 if xspf is False:
2534 return []
47a5cb77
S
2535 return self._parse_xspf(
2536 xspf, playlist_id, xspf_url=xspf_url,
2537 xspf_base_url=base_url(xspf_url))
8d6765cf 2538
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Parse an XSPF playlist document into a list of entry info dicts.

        Each <track> becomes one entry; its <location> children become
        formats (resolved against xspf_base_url), with StreamOne 's1:*'
        attributes supplying label/width/height when present.
        """
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; convert to seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })
            self._sort_formats(formats)

            # NOTE(review): every entry reuses playlist_id as its 'id' —
            # multi-track playlists thus get duplicate ids; confirm intended.
            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries
2579
171e59ed
F
2580 def _extract_mpd_formats(self, *args, **kwargs):
2581 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2582 if subs:
b5ae35ee 2583 self._report_ignoring_subs('DASH')
171e59ed
F
2584 return fmts
2585
2586 def _extract_mpd_formats_and_subtitles(
2587 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2588 fatal=True, data=None, headers={}, query={}):
47a5cb77 2589 res = self._download_xml_handle(
1bac3455 2590 mpd_url, video_id,
37a3bb66 2591 note='Downloading MPD manifest' if note is None else note,
2592 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2593 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2594 if res is False:
171e59ed 2595 return [], {}
47a5cb77 2596 mpd_doc, urlh = res
c25720ef 2597 if mpd_doc is None:
171e59ed 2598 return [], {}
02dc0a36 2599 mpd_base_url = base_url(urlh.geturl())
1bac3455 2600
171e59ed 2601 return self._parse_mpd_formats_and_subtitles(
545cc85d 2602 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2603
171e59ed
F
2604 def _parse_mpd_formats(self, *args, **kwargs):
2605 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2606 if subs:
b5ae35ee 2607 self._report_ignoring_subs('DASH')
171e59ed
F
2608 return fmts
2609
    def _parse_mpd_formats_and_subtitles(
            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

        Returns a (formats, subtitles) pair. Live ('dynamic') manifests are
        skipped entirely when the 'dynamic_mpd' param is disabled.
        """
        if not self.get_param('dynamic_mpd', True):
            if mpd_doc.get('type') == 'dynamic':
                return [], {}

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            # Qualify an XPath with the manifest's namespace.
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            # Presence of a ContentProtection child marks the element DRM'd.
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Merge SegmentList/SegmentTemplate info of `element` on top of a
            # copy of its parent's info (Period -> AdaptationSet ->
            # Representation inherit segment attributes downwards).
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            # @r repeats the segment r extra times.
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats, subtitles = [], {}
        # Counts how many streams share a URL, to disambiguate them.
        stream_numbers = collections.defaultdict(int)
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                    codecs = representation_attrib.get('codecs', '')
                    if content_type not in ('video', 'audio', 'text'):
                        if mime_type == 'image/jpeg':
                            # DASH thumbnail/storyboard track.
                            content_type = mime_type
                        elif codecs.split('.')[0] == 'stpp':
                            # stpp = TTML-in-MP4 subtitles.
                            content_type = 'text'
                        elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                            content_type = 'text'
                        else:
                            self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                            continue

                    # Resolve nested BaseURL elements from the innermost
                    # (Representation) outwards until an absolute URL forms.
                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if base_url_e is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and base_url.startswith('/'):
                        base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
                    elif mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    # YouTube-specific extension carrying the file size.
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                    if representation_id is not None:
                        format_id = representation_id
                    else:
                        format_id = content_type
                    if mpd_id:
                        format_id = mpd_id + '-' + format_id
                    if content_type in ('video', 'audio'):
                        f = {
                            'format_id': format_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # Drop ISO 639 special codes that mean "no real language".
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        }
                        f.update(parse_codecs(codecs))
                    elif content_type == 'text':
                        f = {
                            'ext': mimetype2ext(mime_type),
                            'manifest_url': mpd_url,
                            'filesize': filesize,
                        }
                    elif content_type == 'image/jpeg':
                        # See test case in VikiIE
                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                        f = {
                            'format_id': format_id,
                            'ext': 'mhtml',
                            'manifest_url': mpd_url,
                            'format_note': 'DASH storyboards (jpeg)',
                            'acodec': 'none',
                            'vcodec': 'none',
                        }
                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                        f['has_drm'] = True
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        # Convert a DASH $...$ template into a Python %-format
                        # string restricted to the given identifiers.
                        tmpl = representation_ms_info[template_name]
                        # First of, % characters outside $...$ templates
                        # must be escaped by doubling for proper processing
                        # by % operator string formatting used further (see
                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
                        t = ''
                        in_template = False
                        for c in tmpl:
                            t += c
                            if c == '$':
                                in_template = not in_template
                            elif c == '%' and not in_template:
                                t += c
                        # Next, $...$ templates are translated to their
                        # %(...) counterparts to be used with % operator
                        if representation_id is not None:
                            t = t.replace('$RepresentationID$', representation_id)
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        # NOTE(review): this replace() is a no-op — the result
                        # is discarded. The '$$' -> '$' unescaping ([1,
                        # 5.3.9.4.4]) likely intended `t = t.replace('$$', '$')`.
                        t.replace('$$', '$')
                        return t

                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    def location_key(location):
                        # Fragment dicts use 'url' for absolute and 'path' for
                        # relative locations.
                        return 'url' if re.match(r'^https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                        media_location_key = location_key(media_template)

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                            representation_ms_info['fragments'] = [{
                                media_location_key: media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for num, s in enumerate(representation_ms_info['s']):
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                # @r extra repetitions of the same duration.
                                for r in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template
                        # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for r in range(s.get('r', 0) + 1):
                                segment_uri = representation_ms_info['segment_urls'][segment_index]
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
                        # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                        # https://github.com/ytdl-org/youtube-dl/pull/14844
                        fragments = []
                        segment_duration = float_or_none(
                            representation_ms_info['segment_duration'],
                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                        for segment_url in representation_ms_info['segment_urls']:
                            fragment = {
                                location_key(segment_url): segment_url,
                            }
                            if segment_duration:
                                fragment['duration'] = segment_duration
                            fragments.append(fragment)
                        representation_ms_info['fragments'] = fragments
                    # If there is a fragments key available then we correctly recognized fragmented media.
                    # Otherwise we will assume unfragmented media with direct access. Technically, such
                    # assumption is not necessarily correct since we may simply have no support for
                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
                            'url': mpd_url or base_url,
                            'fragment_base_url': base_url,
                            'fragments': [],
                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({location_key(initialization_url): initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                    else:
                        # Assuming direct URL to unfragmented media.
                        f['url'] = base_url
                    if content_type in ('video', 'audio', 'image/jpeg'):
                        f['manifest_stream_number'] = stream_numbers[f['url']]
                        stream_numbers[f['url']] += 1
                        formats.append(f)
                    elif content_type == 'text':
                        subtitles.setdefault(lang or 'und', []).append(f)

        return formats, subtitles
17b598d3 2933
fd76a142
F
2934 def _extract_ism_formats(self, *args, **kwargs):
2935 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2936 if subs:
b5ae35ee 2937 self._report_ignoring_subs('ISM')
fd76a142
F
2938 return fmts
2939
2940 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2941 res = self._download_xml_handle(
b2758123 2942 ism_url, video_id,
37a3bb66 2943 note='Downloading ISM manifest' if note is None else note,
2944 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2945 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2946 if res is False:
fd76a142 2947 return [], {}
47a5cb77 2948 ism_doc, urlh = res
13b08034 2949 if ism_doc is None:
fd76a142 2950 return [], {}
b2758123 2951
fd76a142 2952 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 2953
    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # Live streams are not supported by this parser.
        if ism_doc.get('IsLive') == 'TRUE':
            return [], {}

        duration = int(ism_doc.attrib['Duration'])
        # Default ISM timescale is 10,000,000 ticks per second ([MS-SSTR]).
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        subtitles = {}
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio', 'text'):
                continue
            url_pattern = stream.attrib['Url']
            # Per-stream timescale overrides the manifest-level one.
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            stream_language = stream.get('Language', 'und')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 denotes AAC even when FourCC is absent.
                fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                # Substitute the {Bitrate} placeholder, then resolve relative to the manifest.
                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                fragments = []
                # fragment_ctx carries the running timestamp (in stream timescale ticks).
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    # 't' (start time) resets the clock; otherwise continue from previous.
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # No explicit duration: derive it from the next fragment's start
                        # (or the total duration for the last fragment).
                        # NOTE(review): this indexes the <c> *element* (its children), not
                        # the stream_fragments list — for childless <c> elements this always
                        # raises IndexError and falls back to `duration`. Looks like it was
                        # meant to be stream_fragments[stream_fragment_index + 1] — confirm.
                        try:
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            # Substitute the {start time} / {start_time} placeholder.
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                if stream_type == 'text':
                    subtitles.setdefault(stream_language, []).append({
                        'ext': 'ismt',
                        'protocol': 'ism',
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'fragments': fragments,
                        # Parameters consumed by the ISM downloader.
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                        }
                    })
                elif stream_type in ('video', 'audio'):
                    formats.append({
                        'format_id': join_nonempty(ism_id, stream_name, tbr),
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'ext': 'ismv' if stream_type == 'video' else 'isma',
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'asr': sampling_rate,
                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
                        'acodec': 'none' if stream_type == 'video' else fourcc,
                        'protocol': 'ism',
                        'fragments': fragments,
                        'has_drm': ism_doc.find('Protection') is not None,
                        # Parameters consumed by the ISM downloader.
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'width': width or 0,
                            'height': height or 0,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                            'sampling_rate': sampling_rate,
                            'channels': int_or_none(track.get('Channels', 2)),
                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                        },
                    })
        return formats, subtitles
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Scrape HTML5 <video>/<audio> tags (plus AMP and dl8 variants) from a
        webpage and return a list of entry dicts with formats and subtitles."""
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Extract ext and codec info from a MIME type like 'video/mp4; codecs="..."'.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Turn one media URL into (is_plain_url, formats); manifest URLs
            # (m3u8/mpd) are expanded into their constituent formats.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first (they have no inner content).
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Nested <source> tags may carry additional renditions.
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to parsing "1080p"-style labels.
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> tags carry subtitles/captions.
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Some servers require a Referer matching the embedding page.
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
f6a1d69a
F
3187 def _extract_akamai_formats(self, *args, **kwargs):
3188 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3189 if subs:
b5ae35ee 3190 self._report_ignoring_subs('akamai')
f6a1d69a
F
3191 return fmts
3192
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        """Extract HDS, HLS and (when possible) progressive HTTP formats from an
        Akamai CDN manifest URL. `hosts` may map 'hds'/'hls'/'http' to
        alternative host names for the respective protocol."""
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        # Derive the HDS (f4m) manifest URL from the HLS one.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # Segment URLs must carry the same hdcore parameter.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        # Derive the HLS (m3u8) manifest URL from the HDS one.
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        # Progressive HTTP variants can be synthesized from the HLS URL layout,
        # but only for unsigned URLs.
        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # Only proceed when HLS variants line up 1:1 with the quality list
            # (an extra audio-only rendition is tolerated).
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        i += 1

        return formats, subtitles
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Extract formats from a Wowza streaming-engine URL by probing the
        standard per-protocol manifest endpoints (HLS, HDS, DASH, SMIL/RTMP/RTSP).
        Protocols listed in `skip_protocols` are not probed."""
        query = compat_urlparse.urlparse(url).query
        # Strip any protocol-specific manifest suffix to get the stream base URL.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Keep the secure flag ('s') from the original scheme, if any.
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            # Re-attach the original query string to each probed manifest URL.
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            # SMIL-style URLs: derive RTSP variants from the RTMP formats.
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            # Non-SMIL URLs: offer plain rtmp/rtsp endpoints.
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
c73e330e 3304 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3305 mobj = re.search(
ac9c69ac 3306 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3307 webpage)
3308 if mobj:
c73e330e
RU
3309 try:
3310 jwplayer_data = self._parse_json(mobj.group('options'),
3311 video_id=video_id,
3312 transform_source=transform_source)
3313 except ExtractorError:
3314 pass
3315 else:
3316 if isinstance(jwplayer_data, dict):
3317 return jwplayer_data
a4a554a7
YCH
3318
3319 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3320 jwplayer_data = self._find_jwplayer_data(
3321 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3322 return self._parse_jwplayer_data(
3323 jwplayer_data, video_id, *args, **kwargs)
3324
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer setup dict into one entry (single item) or a
        playlist result (multiple items)."""
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube URL is delegated to the YouTube extractor.
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a jwplayer 'sources' list into a list of format dicts,
        deduplicating by source URL and expanding manifest URLs."""
        urls = []  # already-seen source URLs, for deduplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split "rtmp://host/app/mp4:path" into base URL and play path.
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
f4b1c7ad 3457 def _live_title(self, name):
39ca3b5c 3458 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3459 return name
f4b1c7ad 3460
b14f3a4c
PH
3461 def _int(self, v, name, fatal=False, **kwargs):
3462 res = int_or_none(v, **kwargs)
3463 if 'get_attr' in kwargs:
3464 print(getattr(v, kwargs['get_attr']))
3465 if res is None:
3466 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3467 if fatal:
3468 raise ExtractorError(msg)
3469 else:
6a39ee13 3470 self.report_warning(msg)
b14f3a4c
PH
3471 return res
3472
3473 def _float(self, v, name, fatal=False, **kwargs):
3474 res = float_or_none(v, **kwargs)
3475 if res is None:
3476 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3477 if fatal:
3478 raise ExtractorError(msg)
3479 else:
6a39ee13 3480 self.report_warning(msg)
b14f3a4c
PH
3481 return res
3482
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        """Store a cookie in the downloader's cookiejar."""
        # Positional arguments follow the http.cookiejar.Cookie constructor:
        # version, name, value, port, port_specified, domain, domain_specified,
        # domain_initial_dot, path, path_specified, secure, expires, discard,
        # comment, comment_url, rest.
        cookie = compat_cookiejar_Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self._downloader.cookiejar.set_cookie(cookie)
799207e8 3491 def _get_cookies(self, url):
f7ad7160 3492 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
5c2266df 3493 req = sanitized_Request(url)
799207e8 3494 self._downloader.cookiejar.add_cookie_header(req)
f7ad7160 3495 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
799207e8 3496
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            if sys.version_info[0] >= 3:
                # py3 decodes header values as latin-1; round-trip through
                # iso-8859-1 bytes so they can be re-decoded as UTF-8 below.
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # Find the first occurrence of the cookie and its domain.
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                # Overwrite whatever the cookiejar stored with the first value.
                self._set_cookie(domain, cookie, value)
                break
05900629
PH
3524 def get_testcases(self, include_onlymatching=False):
3525 t = getattr(self, '_TEST', None)
3526 if t:
3527 assert not hasattr(self, '_TESTS'), \
3528 '%s has _TEST and _TESTS' % type(self).__name__
3529 tests = [t]
3530 else:
3531 tests = getattr(self, '_TESTS', [])
3532 for t in tests:
3533 if not include_onlymatching and t.get('only_matching', False):
3534 continue
3535 t['name'] = type(self).__name__[:-len('IE')]
3536 yield t
3537
3538 def is_suitable(self, age_limit):
3539 """ Test whether the extractor is generally suitable for the given
3540 age limit (i.e. pornographic sites are not, all others usually are) """
3541
3542 any_restricted = False
3543 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3544 if tc.get('playlist', []):
05900629
PH
3545 tc = tc['playlist'][0]
3546 is_restricted = age_restricted(
3547 tc.get('info_dict', {}).get('age_limit'), age_limit)
3548 if not is_restricted:
3549 return True
3550 any_restricted = any_restricted or is_restricted
3551 return not any_restricted
3552
a504ced0 3553 def extract_subtitles(self, *args, **kwargs):
a06916d9 3554 if (self.get_param('writesubtitles', False)
3555 or self.get_param('listsubtitles')):
9868ea49
JMF
3556 return self._get_subtitles(*args, **kwargs)
3557 return {}
a504ced0
JMF
3558
    def _get_subtitles(self, *args, **kwargs):
        # Override in subclasses that support subtitle extraction.
        raise NotImplementedError('This method must be implemented by subclasses')
    def extract_comments(self, *args, **kwargs):
        """Return a zero-argument callable that lazily extracts comments, or
        None when the user did not request comments.

        The callable drains the _get_comments generator and returns a dict with
        'comments' and 'comment_count'; comment_count is None when extraction
        was interrupted before completion."""
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            # Assume interrupted unless the generator runs to completion.
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                # Keep whatever was collected so far.
                self.to_screen('Interrupted by user')
            except Exception as e:
                # Best-effort: only swallow errors when ignoreerrors is fully on.
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                'comment_count': None if interrupted else comment_count
            }
        return extractor
    def _get_comments(self, *args, **kwargs):
        # Override in subclasses that support comment extraction;
        # must return a generator of comment dicts.
        raise NotImplementedError('This method must be implemented by subclasses')
912e0b7e
YCH
3592 @staticmethod
3593 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3594 """ Merge subtitle items for one language. Items with duplicated URLs
3595 will be dropped. """
3596 list1_urls = set([item['url'] for item in subtitle_list1])
3597 ret = list(subtitle_list1)
3598 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3599 return ret
3600
3601 @classmethod
46890374 3602 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3603 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3604 if target is None:
3605 target = {}
3606 for d in dicts:
3607 for lang, subs in d.items():
3608 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3609 return target
912e0b7e 3610
360e1ca5 3611 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3612 if (self.get_param('writeautomaticsub', False)
3613 or self.get_param('listsubtitles')):
9868ea49
JMF
3614 return self._get_automatic_captions(*args, **kwargs)
3615 return {}
360e1ca5
JMF
3616
    def _get_automatic_captions(self, *args, **kwargs):
        # Override in subclasses that support automatic captions.
        raise NotImplementedError('This method must be implemented by subclasses')
d77ab8e2 3620 def mark_watched(self, *args, **kwargs):
1813a6cc 3621 if not self.get_param('mark_watched', False):
3622 return
3623 if (self._get_login_info()[0] is not None
3624 or self.get_param('cookiefile')
3625 or self.get_param('cookiesfrombrowser')):
d77ab8e2
S
3626 self._mark_watched(*args, **kwargs)
3627
    def _mark_watched(self, *args, **kwargs):
        # Override in subclasses that support marking media as watched.
        raise NotImplementedError('This method must be implemented by subclasses')
38cce791
YCH
3631 def geo_verification_headers(self):
3632 headers = {}
a06916d9 3633 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3634 if geo_verification_proxy:
3635 headers['Ytdl-request-proxy'] = geo_verification_proxy
3636 return headers
3637
98763ee3
YCH
3638 def _generic_id(self, url):
3639 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3640
3641 def _generic_title(self, url):
3642 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3643
c224251a 3644 @staticmethod
b0089e89 3645 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3646 all_known = all(map(
3647 lambda x: x is not None,
3648 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3649 return (
3650 'private' if is_private
3651 else 'premium_only' if needs_premium
3652 else 'subscriber_only' if needs_subscription
3653 else 'needs_auth' if needs_auth
3654 else 'unlisted' if is_unlisted
3655 else 'public' if all_known
3656 else None)
3657
4bb6b02f 3658 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3659 '''
3660 @returns A list of values for the extractor argument given by "key"
3661 or "default" if no such key is present
3662 @param default The default value to return when the key is not present (default: [])
3663 @param casesense When false, the values are converted to lower case
3664 '''
3665 val = traverse_obj(
5d3a0e79 3666 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
4bb6b02f 3667 if val is None:
3668 return [] if default is NO_DEFAULT else default
3669 return list(val) if casesense else [x.lower() for x in val]
5d3a0e79 3670
8dbe9899 3671
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Upper bound on results a subclass can return; 'all' maps to this.
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key><n>:<query>" and "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            # Bare "<key>:" means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        return self.playlist_result(
            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
            query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        # Public accessor for the search URL prefix defined by the subclass.
        return self._SEARCH_KEY