# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import itertools
import json
import netrc
import os
import random
import re
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies_SimpleCookie,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_expanduser,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    extract_attributes,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    network_exceptions,
    NO_DEFAULT,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality. An illustrative entry is
                    shown after the field list below.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * dynamic_range The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

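                    An illustrative (hypothetical) formats entry for a plain
                    progressive MP4 download; every value below is invented
                    purely for the sake of the example:

                        {
                            'format_id': '22',
                            'url': 'https://example.com/video_720p.mp4',
                            'ext': 'mp4',
                            'width': 1280,
                            'height': 720,
                            'vcodec': 'avc1.64001F',
                            'acodec': 'mp4a.40.2',
                            'tbr': 2000,
                            'protocol': 'https',
                            'http_headers': {'Referer': 'https://example.com/'},
                        }
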
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_timestamp: UNIX timestamp of the moment the video was released.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users; the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability (e.g. 'whitelist')
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

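    A minimal (hypothetical) info dict returned by _real_extract() might
    therefore look like the following; every value is invented purely for
    illustration:

        {
            'id': '4234987',
            'display_id': 'dancing-naked-mole-rats',
            'title': 'Dancing naked mole rats',
            'description': 'Full video description.',
            'uploader': 'Some Uploader',
            'upload_date': '20211231',
            'duration': 123.0,
            'formats': [...],  # as described above
        }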

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


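    For instance, an extractor that only knows the embedded media's URL and a
    better title could return a (hypothetical) "url_transparent" result such
    as the following; the service name and URL are fictitious:

        {
            '_type': 'url_transparent',
            'url': 'https://videoservice.example.com/embed/abc123',
            'ie_key': 'ExampleService',
            'title': 'Title taken from the embedding page',
        }
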
    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
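
    A bare-bones subclass, shown here only as a sketch (the site, URL pattern
    and extracted fields are all fictitious), would look roughly like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www[.])?example[.]com/watch/(?P<id>[0-9]+)'
            _GEO_COUNTRIES = ['US']

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }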
432 """
433
434 _ready = False
435 _downloader = None
773f291d 436 _x_forwarded_for_ip = None
4248dad9
S
437 _GEO_BYPASS = True
438 _GEO_COUNTRIES = None
5f95927a 439 _GEO_IP_BLOCKS = None
d6983cb4
PH
440 _WORKING = True
441
9d5d4d64 442 _LOGIN_HINTS = {
443 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
444 'cookies': (
a0c716bb 445 'Use --cookies-from-browser or --cookies for the authentication. '
446 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
9d5d4d64 447 'password': 'Use --username and --password or --netrc to provide account credentials',
448 }
449
d6983cb4
PH
450 def __init__(self, downloader=None):
451 """Constructor. Receives an optional downloader."""
452 self._ready = False
773f291d 453 self._x_forwarded_for_ip = None
28f436ba 454 self._printed_messages = set()
d6983cb4
PH
455 self.set_downloader(downloader)
456
457 @classmethod
5ad28e7f 458 def _match_valid_url(cls, url):
79cb2577
PH
459 # This does not use has/getattr intentionally - we want to know whether
460 # we have cached the regexp for *this* class, whereas getattr would also
461 # match the superclass
462 if '_VALID_URL_RE' not in cls.__dict__:
463 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
5ad28e7f 464 return cls._VALID_URL_RE.match(url)
465
466 @classmethod
467 def suitable(cls, url):
468 """Receives a URL and returns True if suitable for this IE."""
3fb4e21b 469 # This function must import everything it needs (except other extractors),
470 # so that lazy_extractors works correctly
5ad28e7f 471 return cls._match_valid_url(url) is not None
d6983cb4 472
ed9266db
PH
473 @classmethod
474 def _match_id(cls, url):
5ad28e7f 475 return cls._match_valid_url(url).group('id')
ed9266db 476
1151c407 477 @classmethod
478 def get_temp_id(cls, url):
479 try:
480 return cls._match_id(url)
481 except (IndexError, AttributeError):
482 return None
483
d6983cb4
PH
484 @classmethod
485 def working(cls):
486 """Getter method for _WORKING."""
487 return cls._WORKING
488
489 def initialize(self):
490 """Initializes an instance (authentication, etc)."""
28f436ba 491 self._printed_messages = set()
5f95927a
S
492 self._initialize_geo_bypass({
493 'countries': self._GEO_COUNTRIES,
494 'ip_blocks': self._GEO_IP_BLOCKS,
495 })
4248dad9
S
496 if not self._ready:
497 self._real_initialize()
498 self._ready = True
499
5f95927a 500 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
501 """
502 Initialize geo restriction bypass mechanism.
503
504 This method is used to initialize geo bypass mechanism based on faking
505 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 506 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
507 IP will be passed as X-Forwarded-For HTTP header in all subsequent
508 HTTP requests.
e39b5d4a
S
509
510 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
511 during the instance initialization with _GEO_COUNTRIES and
512 _GEO_IP_BLOCKS.
e39b5d4a 513
5f95927a 514 You may also manually call it from extractor's code if geo bypass
e39b5d4a 515 information is not available beforehand (e.g. obtained during
5f95927a
S
516 extraction) or due to some other reason. In this case you should pass
517 this information in geo bypass context passed as first argument. It may
518 contain following fields:
519
520 countries: List of geo unrestricted countries (similar
521 to _GEO_COUNTRIES)
522 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
523 (similar to _GEO_IP_BLOCKS)
524
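        For example, an extractor that only learns the applicable countries
        during extraction could (hypothetically) call it like this; the
        country list is purely illustrative:

            self._initialize_geo_bypass({
                'countries': ['DE', 'AT', 'CH'],
            })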
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self._downloader.write_debug(
                    'Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError as e:
            video_id = e.video_id or self.get_temp_id(url)
            raise ExtractorError(
                e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except network_exceptions as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            trim_length = self.get_param('trim_file_name') or 240
            if len(basen) > trim_length:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:trim_length - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
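
        For instance, to also treat a 404 response as acceptable (the values
        here are purely illustrative):

            webpage = self._download_webpage(
                url, video_id, expected_status=404)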
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the XML as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
        return self._parse_json(
            data[data.find('{'):data.rfind('}') + 1],
            video_id, transform_source, fatal)

    def _download_socket_json_handle(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        webpage, urlh = res
        return self._parse_socket_response_as_json(
            webpage, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_socket_json(
            self, url_or_request, video_id, note='Polling socket',
            errnote='Unable to poll socket', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_socket_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, template='%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method='any'):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        if method is not None:
            msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and self.get_param('ignore_no_formats_error'):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        video_info.update(kwargs)
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
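
        For example (the pattern and field name are illustrative only):

            title = self._search_regex(
                r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', default=None)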
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._color_text(name, 'blue')

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

95b31e26 1327 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
4433bb02 1328 json_ld_list = list(re.finditer(JSON_LD_RE, html))
321b5e08 1329 default = kwargs.get('default', NO_DEFAULT)
321b5e08
S
1330 # JSON-LD may be malformed and thus `fatal` should be respected.
1331 # At the same time a `default` may be passed, which implies `fatal=False`
1332 # for _search_regex. Let's simulate the same behavior here as well.
dbf5416a 1333 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
4433bb02
S
1334 json_ld = []
1335 for mobj in json_ld_list:
1336 json_ld_item = self._parse_json(
1337 mobj.group('json_ld'), video_id, fatal=fatal)
1338 if not json_ld_item:
1339 continue
1340 if isinstance(json_ld_item, dict):
1341 json_ld.append(json_ld_item)
1342 elif isinstance(json_ld_item, (list, tuple)):
1343 json_ld.extend(json_ld_item)
1344 if json_ld:
1345 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1346 if json_ld:
1347 return json_ld
1348 if default is not NO_DEFAULT:
1349 return default
1350 elif fatal:
1351 raise RegexNotFoundError('Unable to extract JSON-LD')
1352 else:
6a39ee13 1353 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
4433bb02 1354 return {}
4ca2a3cf 1355
95b31e26 1356 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
4ca2a3cf
S
1357 if isinstance(json_ld, compat_str):
1358 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1359 if not json_ld:
1360 return {}
1361 info = {}
46933a15
S
1362 if not isinstance(json_ld, (list, tuple, dict)):
1363 return info
1364 if isinstance(json_ld, dict):
1365 json_ld = [json_ld]
bae14048 1366
e7e4a6e0
S
1367 INTERACTION_TYPE_MAP = {
1368 'CommentAction': 'comment',
1369 'AgreeAction': 'like',
1370 'DisagreeAction': 'dislike',
1371 'LikeAction': 'like',
1372 'DislikeAction': 'dislike',
1373 'ListenAction': 'view',
1374 'WatchAction': 'view',
1375 'ViewAction': 'view',
1376 }
1377
29f7c58a 1378 def extract_interaction_type(e):
1379 interaction_type = e.get('interactionType')
1380 if isinstance(interaction_type, dict):
1381 interaction_type = interaction_type.get('@type')
1382 return str_or_none(interaction_type)
1383
e7e4a6e0
S
1384 def extract_interaction_statistic(e):
1385 interaction_statistic = e.get('interactionStatistic')
29f7c58a 1386 if isinstance(interaction_statistic, dict):
1387 interaction_statistic = [interaction_statistic]
e7e4a6e0
S
1388 if not isinstance(interaction_statistic, list):
1389 return
1390 for is_e in interaction_statistic:
1391 if not isinstance(is_e, dict):
1392 continue
1393 if is_e.get('@type') != 'InteractionCounter':
1394 continue
29f7c58a 1395 interaction_type = extract_interaction_type(is_e)
1396 if not interaction_type:
e7e4a6e0 1397 continue
ce5b9040
S
1398 # For the interaction count some sites provide a string instead of
1399 # an integer (as per spec), possibly with non-digit characters (e.g. ","),
1400 # so extract the count with the more relaxed str_to_int
1401 interaction_count = str_to_int(is_e.get('userInteractionCount'))
e7e4a6e0
S
1402 if interaction_count is None:
1403 continue
1404 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1405 if not count_kind:
1406 continue
1407 count_key = '%s_count' % count_kind
1408 if info.get(count_key) is not None:
1409 continue
1410 info[count_key] = interaction_count
1411
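# Illustrative sketch (not part of this module): the kind of JSON-LD
# interactionStatistic entry that extract_interaction_statistic() above
# turns into a *_count field. The values are made up.
_demo_counter = {
    '@type': 'InteractionCounter',
    'interactionType': {'@type': 'http://schema.org/WatchAction'},
    'userInteractionCount': '1,234,567',  # commas are handled by str_to_int
}
# 'WatchAction' maps to 'view' via INTERACTION_TYPE_MAP, so the net effect
# is info['view_count'] = 1234567.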
bae14048
S
1412 def extract_video_object(e):
1413 assert e['@type'] == 'VideoObject'
f7ad7160 1414 author = e.get('author')
bae14048 1415 info.update({
bebef109 1416 'url': url_or_none(e.get('contentUrl')),
bae14048
S
1417 'title': unescapeHTML(e.get('name')),
1418 'description': unescapeHTML(e.get('description')),
bebef109 1419 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
bae14048
S
1420 'duration': parse_duration(e.get('duration')),
1421 'timestamp': unified_timestamp(e.get('uploadDate')),
f7ad7160 1422 # author can be an instance of the 'Organization' or 'Person' types.
1423 # Both types can have a 'name' property (inherited from the 'Thing' type). [1]
1424 # However, some websites use the 'Text' type instead.
1425 # 1. https://schema.org/VideoObject
1426 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
bae14048
S
1427 'filesize': float_or_none(e.get('contentSize')),
1428 'tbr': int_or_none(e.get('bitrate')),
1429 'width': int_or_none(e.get('width')),
1430 'height': int_or_none(e.get('height')),
33a81c2c 1431 'view_count': int_or_none(e.get('interactionCount')),
bae14048 1432 })
e7e4a6e0 1433 extract_interaction_statistic(e)
bae14048 1434
46933a15 1435 for e in json_ld:
4433bb02 1436 if '@context' in e:
46933a15
S
1437 item_type = e.get('@type')
1438 if expected_type is not None and expected_type != item_type:
4433bb02 1439 continue
c69701c6 1440 if item_type in ('TVEpisode', 'Episode'):
440863ad 1441 episode_name = unescapeHTML(e.get('name'))
46933a15 1442 info.update({
440863ad 1443 'episode': episode_name,
46933a15
S
1444 'episode_number': int_or_none(e.get('episodeNumber')),
1445 'description': unescapeHTML(e.get('description')),
1446 })
440863ad
S
1447 if not info.get('title') and episode_name:
1448 info['title'] = episode_name
46933a15 1449 part_of_season = e.get('partOfSeason')
c69701c6 1450 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
458fd30f
S
1451 info.update({
1452 'season': unescapeHTML(part_of_season.get('name')),
1453 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1454 })
d16b3c66 1455 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
c69701c6 1456 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1457 info['series'] = unescapeHTML(part_of_series.get('name'))
391256dc
S
1458 elif item_type == 'Movie':
1459 info.update({
1460 'title': unescapeHTML(e.get('name')),
1461 'description': unescapeHTML(e.get('description')),
1462 'duration': parse_duration(e.get('duration')),
1463 'timestamp': unified_timestamp(e.get('dateCreated')),
1464 })
3931b845 1465 elif item_type in ('Article', 'NewsArticle'):
46933a15
S
1466 info.update({
1467 'timestamp': parse_iso8601(e.get('datePublished')),
1468 'title': unescapeHTML(e.get('headline')),
1469 'description': unescapeHTML(e.get('articleBody')),
1470 })
1471 elif item_type == 'VideoObject':
bae14048 1472 extract_video_object(e)
4433bb02
S
1473 if expected_type is None:
1474 continue
1475 else:
1476 break
c69701c6
S
1477 video = e.get('video')
1478 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1479 extract_video_object(video)
4433bb02
S
1480 if expected_type is None:
1481 continue
1482 else:
1483 break
4ca2a3cf
S
1484 return dict((k, v) for k, v in info.items() if v is not None)
1485
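# Illustrative sketch (not part of this module): a minimal JSON-LD
# VideoObject and, roughly, what _json_ld() above derives from it.
# All values are made up.
_demo_json_ld = {
    '@context': 'https://schema.org',
    '@type': 'VideoObject',
    'name': 'Some title',
    'description': 'Some description',
    'uploadDate': '2021-06-01T12:00:00+00:00',
    'duration': 'PT1M30S',
    'author': {'@type': 'Person', 'name': 'Some uploader'},
}
# -> {'title': 'Some title', 'description': 'Some description',
#     'timestamp': 1622548800, 'duration': 90.0, 'uploader': 'Some uploader'}
# (timestamp via unified_timestamp, duration via parse_duration)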
27713812 1486 @staticmethod
f8da79f8 1487 def _hidden_inputs(html):
586f1cc5 1488 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1489 hidden_inputs = {}
c8498368
S
1490 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1491 attrs = extract_attributes(input)
1492 if not attrs:
201ea3ee 1493 continue
c8498368 1494 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1495 continue
c8498368
S
1496 name = attrs.get('name') or attrs.get('id')
1497 value = attrs.get('value')
1498 if name and value is not None:
1499 hidden_inputs[name] = value
201ea3ee 1500 return hidden_inputs
27713812 1501
cf61d96d
S
1502 def _form_hidden_inputs(self, form_id, html):
1503 form = self._search_regex(
73eb13df 1504 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1505 html, '%s form' % form_id, group='form')
1506 return self._hidden_inputs(form)
1507
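# Illustrative sketch (not part of this module): what _hidden_inputs() above
# extracts from a (made-up) login form. Extractors usually send the result
# back as POST data, e.g. via urlencode_postdata() from ..utils.
_demo_form = '''
    <form id="login">
        <input type="hidden" name="csrf_token" value="abc123">
        <input type="text" name="username">
        <input type="submit" name="do_login" value="Sign in">
    </form>'''
# _hidden_inputs(_demo_form) -> {'csrf_token': 'abc123', 'do_login': 'Sign in'}
# _form_hidden_inputs('login', page_html) does the same after first isolating
# the <form id="login"> ... </form> block with _search_regex.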
eb8a4433 1508 class FormatSort:
b050d210 1509 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
eb8a4433 1510
8326b00a 1511 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
176f1866 1512 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
53ed7066 1513 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases
198e3a04 1514 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
53ed7066 1515 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1516 'fps', 'fs_approx', 'source', 'format_id')
eb8a4433 1517
1518 settings = {
1519 'vcodec': {'type': 'ordered', 'regex': True,
155d2b48 1520 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
eb8a4433 1521 'acodec': {'type': 'ordered', 'regex': True,
1522 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
176f1866 1523 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1524 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
f137c99e 1525 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
e36d50c5 1526 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
eb8a4433 1527 'vext': {'type': 'ordered', 'field': 'video_ext',
91ebc640 1528 'order': ('mp4', 'webm', 'flv', '', 'none'),
eb8a4433 1529 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1530 'aext': {'type': 'ordered', 'field': 'audio_ext',
1531 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1532 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1533 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
f5510afe 1534 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
8326b00a 1535 'field': ('vcodec', 'acodec'),
1536 'function': lambda it: int(any(v != 'none' for v in it))},
f983b875 1537 'ie_pref': {'priority': True, 'type': 'extractor'},
63be1aab 1538 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1539 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
198e3a04 1540 'lang': {'convert': 'ignore', 'field': 'language_preference'},
6a04a74e 1541 'quality': {'convert': 'float_none', 'default': -1},
eb8a4433 1542 'filesize': {'convert': 'bytes'},
f137c99e 1543 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1544 'id': {'convert': 'string', 'field': 'format_id'},
eb8a4433 1545 'height': {'convert': 'float_none'},
1546 'width': {'convert': 'float_none'},
1547 'fps': {'convert': 'float_none'},
1548 'tbr': {'convert': 'float_none'},
1549 'vbr': {'convert': 'float_none'},
1550 'abr': {'convert': 'float_none'},
1551 'asr': {'convert': 'float_none'},
e4beae70 1552 'source': {'convert': 'ignore', 'field': 'source_preference'},
63be1aab 1553
eb8a4433 1554 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
63be1aab 1555 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1556 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1557 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
f5510afe 1558 'res': {'type': 'multiple', 'field': ('height', 'width'),
dbf5416a 1559 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
63be1aab 1560
1561 # Most of these exist only for compatibility reasons
1562 'dimension': {'type': 'alias', 'field': 'res'},
1563 'resolution': {'type': 'alias', 'field': 'res'},
1564 'extension': {'type': 'alias', 'field': 'ext'},
1565 'bitrate': {'type': 'alias', 'field': 'br'},
eb8a4433 1566 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1567 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1568 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1569 'framerate': {'type': 'alias', 'field': 'fps'},
63be1aab 1570 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1571 'protocol': {'type': 'alias', 'field': 'proto'},
1572 'source_preference': {'type': 'alias', 'field': 'source'},
1573 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1574 'filesize_estimate': {'type': 'alias', 'field': 'size'},
eb8a4433 1575 'samplerate': {'type': 'alias', 'field': 'asr'},
1576 'video_ext': {'type': 'alias', 'field': 'vext'},
1577 'audio_ext': {'type': 'alias', 'field': 'aext'},
1578 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1579 'audio_codec': {'type': 'alias', 'field': 'acodec'},
63be1aab 1580 'video': {'type': 'alias', 'field': 'hasvid'},
1581 'has_video': {'type': 'alias', 'field': 'hasvid'},
1582 'audio': {'type': 'alias', 'field': 'hasaud'},
1583 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1584 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1585 'preference': {'type': 'alias', 'field': 'ie_pref'},
1586 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1587 'format_id': {'type': 'alias', 'field': 'id'},
1588 }
eb8a4433 1589
1590 _order = []
1591
1592 def _get_field_setting(self, field, key):
1593 if field not in self.settings:
1594 self.settings[field] = {}
1595 propObj = self.settings[field]
1596 if key not in propObj:
1597 type = propObj.get('type')
1598 if key == 'field':
1599 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1600 elif key == 'convert':
1601 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
4bcc7bd1 1602 else:
f5510afe 1603 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
eb8a4433 1604 propObj[key] = default
1605 return propObj[key]
1606
1607 def _resolve_field_value(self, field, value, convertNone=False):
1608 if value is None:
1609 if not convertNone:
1610 return None
4bcc7bd1 1611 else:
eb8a4433 1612 value = value.lower()
1613 conversion = self._get_field_setting(field, 'convert')
1614 if conversion == 'ignore':
1615 return None
1616 if conversion == 'string':
1617 return value
1618 elif conversion == 'float_none':
1619 return float_or_none(value)
1620 elif conversion == 'bytes':
1621 return FileDownloader.parse_bytes(value)
1622 elif conversion == 'order':
da9be05e 1623 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
eb8a4433 1624 use_regex = self._get_field_setting(field, 'regex')
1625 list_length = len(order_list)
1626 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1627 if use_regex and value is not None:
da9be05e 1628 for i, regex in enumerate(order_list):
eb8a4433 1629 if regex and re.match(regex, value):
1630 return list_length - i
1631 return list_length - empty_pos # not in list
1632 else: # not regex or value = None
1633 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1634 else:
1635 if value.isnumeric():
1636 return float(value)
4bcc7bd1 1637 else:
eb8a4433 1638 self.settings[field]['convert'] = 'string'
1639 return value
1640
1641 def evaluate_params(self, params, sort_extractor):
1642 self._use_free_order = params.get('prefer_free_formats', False)
1643 self._sort_user = params.get('format_sort', [])
1644 self._sort_extractor = sort_extractor
1645
1646 def add_item(field, reverse, closest, limit_text):
1647 field = field.lower()
1648 if field in self._order:
1649 return
1650 self._order.append(field)
1651 limit = self._resolve_field_value(field, limit_text)
1652 data = {
1653 'reverse': reverse,
1654 'closest': False if limit is None else closest,
1655 'limit_text': limit_text,
1656 'limit': limit}
1657 if field in self.settings:
1658 self.settings[field].update(data)
1659 else:
1660 self.settings[field] = data
1661
1662 sort_list = (
1663 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1664 + (tuple() if params.get('format_sort_force', False)
1665 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1666 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1667
1668 for item in sort_list:
1669 match = re.match(self.regex, item)
1670 if match is None:
1671 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1672 field = match.group('field')
1673 if field is None:
1674 continue
1675 if self._get_field_setting(field, 'type') == 'alias':
1676 field = self._get_field_setting(field, 'field')
1677 reverse = match.group('reverse') is not None
b050d210 1678 closest = match.group('separator') == '~'
eb8a4433 1679 limit_text = match.group('limit')
1680
1681 has_limit = limit_text is not None
1682 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1683 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1684
1685 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
b5ae35ee 1686 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
eb8a4433 1687 limit_count = len(limits)
1688 for (i, f) in enumerate(fields):
1689 add_item(f, reverse, closest,
1690 limits[i] if i < limit_count
1691 else limits[0] if has_limit and not has_multiple_limits
1692 else None)
1693
0760b0a7 1694 def print_verbose_info(self, write_debug):
b31fdeed 1695 if self._sort_user:
0760b0a7 1696 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
eb8a4433 1697 if self._sort_extractor:
0760b0a7 1698 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1699 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
eb8a4433 1700 '+' if self._get_field_setting(field, 'reverse') else '', field,
1701 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1702 self._get_field_setting(field, 'limit_text'),
1703 self._get_field_setting(field, 'limit'))
1704 if self._get_field_setting(field, 'limit_text') is not None else '')
1705 for field in self._order if self._get_field_setting(field, 'visible')]))
1706
1707 def _calculate_field_preference_from_value(self, format, field, type, value):
1708 reverse = self._get_field_setting(field, 'reverse')
1709 closest = self._get_field_setting(field, 'closest')
1710 limit = self._get_field_setting(field, 'limit')
1711
1712 if type == 'extractor':
1713 maximum = self._get_field_setting(field, 'max')
1714 if value is None or (maximum is not None and value >= maximum):
f983b875 1715 value = -1
eb8a4433 1716 elif type == 'boolean':
1717 in_list = self._get_field_setting(field, 'in_list')
1718 not_in_list = self._get_field_setting(field, 'not_in_list')
1719 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1720 elif type == 'ordered':
1721 value = self._resolve_field_value(field, value, True)
1722
1723 # try to convert to number
6a04a74e 1724 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
eb8a4433 1725 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1726 if is_num:
1727 value = val_num
1728
1729 return ((-10, 0) if value is None
1730 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1731 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1732 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1733 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1734 else (-1, value, 0))
1735
1736 def _calculate_field_preference(self, format, field):
1737 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1738 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1739 if type == 'multiple':
1740 type = 'field' # Only 'field' is allowed in multiple for now
1741 actual_fields = self._get_field_setting(field, 'field')
1742
f5510afe 1743 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
eb8a4433 1744 else:
1745 value = get_value(field)
1746 return self._calculate_field_preference_from_value(format, field, type, value)
1747
1748 def calculate_preference(self, format):
1749 # Determine missing protocol
1750 if not format.get('protocol'):
1751 format['protocol'] = determine_protocol(format)
1752
1753 # Determine missing ext
1754 if not format.get('ext') and 'url' in format:
1755 format['ext'] = determine_ext(format['url'])
1756 if format.get('vcodec') == 'none':
8326b00a 1757 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
eb8a4433 1758 format['video_ext'] = 'none'
1759 else:
1760 format['video_ext'] = format['ext']
1761 format['audio_ext'] = 'none'
1762 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1763 # format['preference'] = -1000
1764
1765 # Determine missing bitrates
1766 if format.get('tbr') is None:
1767 if format.get('vbr') is not None and format.get('abr') is not None:
1768 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1769 else:
b5ae35ee 1770 if format.get('vcodec') != 'none' and format.get('vbr') is None:
eb8a4433 1771 format['vbr'] = format.get('tbr') - format.get('abr', 0)
b5ae35ee 1772 if format.get('acodec') != 'none' and format.get('abr') is None:
eb8a4433 1773 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1774
1775 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1776
1777 def _sort_formats(self, formats, field_preference=[]):
1778 if not formats:
88acdbc2 1779 return
eb8a4433 1780 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1781 format_sort.evaluate_params(self._downloader.params, field_preference)
a06916d9 1782 if self.get_param('verbose', False):
0760b0a7 1783 format_sort.print_verbose_info(self._downloader.write_debug)
eb8a4433 1784 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1785
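# Illustrative sketch (not part of this module): how extractors typically use
# the sorting machinery above, and what the sort-string syntax parsed by
# FormatSort.regex means. The field list below is an arbitrary example.
#
#     self._sort_formats(formats)                        # default order
#     self._sort_formats(formats, ('res', 'fps', 'br'))  # extractor override
#
# Each field may carry a modifier: 'res:1080' prefers the largest resolution
# not exceeding 1080, 'br~600' prefers the bitrate closest to 600, and a
# leading '+' (e.g. '+size') reverses the field so smaller values win.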
96a53167
S
1786 def _check_formats(self, formats, video_id):
1787 if formats:
1788 formats[:] = filter(
1789 lambda f: self._is_valid_url(
1790 f['url'], video_id,
1791 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1792 formats)
1793
f5bdb444
S
1794 @staticmethod
1795 def _remove_duplicate_formats(formats):
1796 format_urls = set()
1797 unique_formats = []
1798 for f in formats:
1799 if f['url'] not in format_urls:
1800 format_urls.add(f['url'])
1801 unique_formats.append(f)
1802 formats[:] = unique_formats
1803
45024183 1804 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1805 url = self._proto_relative_url(url, scheme='http:')
1806 # For now assume non HTTP(S) URLs always valid
1807 if not (url.startswith('http://') or url.startswith('https://')):
1808 return True
96a53167 1809 try:
45024183 1810 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1811 return True
8bdd16b4 1812 except ExtractorError as e:
25e911a9 1813 self.to_screen(
8bdd16b4 1814 '%s: %s URL is invalid, skipping: %s'
1815 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1816 return False
96a53167 1817
20991253 1818 def http_scheme(self):
1ede5b24 1819 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1820 return (
1821 'http:'
a06916d9 1822 if self.get_param('prefer_insecure', False)
20991253
PH
1823 else 'https:')
1824
57c7411f
PH
1825 def _proto_relative_url(self, url, scheme=None):
1826 if url is None:
1827 return url
1828 if url.startswith('//'):
1829 if scheme is None:
1830 scheme = self.http_scheme()
1831 return scheme + url
1832 else:
1833 return url
1834
4094b6e3
PH
1835 def _sleep(self, timeout, video_id, msg_template=None):
1836 if msg_template is None:
f1a9d64e 1837 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1838 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1839 self.to_screen(msg)
1840 time.sleep(timeout)
1841
f983b875 1842 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1843 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1844 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
f036a632
JMF
1845 manifest = self._download_xml(
1846 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1847 'Unable to download f4m manifest',
1848 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1849 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1850 transform_source=transform_source,
7360c06f 1851 fatal=fatal, data=data, headers=headers, query=query)
4de61310
S
1852
1853 if manifest is False:
8d29e47f 1854 return []
31bb8d3f 1855
0fdbb332 1856 return self._parse_f4m_formats(
f983b875 1857 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1858 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1859
f983b875 1860 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
0fdbb332 1861 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1862 fatal=True, m3u8_id=None):
ee0ba927 1863 if not isinstance(manifest, compat_etree_Element) and not fatal:
d9eb580a
S
1864 return []
1865
7a5c1cfe 1866 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
fb72ec58 1867 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1868 if akamai_pv is not None and ';' in akamai_pv.text:
1869 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1870 if playerVerificationChallenge.strip() != '':
1871 return []
1872
31bb8d3f 1873 formats = []
7a47d07c 1874 manifest_version = '1.0'
b2527359 1875 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1876 if not media_nodes:
7a47d07c 1877 manifest_version = '2.0'
34e48bed 1878 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762 1879 # Remove unsupported DRM protected media from final formats
067aa17e 1880 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
b22ca762
S
1881 media_nodes = remove_encrypted_media(media_nodes)
1882 if not media_nodes:
1883 return formats
48107c19
S
1884
1885 manifest_base_url = get_base_url(manifest)
0a5685b2 1886
a6571f10 1887 bootstrap_info = xpath_element(
0a5685b2
YCH
1888 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1889 'bootstrap info', default=None)
1890
edd6074c
RA
1891 vcodec = None
1892 mime_type = xpath_text(
1893 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1894 'base URL', default=None)
1895 if mime_type and mime_type.startswith('audio/'):
1896 vcodec = 'none'
1897
b2527359 1898 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1899 tbr = int_or_none(media_el.attrib.get('bitrate'))
1900 width = int_or_none(media_el.attrib.get('width'))
1901 height = int_or_none(media_el.attrib.get('height'))
1902 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
448bb5f3
YCH
1903 # If <bootstrapInfo> is present, the specified f4m is a
1904 # stream-level manifest, and only set-level manifests may refer to
1905 # external resources. See section 11.4 and section 4 of F4M spec
1906 if bootstrap_info is None:
1907 media_url = None
1908 # @href is introduced in 2.0, see section 11.6 of F4M spec
1909 if manifest_version == '2.0':
1910 media_url = media_el.attrib.get('href')
1911 if media_url is None:
1912 media_url = media_el.attrib.get('url')
31c746e5
S
1913 if not media_url:
1914 continue
cc357c4d
S
1915 manifest_url = (
1916 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 1917 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
1918 # If media_url is itself an f4m manifest, do the recursive extraction,
1919 # since bitrates in the parent manifest (this one) and the media_url manifest
1920 # may differ, making it impossible to resolve the format by the requested
1921 # bitrate in the f4m downloader
240b6045
YCH
1922 ext = determine_ext(manifest_url)
1923 if ext == 'f4m':
77b8b4e6 1924 f4m_formats = self._extract_f4m_formats(
f983b875 1925 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
77b8b4e6
S
1926 transform_source=transform_source, fatal=fatal)
1927 # Sometimes a stream-level manifest contains a single media entry that
1928 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1929 # At the same time the parent's media entry in the set-level manifest may
1930 # contain it. We will copy it from the parent in such cases.
1931 if len(f4m_formats) == 1:
1932 f = f4m_formats[0]
1933 f.update({
1934 'tbr': f.get('tbr') or tbr,
1935 'width': f.get('width') or width,
1936 'height': f.get('height') or height,
1937 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 1938 'vcodec': vcodec,
77b8b4e6
S
1939 })
1940 formats.extend(f4m_formats)
70f0f5a8 1941 continue
240b6045
YCH
1942 elif ext == 'm3u8':
1943 formats.extend(self._extract_m3u8_formats(
1944 manifest_url, video_id, 'mp4', preference=preference,
f983b875 1945 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
240b6045 1946 continue
31bb8d3f 1947 formats.append({
77b8b4e6 1948 'format_id': format_id,
31bb8d3f 1949 'url': manifest_url,
30d0b549 1950 'manifest_url': manifest_url,
a6571f10 1951 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 1952 'protocol': 'f4m',
b2527359 1953 'tbr': tbr,
77b8b4e6
S
1954 'width': width,
1955 'height': height,
edd6074c 1956 'vcodec': vcodec,
60ca389c 1957 'preference': preference,
f983b875 1958 'quality': quality,
31bb8d3f 1959 })
31bb8d3f
JMF
1960 return formats
1961
f983b875 1962 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1963 return {
f207019c 1964 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1965 'url': m3u8_url,
1966 'ext': ext,
1967 'protocol': 'm3u8',
37768f92 1968 'preference': preference - 100 if preference else -100,
f983b875 1969 'quality': quality,
704df56d
PH
1970 'resolution': 'multiple',
1971 'format_note': 'Quality selection URL',
16da9bbc
YCH
1972 }
1973
b5ae35ee 1974 def _report_ignoring_subs(self, name):
1975 self.report_warning(bug_reports_message(
1976 f'Ignoring subtitle tracks found in the {name} manifest; '
1977 'if any subtitle tracks are missing,'
1978 ), only_once=True)
1979
a0c3b2d5
F
1980 def _extract_m3u8_formats(self, *args, **kwargs):
1981 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1982 if subs:
b5ae35ee 1983 self._report_ignoring_subs('HLS')
a0c3b2d5
F
1984 return fmts
1985
1986 def _extract_m3u8_formats_and_subtitles(
177877c5 1987 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1988 preference=None, quality=None, m3u8_id=None, note=None,
1989 errnote=None, fatal=True, live=False, data=None, headers={},
1990 query={}):
1991
dbd82a1d 1992 res = self._download_webpage_handle(
81515ad9 1993 m3u8_url, video_id,
37a3bb66 1994 note='Downloading m3u8 information' if note is None else note,
1995 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1996 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1997
dbd82a1d 1998 if res is False:
a0c3b2d5 1999 return [], {}
cb252080 2000
dbd82a1d 2001 m3u8_doc, urlh = res
37113045 2002 m3u8_url = urlh.geturl()
9cdffeeb 2003
a0c3b2d5 2004 return self._parse_m3u8_formats_and_subtitles(
cb252080 2005 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 2006 preference=preference, quality=quality, m3u8_id=m3u8_id,
2007 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2008 headers=headers, query=query, video_id=video_id)
cb252080 2009
a0c3b2d5 2010 def _parse_m3u8_formats_and_subtitles(
177877c5 2011 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
2012 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2013 errnote=None, fatal=True, data=None, headers={}, query={},
2014 video_id=None):
60755938 2015 formats, subtitles = [], {}
a0c3b2d5 2016
08a00eef 2017 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
60755938 2018 return formats, subtitles
08a00eef 2019
ba107574 2020 has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
a0c3b2d5 2021
60755938 2022 def format_url(url):
2023 return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
2024
2025 if self.get_param('hls_split_discontinuity', False):
2026 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2027 if not m3u8_doc:
2028 if not manifest_url:
2029 return []
2030 m3u8_doc = self._download_webpage(
2031 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2032 note=False, errnote='Failed to download m3u8 playlist information')
2033 if m3u8_doc is False:
2034 return []
2035 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
0def7587 2036
60755938 2037 else:
2038 def _extract_m3u8_playlist_indices(*args, **kwargs):
2039 return [None]
310c2ed2 2040
cb252080
S
2041 # References:
2042 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
2043 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2044 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
2045
2046 # We should try extracting formats only from master playlists [1, 4.3.4],
2047 # i.e. playlists that describe available qualities. On the other hand
2048 # media playlists [1, 4.3.3] should be returned as is since they contain
2049 # just the media without quality renditions.
9cdffeeb 2050 # Fortunately, a master playlist can easily be distinguished from a media
cb252080 2051 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
a0566bbf 2052 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
2053 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2054 # media playlist and MUST NOT appear in a master playlist, thus we can
2055 # clearly detect a media playlist with this criterion.
2056
9cdffeeb 2057 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
60755938 2058 formats = [{
2059 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
2060 'format_index': idx,
2061 'url': m3u8_url,
2062 'ext': ext,
2063 'protocol': entry_protocol,
2064 'preference': preference,
2065 'quality': quality,
88acdbc2 2066 'has_drm': has_drm,
60755938 2067 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
310c2ed2 2068
a0c3b2d5 2069 return formats, subtitles
cb252080
S
2070
2071 groups = {}
2072 last_stream_inf = {}
2073
2074 def extract_media(x_media_line):
2075 media = parse_m3u8_attributes(x_media_line)
2076 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2077 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2078 if not (media_type and group_id and name):
2079 return
2080 groups.setdefault(group_id, []).append(media)
a0c3b2d5
F
2081 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2082 if media_type == 'SUBTITLES':
3907333c 2083 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2084 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2085 # However, lack of URI has been spotted in the wild.
2086 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2087 if not media.get('URI'):
2088 return
a0c3b2d5
F
2089 url = format_url(media['URI'])
2090 sub_info = {
2091 'url': url,
2092 'ext': determine_ext(url),
2093 }
4a2f19ab
F
2094 if sub_info['ext'] == 'm3u8':
2095 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2096 # files may contain is WebVTT:
2097 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2098 sub_info['ext'] = 'vtt'
2099 sub_info['protocol'] = 'm3u8_native'
37a3bb66 2100 lang = media.get('LANGUAGE') or 'und'
a0c3b2d5 2101 subtitles.setdefault(lang, []).append(sub_info)
cb252080
S
2102 if media_type not in ('VIDEO', 'AUDIO'):
2103 return
2104 media_url = media.get('URI')
2105 if media_url:
310c2ed2 2106 manifest_url = format_url(media_url)
60755938 2107 formats.extend({
2108 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
2109 'format_note': name,
2110 'format_index': idx,
2111 'url': manifest_url,
2112 'manifest_url': m3u8_url,
2113 'language': media.get('LANGUAGE'),
2114 'ext': ext,
2115 'protocol': entry_protocol,
2116 'preference': preference,
2117 'quality': quality,
2118 'vcodec': 'none' if media_type == 'AUDIO' else None,
2119 } for idx in _extract_m3u8_playlist_indices(manifest_url))
cb252080
S
2120
2121 def build_stream_name():
2122 # Although the specification does not mention a NAME attribute for the
3019cb0c
S
2123 # EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2124 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 2125 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
2126 stream_name = last_stream_inf.get('NAME')
2127 if stream_name:
2128 return stream_name
2129 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2130 # from corresponding rendition group
2131 stream_group_id = last_stream_inf.get('VIDEO')
2132 if not stream_group_id:
2133 return
2134 stream_group = groups.get(stream_group_id)
2135 if not stream_group:
2136 return stream_group_id
2137 rendition = stream_group[0]
2138 return rendition.get('NAME') or stream_group_id
2139
379306ef 2140 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
2141 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2142 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
2143 for line in m3u8_doc.splitlines():
2144 if line.startswith('#EXT-X-MEDIA:'):
2145 extract_media(line)
2146
704df56d
PH
2147 for line in m3u8_doc.splitlines():
2148 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 2149 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
2150 elif line.startswith('#') or not line.strip():
2151 continue
2152 else:
9c99bef7 2153 tbr = float_or_none(
3089bc74
S
2154 last_stream_inf.get('AVERAGE-BANDWIDTH')
2155 or last_stream_inf.get('BANDWIDTH'), scale=1000)
30d0b549 2156 manifest_url = format_url(line.strip())
5ef62fc4 2157
60755938 2158 for idx in _extract_m3u8_playlist_indices(manifest_url):
2159 format_id = [m3u8_id, None, idx]
310c2ed2 2160 # Bandwidth of live streams may differ over time thus making
2161 # format_id unpredictable. So it's better to keep provided
2162 # format_id intact.
2163 if not live:
60755938 2164 stream_name = build_stream_name()
2165 format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
310c2ed2 2166 f = {
60755938 2167 'format_id': '-'.join(map(str, filter(None, format_id))),
2168 'format_index': idx,
310c2ed2 2169 'url': manifest_url,
2170 'manifest_url': m3u8_url,
2171 'tbr': tbr,
2172 'ext': ext,
2173 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2174 'protocol': entry_protocol,
2175 'preference': preference,
2176 'quality': quality,
2177 }
2178 resolution = last_stream_inf.get('RESOLUTION')
2179 if resolution:
2180 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2181 if mobj:
2182 f['width'] = int(mobj.group('width'))
2183 f['height'] = int(mobj.group('height'))
2184 # Unified Streaming Platform
2185 mobj = re.search(
2186 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2187 if mobj:
2188 abr, vbr = mobj.groups()
2189 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2190 f.update({
2191 'vbr': vbr,
2192 'abr': abr,
2193 })
2194 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2195 f.update(codecs)
2196 audio_group_id = last_stream_inf.get('AUDIO')
2197 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2198 # references a rendition group MUST have a CODECS attribute.
2199 # However, this is not always respected, for example, [2]
2200 # contains EXT-X-STREAM-INF tag which references AUDIO
2201 # rendition group but does not have CODECS and despite
2202 # referencing an audio group it represents a complete
2203 # (with audio and video) format. So, for such cases we will
2204 # ignore references to rendition groups and treat them
2205 # as complete formats.
2206 if audio_group_id and codecs and f.get('vcodec') != 'none':
2207 audio_group = groups.get(audio_group_id)
2208 if audio_group and audio_group[0].get('URI'):
2209 # TODO: update acodec for audio only formats with
2210 # the same GROUP-ID
2211 f['acodec'] = 'none'
fc21af50 2212 if not f.get('ext'):
2213 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
310c2ed2 2214 formats.append(f)
2215
2216 # for DailyMotion
2217 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2218 if progressive_uri:
2219 http_f = f.copy()
2220 del http_f['manifest_url']
2221 http_f.update({
2222 'format_id': f['format_id'].replace('hls-', 'http-'),
2223 'protocol': 'http',
2224 'url': progressive_uri,
2225 })
2226 formats.append(http_f)
5ef62fc4 2227
cb252080 2228 last_stream_inf = {}
a0c3b2d5 2229 return formats, subtitles
704df56d 2230
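# Illustrative sketch (not part of this module): the usual call pattern for
# the HLS helpers above from an extractor (hls_url/video_id are hypothetical):
#
#     formats, subtitles = self._extract_m3u8_formats_and_subtitles(
#         hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
#         m3u8_id='hls', fatal=False)
#     self._sort_formats(formats)
#
# A master playlist yields one format per EXT-X-STREAM-INF variant (plus
# audio/subtitle renditions from EXT-X-MEDIA), while a media playlist yields
# a single format, as distinguished above via #EXT-X-TARGETDURATION.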
3cf4b91d
C
2231 def _extract_m3u8_vod_duration(
2232 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2233
2234 m3u8_vod = self._download_webpage(
2235 m3u8_vod_url, video_id,
2236 note='Downloading m3u8 VOD manifest' if note is None else note,
2237 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2238 fatal=False, data=data, headers=headers, query=query)
2239
2240 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2241
2242 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2243 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2244 return None
2245
2246 return int(sum(
2247 float(line[len('#EXTINF:'):].split(',')[0])
2248 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2249
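# Illustrative sketch (not part of this module): the arithmetic performed by
# _parse_m3u8_vod_duration() above on a (made-up) VOD media playlist.
_demo_vod = '\n'.join((
    '#EXTM3U',
    '#EXT-X-PLAYLIST-TYPE:VOD',
    '#EXT-X-TARGETDURATION:10',
    '#EXTINF:9.009,', 'segment0.ts',
    '#EXTINF:4.5,', 'segment1.ts',
    '#EXT-X-ENDLIST',
))
# sum of the EXTINF values = 9.009 + 4.5 = 13.509 -> int(...) -> 13 seconds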
a107193e
S
2250 @staticmethod
2251 def _xpath_ns(path, namespace=None):
2252 if not namespace:
2253 return path
2254 out = []
2255 for c in path.split('/'):
2256 if not c or c == '.':
2257 out.append(c)
2258 else:
2259 out.append('{%s}%s' % (namespace, c))
2260 return '/'.join(out)
2261
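# Illustrative sketch (not part of this module): what _xpath_ns() above
# produces; the namespace URI here is just an example value.
#     _xpath_ns('./head/meta', 'http://www.w3.org/2001/SMIL20/Language')
#     -> './{http://www.w3.org/2001/SMIL20/Language}head/{http://www.w3.org/2001/SMIL20/Language}meta'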
da1c94ee 2262 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
09f572fb 2263 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2264
995029a1
PH
2265 if smil is False:
2266 assert not fatal
2267 return []
e89a2aab 2268
17712eeb 2269 namespace = self._parse_smil_namespace(smil)
a107193e 2270
da1c94ee 2271 fmts = self._parse_smil_formats(
a107193e 2272 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2273 subs = self._parse_smil_subtitles(
2274 smil, namespace=namespace)
2275
2276 return fmts, subs
2277
2278 def _extract_smil_formats(self, *args, **kwargs):
2279 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2280 if subs:
b5ae35ee 2281 self._report_ignoring_subs('SMIL')
da1c94ee 2282 return fmts
a107193e
S
2283
2284 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2285 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2286 if smil is False:
2287 return {}
2288 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2289
09f572fb 2290 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2291 return self._download_xml(
2292 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2293 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2294
2295 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2296 namespace = self._parse_smil_namespace(smil)
a107193e
S
2297
2298 formats = self._parse_smil_formats(
2299 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2300 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2301
2302 video_id = os.path.splitext(url_basename(smil_url))[0]
2303 title = None
2304 description = None
647eab45 2305 upload_date = None
a107193e
S
2306 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2307 name = meta.attrib.get('name')
2308 content = meta.attrib.get('content')
2309 if not name or not content:
2310 continue
2311 if not title and name == 'title':
2312 title = content
2313 elif not description and name in ('description', 'abstract'):
2314 description = content
647eab45
S
2315 elif not upload_date and name == 'date':
2316 upload_date = unified_strdate(content)
a107193e 2317
1e5bcdec
S
2318 thumbnails = [{
2319 'id': image.get('type'),
2320 'url': image.get('src'),
2321 'width': int_or_none(image.get('width')),
2322 'height': int_or_none(image.get('height')),
2323 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2324
a107193e
S
2325 return {
2326 'id': video_id,
2327 'title': title or video_id,
2328 'description': description,
647eab45 2329 'upload_date': upload_date,
1e5bcdec 2330 'thumbnails': thumbnails,
a107193e
S
2331 'formats': formats,
2332 'subtitles': subtitles,
2333 }
2334
17712eeb
S
2335 def _parse_smil_namespace(self, smil):
2336 return self._search_regex(
2337 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2338
f877c6ae 2339 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
2340 base = smil_url
2341 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2342 b = meta.get('base') or meta.get('httpBase')
2343 if b:
2344 base = b
2345 break
e89a2aab
S
2346
2347 formats = []
2348 rtmp_count = 0
a107193e 2349 http_count = 0
7f32e5dc 2350 m3u8_count = 0
9359f3d4 2351 imgs_count = 0
a107193e 2352
9359f3d4 2353 srcs = set()
ad96b4c8
YCH
2354 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2355 for medium in media:
2356 src = medium.get('src')
81e1c4e2 2357 if not src or src in srcs:
a107193e 2358 continue
9359f3d4 2359 srcs.add(src)
a107193e 2360
ad96b4c8
YCH
2361 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2362 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2363 width = int_or_none(medium.get('width'))
2364 height = int_or_none(medium.get('height'))
2365 proto = medium.get('proto')
2366 ext = medium.get('ext')
a107193e 2367 src_ext = determine_ext(src)
ad96b4c8 2368 streamer = medium.get('streamer') or base
a107193e
S
2369
2370 if proto == 'rtmp' or streamer.startswith('rtmp'):
2371 rtmp_count += 1
2372 formats.append({
2373 'url': streamer,
2374 'play_path': src,
2375 'ext': 'flv',
2376 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2377 'tbr': bitrate,
2378 'filesize': filesize,
2379 'width': width,
2380 'height': height,
2381 })
f877c6ae
YCH
2382 if transform_rtmp_url:
2383 streamer, src = transform_rtmp_url(streamer, src)
2384 formats[-1].update({
2385 'url': streamer,
2386 'play_path': src,
2387 })
a107193e
S
2388 continue
2389
2390 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 2391 src_url = src_url.strip()
a107193e
S
2392
2393 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 2394 m3u8_formats = self._extract_m3u8_formats(
2395 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2396 if len(m3u8_formats) == 1:
2397 m3u8_count += 1
2398 m3u8_formats[0].update({
2399 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2400 'tbr': bitrate,
2401 'width': width,
2402 'height': height,
2403 })
2404 formats.extend(m3u8_formats)
bd21ead2 2405 elif src_ext == 'f4m':
a107193e
S
2406 f4m_url = src_url
2407 if not f4m_params:
2408 f4m_params = {
2409 'hdcore': '3.2.0',
2410 'plugin': 'flowplayer-3.2.0.1',
2411 }
2412 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 2413 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 2414 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
bd21ead2
RA
2415 elif src_ext == 'mpd':
2416 formats.extend(self._extract_mpd_formats(
2417 src_url, video_id, mpd_id='dash', fatal=False))
2418 elif re.search(r'\.ism/[Mm]anifest', src_url):
2419 formats.extend(self._extract_ism_formats(
2420 src_url, video_id, ism_id='mss', fatal=False))
2421 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
2422 http_count += 1
2423 formats.append({
2424 'url': src_url,
2425 'ext': ext or src_ext or 'flv',
2426 'format_id': 'http-%d' % (bitrate or http_count),
2427 'tbr': bitrate,
2428 'filesize': filesize,
2429 'width': width,
2430 'height': height,
2431 })
63757032 2432
9359f3d4
F
2433 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2434 src = medium.get('src')
2435 if not src or src in srcs:
2436 continue
2437 srcs.add(src)
2438
2439 imgs_count += 1
2440 formats.append({
2441 'format_id': 'imagestream-%d' % (imgs_count),
2442 'url': src,
2443 'ext': mimetype2ext(medium.get('type')),
2444 'acodec': 'none',
2445 'vcodec': 'none',
2446 'width': int_or_none(medium.get('width')),
2447 'height': int_or_none(medium.get('height')),
2448 'format_note': 'SMIL storyboards',
2449 })
2450
e89a2aab
S
2451 return formats
2452
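# Illustrative sketch (not part of this module): a (made-up) SMIL <video>
# entry and the progressive HTTP format _parse_smil_formats() above would
# roughly emit for it (the rtmp/m3u8/f4m/mpd/ism branches are separate):
#     <video src="/media/clip-720p.mp4" system-bitrate="1500000"
#            width="1280" height="720"/>
# with base 'https://example.com' becomes approximately:
#     {'url': 'https://example.com/media/clip-720p.mp4', 'ext': 'mp4',
#      'format_id': 'http-1500', 'tbr': 1500.0, 'width': 1280, 'height': 720}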
ce00af87 2453 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2454 urls = []
a107193e
S
2455 subtitles = {}
2456 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2457 src = textstream.get('src')
d413095f 2458 if not src or src in urls:
a107193e 2459 continue
d413095f 2460 urls.append(src)
df634be2 2461 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2462 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2463 subtitles.setdefault(lang, []).append({
2464 'url': src,
2465 'ext': ext,
2466 })
2467 return subtitles
63757032 2468
47a5cb77 2469 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2470 xspf = self._download_xml(
47a5cb77 2471 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2472 'Unable to download xspf manifest', fatal=fatal)
2473 if xspf is False:
2474 return []
47a5cb77
S
2475 return self._parse_xspf(
2476 xspf, playlist_id, xspf_url=xspf_url,
2477 xspf_base_url=base_url(xspf_url))
8d6765cf 2478
47a5cb77 2479 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2480 NS_MAP = {
2481 'xspf': 'http://xspf.org/ns/0/',
2482 's1': 'http://static.streamone.nl/player/ns/0',
2483 }
2484
2485 entries = []
47a5cb77 2486 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2487 title = xpath_text(
98044462 2488 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2489 description = xpath_text(
2490 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2491 thumbnail = xpath_text(
2492 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2493 duration = float_or_none(
2494 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2495
47a5cb77
S
2496 formats = []
2497 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2498 format_url = urljoin(xspf_base_url, location.text)
2499 if not format_url:
2500 continue
2501 formats.append({
2502 'url': format_url,
2503 'manifest_url': xspf_url,
2504 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2505 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2506 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2507 })
8d6765cf
S
2508 self._sort_formats(formats)
2509
2510 entries.append({
2511 'id': playlist_id,
2512 'title': title,
2513 'description': description,
2514 'thumbnail': thumbnail,
2515 'duration': duration,
2516 'formats': formats,
2517 })
2518 return entries
2519
171e59ed
F
2520 def _extract_mpd_formats(self, *args, **kwargs):
2521 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2522 if subs:
b5ae35ee 2523 self._report_ignoring_subs('DASH')
171e59ed
F
2524 return fmts
2525
2526 def _extract_mpd_formats_and_subtitles(
2527 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2528 fatal=True, data=None, headers={}, query={}):
47a5cb77 2529 res = self._download_xml_handle(
1bac3455 2530 mpd_url, video_id,
37a3bb66 2531 note='Downloading MPD manifest' if note is None else note,
2532 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2533 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2534 if res is False:
171e59ed 2535 return [], {}
47a5cb77 2536 mpd_doc, urlh = res
c25720ef 2537 if mpd_doc is None:
171e59ed 2538 return [], {}
02dc0a36 2539 mpd_base_url = base_url(urlh.geturl())
1bac3455 2540
171e59ed 2541 return self._parse_mpd_formats_and_subtitles(
545cc85d 2542 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2543
171e59ed
F
2544 def _parse_mpd_formats(self, *args, **kwargs):
2545 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2546 if subs:
b5ae35ee 2547 self._report_ignoring_subs('DASH')
171e59ed
F
2548 return fmts
2549
2550 def _parse_mpd_formats_and_subtitles(
2551 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2552 """
2553 Parse formats from MPD manifest.
2554 References:
2555 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2556 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2557 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2558 """
a06916d9 2559 if not self.get_param('dynamic_mpd', True):
78895bd3 2560 if mpd_doc.get('type') == 'dynamic':
171e59ed 2561 return [], {}
2d2fa82d 2562
91cb6b50 2563 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2564
2565 def _add_ns(path):
2566 return self._xpath_ns(path, namespace)
2567
675d0016 2568 def is_drm_protected(element):
2569 return element.find(_add_ns('ContentProtection')) is not None
2570
1bac3455 2571 def extract_multisegment_info(element, ms_parent_info):
2572 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2573
2574 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2575 # common attributes and elements. We will only extract what is
2576 # relevant for us.
2577 def extract_common(source):
2578 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2579 if segment_timeline is not None:
2580 s_e = segment_timeline.findall(_add_ns('S'))
2581 if s_e:
2582 ms_info['total_number'] = 0
2583 ms_info['s'] = []
2584 for s in s_e:
2585 r = int(s.get('r', 0))
2586 ms_info['total_number'] += 1 + r
2587 ms_info['s'].append({
2588 't': int(s.get('t', 0)),
2589 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2590 'd': int(s.attrib['d']),
2591 'r': r,
2592 })
2593 start_number = source.get('startNumber')
2594 if start_number:
2595 ms_info['start_number'] = int(start_number)
2596 timescale = source.get('timescale')
2597 if timescale:
2598 ms_info['timescale'] = int(timescale)
2599 segment_duration = source.get('duration')
2600 if segment_duration:
48504785 2601 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2602
2603 def extract_Initialization(source):
2604 initialization = source.find(_add_ns('Initialization'))
2605 if initialization is not None:
2606 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2607
f14be228 2608 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2609 if segment_list is not None:
b4c1d6e8
S
2610 extract_common(segment_list)
2611 extract_Initialization(segment_list)
f14be228 2612 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2613 if segment_urls_e:
2614 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2615 else:
f14be228 2616 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2617 if segment_template is not None:
b4c1d6e8 2618 extract_common(segment_template)
e228616c
S
2619 media = segment_template.get('media')
2620 if media:
2621 ms_info['media'] = media
1bac3455 2622 initialization = segment_template.get('initialization')
2623 if initialization:
e228616c 2624 ms_info['initialization'] = initialization
1bac3455 2625 else:
b4c1d6e8 2626 extract_Initialization(segment_template)
1bac3455 2627 return ms_info
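# Illustration only (hypothetical manifest values, not taken from the code above):
# for a <SegmentTemplate timescale="90000" media="seg-$Number$.m4s" startNumber="1">
# containing <SegmentTimeline><S t="0" d="180000" r="2"/></SegmentTimeline>,
# extract_multisegment_info() would return roughly
#   {'start_number': 1, 'timescale': 90000, 'media': 'seg-$Number$.m4s',
#    'total_number': 3, 's': [{'t': 0, 'd': 180000, 'r': 2}]}
# i.e. the single S element with r=2 describes three 2-second segments.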
b323e170 2628
1bac3455 2629 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
6251555f 2630 formats, subtitles = [], {}
2631 stream_numbers = {'audio': 0, 'video': 0}
f14be228 2632 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2633 period_duration = parse_duration(period.get('duration')) or mpd_duration
2634 period_ms_info = extract_multisegment_info(period, {
2635 'start_number': 1,
2636 'timescale': 1,
2637 })
f14be228 2638 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1bac3455 2639 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2640 for representation in adaptation_set.findall(_add_ns('Representation')):
1bac3455 2641 representation_attrib = adaptation_set.attrib.copy()
2642 representation_attrib.update(representation.attrib)
f0948348 2643 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2644 mime_type = representation_attrib['mimeType']
171e59ed
F
2645 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2646
be2fc5b2 2647 codecs = representation_attrib.get('codecs', '')
2648 if content_type not in ('video', 'audio', 'text'):
2649 if mime_type == 'image/jpeg':
a8731fcc 2650 content_type = mime_type
2651 elif codecs.split('.')[0] == 'stpp':
be2fc5b2 2652 content_type = 'text'
6993f78d 2653 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2654 content_type = 'text'
cdb19aa4 2655 else:
be2fc5b2 2656 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2657 continue
2658
2659 base_url = ''
2660 for element in (representation, adaptation_set, period, mpd_doc):
2661 base_url_e = element.find(_add_ns('BaseURL'))
2662 if base_url_e is not None:
2663 base_url = base_url_e.text + base_url
2664 if re.match(r'^https?://', base_url):
2665 break
f9cc0161
D
2666 if mpd_base_url and base_url.startswith('/'):
2667 base_url = compat_urlparse.urljoin(mpd_base_url, base_url)
2668 elif mpd_base_url and not re.match(r'^https?://', base_url):
2669 if not mpd_base_url.endswith('/'):
be2fc5b2 2670 mpd_base_url += '/'
2671 base_url = mpd_base_url + base_url
2672 representation_id = representation_attrib.get('id')
2673 lang = representation_attrib.get('lang')
2674 url_el = representation.find(_add_ns('BaseURL'))
2675 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2676 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2677 if representation_id is not None:
2678 format_id = representation_id
2679 else:
2680 format_id = content_type
2681 if mpd_id:
2682 format_id = mpd_id + '-' + format_id
2683 if content_type in ('video', 'audio'):
2684 f = {
2685 'format_id': format_id,
2686 'manifest_url': mpd_url,
2687 'ext': mimetype2ext(mime_type),
2688 'width': int_or_none(representation_attrib.get('width')),
2689 'height': int_or_none(representation_attrib.get('height')),
2690 'tbr': float_or_none(bandwidth, 1000),
2691 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2692 'fps': int_or_none(representation_attrib.get('frameRate')),
2693 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2694 'format_note': 'DASH %s' % content_type,
2695 'filesize': filesize,
2696 'container': mimetype2ext(mime_type) + '_dash',
6251555f 2697 'manifest_stream_number': stream_numbers[content_type]
be2fc5b2 2698 }
2699 f.update(parse_codecs(codecs))
6251555f 2700 stream_numbers[content_type] += 1
be2fc5b2 2701 elif content_type == 'text':
2702 f = {
2703 'ext': mimetype2ext(mime_type),
2704 'manifest_url': mpd_url,
2705 'filesize': filesize,
2706 }
2707 elif content_type == 'image/jpeg':
2708 # See test case in VikiIE
2709 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2710 f = {
2711 'format_id': format_id,
2712 'ext': 'mhtml',
2713 'manifest_url': mpd_url,
2714 'format_note': 'DASH storyboards (jpeg)',
2715 'acodec': 'none',
2716 'vcodec': 'none',
2717 }
88acdbc2 2718 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2719 f['has_drm'] = True
be2fc5b2 2720 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2721
2722 def prepare_template(template_name, identifiers):
2723 tmpl = representation_ms_info[template_name]
2724 # First of all, % characters outside $...$ templates
2725 # must be escaped by doubling for proper processing
2726 # by the % operator string formatting used below (see
2727 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2728 t = ''
2729 in_template = False
2730 for c in tmpl:
2731 t += c
2732 if c == '$':
2733 in_template = not in_template
2734 elif c == '%' and not in_template:
eca1f0d1 2735 t += c
be2fc5b2 2736 # Next, $...$ templates are translated to their
2737 # %(...) counterparts to be used with % operator
2738 if representation_id is not None:
2739 t = t.replace('$RepresentationID$', representation_id)
2740 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2741 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2742 t = t.replace('$$', '$')
2743 return t
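# Worked example (hypothetical values): with representation_id = 'video_1',
# prepare_template('media', ('Number', 'Bandwidth', 'Time')) turns the template
#   '$RepresentationID$/chunk_$Number%05d$.m4s'
# into
#   'video_1/chunk_%(Number)05d.m4s'
# which is later expanded with the % operator, e.g.
#   'video_1/chunk_%(Number)05d.m4s' % {'Number': 7} == 'video_1/chunk_00007.m4s'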
2744
2745 # @initialization is a regular template like @media one
2746 # so it should be handled just the same way (see
2747 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2748 if 'initialization' in representation_ms_info:
2749 initialization_template = prepare_template(
2750 'initialization',
2751 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2752 # $Time$ shall not be included for @initialization thus
2753 # only $Bandwidth$ remains
2754 ('Bandwidth', ))
2755 representation_ms_info['initialization_url'] = initialization_template % {
2756 'Bandwidth': bandwidth,
2757 }
2758
2759 def location_key(location):
2760 return 'url' if re.match(r'^https?://', location) else 'path'
2761
2762 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2763
2764 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2765 media_location_key = location_key(media_template)
2766
2767 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2768 # can't be used at the same time
2769 if '%(Number' in media_template and 's' not in representation_ms_info:
2770 segment_duration = None
2771 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2772 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2773 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2774 representation_ms_info['fragments'] = [{
2775 media_location_key: media_template % {
2776 'Number': segment_number,
2777 'Bandwidth': bandwidth,
2778 },
2779 'duration': segment_duration,
2780 } for segment_number in range(
2781 representation_ms_info['start_number'],
2782 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
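# For instance (hypothetical numbers): with period_duration = 10, a segment_duration
# of 2.5 seconds and start_number = 1, total_number becomes ceil(10 / 2.5) = 4 and
# fragments are generated for Number = 1, 2, 3, 4.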
2783 else:
2784 # $Number*$ or $Time$ in media template with S list available
2785 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2786 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2787 representation_ms_info['fragments'] = []
2788 segment_time = 0
2789 segment_d = None
2790 segment_number = representation_ms_info['start_number']
2791
2792 def add_segment_url():
2793 segment_url = media_template % {
2794 'Time': segment_time,
2795 'Bandwidth': bandwidth,
2796 'Number': segment_number,
2797 }
2798 representation_ms_info['fragments'].append({
2799 media_location_key: segment_url,
2800 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2801 })
2802
2803 for num, s in enumerate(representation_ms_info['s']):
2804 segment_time = s.get('t') or segment_time
2805 segment_d = s['d']
2806 add_segment_url()
2807 segment_number += 1
2808 for r in range(s.get('r', 0)):
2809 segment_time += segment_d
f0948348 2810 add_segment_url()
b4c1d6e8 2811 segment_number += 1
be2fc5b2 2812 segment_time += segment_d
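# For example (hypothetical S list): a single entry {'t': 0, 'd': 90000, 'r': 1} with
# timescale 90000 yields two one-second fragments, expanded from the media template
# with Time = 0 and Time = 90000 (and consecutive Number values).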
2813 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2814 # No media template
2815 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2816 # or any YouTube dashsegments video
2817 fragments = []
2818 segment_index = 0
2819 timescale = representation_ms_info['timescale']
2820 for s in representation_ms_info['s']:
2821 duration = float_or_none(s['d'], timescale)
2822 for r in range(s.get('r', 0) + 1):
2823 segment_uri = representation_ms_info['segment_urls'][segment_index]
2824 fragments.append({
2825 location_key(segment_uri): segment_uri,
2826 'duration': duration,
2827 })
2828 segment_index += 1
2829 representation_ms_info['fragments'] = fragments
2830 elif 'segment_urls' in representation_ms_info:
2831 # Segment URLs with no SegmentTimeline
2832 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2833 # https://github.com/ytdl-org/youtube-dl/pull/14844
2834 fragments = []
2835 segment_duration = float_or_none(
2836 representation_ms_info['segment_duration'],
2837 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2838 for segment_url in representation_ms_info['segment_urls']:
2839 fragment = {
2840 location_key(segment_url): segment_url,
2841 }
2842 if segment_duration:
2843 fragment['duration'] = segment_duration
2844 fragments.append(fragment)
2845 representation_ms_info['fragments'] = fragments
2846 # If there is a fragments key available then we correctly recognized fragmented media.
2847 # Otherwise we will assume unfragmented media with direct access. Technically, such
2848 # an assumption is not necessarily correct since we may simply have no support for
2849 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2850 if 'fragments' in representation_ms_info:
2851 f.update({
2852 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2853 'url': mpd_url or base_url,
2854 'fragment_base_url': base_url,
2855 'fragments': [],
2856 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2857 })
2858 if 'initialization_url' in representation_ms_info:
2859 initialization_url = representation_ms_info['initialization_url']
2860 if not f.get('url'):
2861 f['url'] = initialization_url
2862 f['fragments'].append({location_key(initialization_url): initialization_url})
2863 f['fragments'].extend(representation_ms_info['fragments'])
17b598d3 2864 else:
be2fc5b2 2865 # Assuming direct URL to unfragmented media.
2866 f['url'] = base_url
2867 if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
2868 formats.append(f)
2869 elif content_type == 'text':
2870 subtitles.setdefault(lang or 'und', []).append(f)
2871
171e59ed 2872 return formats, subtitles
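# A minimal usage sketch (hypothetical URL and variable names) of the public wrapper
# defined above, as it might appear in an extractor's _real_extract():
#
#   formats, subtitles = self._extract_mpd_formats_and_subtitles(
#       'https://example.com/stream/manifest.mpd', video_id, mpd_id='dash', fatal=False)
#   self._sort_formats(formats)
#   return {'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles}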
17b598d3 2873
fd76a142
F
2874 def _extract_ism_formats(self, *args, **kwargs):
2875 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2876 if subs:
b5ae35ee 2877 self._report_ignoring_subs('ISM')
fd76a142
F
2878 return fmts
2879
2880 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2881 res = self._download_xml_handle(
b2758123 2882 ism_url, video_id,
37a3bb66 2883 note='Downloading ISM manifest' if note is None else note,
2884 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2885 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2886 if res is False:
fd76a142 2887 return [], {}
47a5cb77 2888 ism_doc, urlh = res
13b08034 2889 if ism_doc is None:
fd76a142 2890 return [], {}
b2758123 2891
fd76a142 2892 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 2893
fd76a142 2894 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2895 """
2896 Parse formats from ISM manifest.
2897 References:
2898 1. [MS-SSTR]: Smooth Streaming Protocol,
2899 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2900 """
06869367 2901 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 2902 return [], {}
b2758123 2903
b2758123
RA
2904 duration = int(ism_doc.attrib['Duration'])
2905 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2906
2907 formats = []
fd76a142 2908 subtitles = {}
b2758123
RA
2909 for stream in ism_doc.findall('StreamIndex'):
2910 stream_type = stream.get('Type')
fd76a142 2911 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
2912 continue
2913 url_pattern = stream.attrib['Url']
2914 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2915 stream_name = stream.get('Name')
fd76a142 2916 stream_language = stream.get('Language', 'und')
b2758123 2917 for track in stream.findall('QualityLevel'):
e2efe599 2918 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
b2758123 2919 # TODO: add support for WVC1 and WMAP
66a1b864 2920 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
b2758123
RA
2921 self.report_warning('%s is not a supported codec' % fourcc)
2922 continue
2923 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2924 # [1] does not mention Width and Height attributes. However,
2925 # they're often present while MaxWidth and MaxHeight are
2926 # missing, so they should be used as fallbacks
2927 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2928 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2929 sampling_rate = int_or_none(track.get('SamplingRate'))
2930
2931 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2932 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2933
2934 fragments = []
2935 fragment_ctx = {
2936 'time': 0,
2937 }
2938 stream_fragments = stream.findall('c')
2939 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2940 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2941 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2942 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2943 if not fragment_ctx['duration']:
2944 try:
2945 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2946 except IndexError:
2947 next_fragment_time = duration
1616f9b4 2948 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2949 for _ in range(fragment_repeat):
2950 fragments.append({
1616f9b4 2951 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2952 'duration': fragment_ctx['duration'] / stream_timescale,
2953 })
2954 fragment_ctx['time'] += fragment_ctx['duration']
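# e.g. (hypothetical chunk element): <c t="0" d="20000000" r="2"/> with the default
# timescale of 10000000 expands to two 2-second fragments whose URLs substitute
# {start time} with 0 and 20000000 respectively (here r is a total repeat count,
# unlike the DASH S@r attribute which counts additional repeats).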
2955
2956 format_id = []
2957 if ism_id:
2958 format_id.append(ism_id)
2959 if stream_name:
2960 format_id.append(stream_name)
2961 format_id.append(compat_str(tbr))
2962
fd76a142
F
2963 if stream_type == 'text':
2964 subtitles.setdefault(stream_language, []).append({
2965 'ext': 'ismt',
2966 'protocol': 'ism',
2967 'url': ism_url,
2968 'manifest_url': ism_url,
2969 'fragments': fragments,
2970 '_download_params': {
2971 'stream_type': stream_type,
2972 'duration': duration,
2973 'timescale': stream_timescale,
2974 'fourcc': fourcc,
2975 'language': stream_language,
2976 'codec_private_data': track.get('CodecPrivateData'),
2977 }
2978 })
2979 elif stream_type in ('video', 'audio'):
2980 formats.append({
2981 'format_id': '-'.join(format_id),
2982 'url': ism_url,
2983 'manifest_url': ism_url,
2984 'ext': 'ismv' if stream_type == 'video' else 'isma',
2985 'width': width,
2986 'height': height,
2987 'tbr': tbr,
2988 'asr': sampling_rate,
2989 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2990 'acodec': 'none' if stream_type == 'video' else fourcc,
2991 'protocol': 'ism',
2992 'fragments': fragments,
88acdbc2 2993 'has_drm': ism_doc.find('Protection') is not None,
fd76a142
F
2994 '_download_params': {
2995 'stream_type': stream_type,
2996 'duration': duration,
2997 'timescale': stream_timescale,
2998 'width': width or 0,
2999 'height': height or 0,
3000 'fourcc': fourcc,
3001 'language': stream_language,
3002 'codec_private_data': track.get('CodecPrivateData'),
3003 'sampling_rate': sampling_rate,
3004 'channels': int_or_none(track.get('Channels', 2)),
3005 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3006 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3007 },
3008 })
3009 return formats, subtitles
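# Typical call from an extractor (hypothetical URL), mirroring the MPD helpers above:
#
#   formats, subtitles = self._extract_ism_formats_and_subtitles(
#       'https://example.com/video.ism/Manifest', video_id, ism_id='mss', fatal=False)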
b2758123 3010
f983b875 3011 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
6780154e
S
3012 def absolute_url(item_url):
3013 return urljoin(base_url, item_url)
59bbe491 3014
3015 def parse_content_type(content_type):
3016 if not content_type:
3017 return {}
3018 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3019 if ctr:
3020 mimetype, codecs = ctr.groups()
3021 f = parse_codecs(codecs)
3022 f['ext'] = mimetype2ext(mimetype)
3023 return f
3024 return {}
3025
868f79db 3026 def _media_formats(src, cur_media_type, type_info={}):
520251c0 3027 full_url = absolute_url(src)
82889d4a 3028 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 3029 if ext == 'm3u8':
520251c0
YCH
3030 is_plain_url = False
3031 formats = self._extract_m3u8_formats(
ad120ae1 3032 full_url, video_id, ext='mp4',
eeb0a956 3033 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
f983b875 3034 preference=preference, quality=quality, fatal=False)
87a449c1
S
3035 elif ext == 'mpd':
3036 is_plain_url = False
3037 formats = self._extract_mpd_formats(
b359e977 3038 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
3039 else:
3040 is_plain_url = True
3041 formats = [{
3042 'url': full_url,
3043 'vcodec': 'none' if cur_media_type == 'audio' else None,
3044 }]
3045 return is_plain_url, formats
3046
59bbe491 3047 entries = []
4328ddf8
S
3048 # amp-video and amp-audio are very similar to their HTML5 counterparts
3049 # so we will include them right here (see
3050 # https://www.ampproject.org/docs/reference/components/amp-video)
29f7c58a 3051 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3052 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3053 media_tags = [(media_tag, media_tag_name, media_type, '')
3054 for media_tag, media_tag_name, media_type
3055 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2aec7256
S
3056 media_tags.extend(re.findall(
3057 # We only allow video|audio followed by a whitespace or '>'.
3058 # Allowing more characters may result in a significant slowdown (see
067aa17e 3059 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2aec7256 3060 # http://www.porntrex.com/maps/videositemap.xml).
29f7c58a 3061 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3062 for media_tag, _, media_type, media_content in media_tags:
59bbe491 3063 media_info = {
3064 'formats': [],
3065 'subtitles': {},
3066 }
3067 media_attributes = extract_attributes(media_tag)
f856816b 3068 src = strip_or_none(media_attributes.get('src'))
59bbe491 3069 if src:
dedb1770 3070 _, formats = _media_formats(src, media_type)
520251c0 3071 media_info['formats'].extend(formats)
6780154e 3072 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 3073 if media_content:
3074 for source_tag in re.findall(r'<source[^>]+>', media_content):
d493f15c
S
3075 s_attr = extract_attributes(source_tag)
3076 # data-video-src and data-src are non-standard but seen
3077 # several times in the wild
f856816b 3078 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
59bbe491 3079 if not src:
3080 continue
d493f15c 3081 f = parse_content_type(s_attr.get('type'))
868f79db 3082 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 3083 if is_plain_url:
d493f15c
S
3084 # width, height, res, label and title attributes are
3085 # all non-standard but seen several times in the wild
3086 labels = [
3087 s_attr.get(lbl)
3088 for lbl in ('label', 'title')
3089 if str_or_none(s_attr.get(lbl))
3090 ]
3091 width = int_or_none(s_attr.get('width'))
3089bc74
S
3092 height = (int_or_none(s_attr.get('height'))
3093 or int_or_none(s_attr.get('res')))
d493f15c
S
3094 if not width or not height:
3095 for lbl in labels:
3096 resolution = parse_resolution(lbl)
3097 if not resolution:
3098 continue
3099 width = width or resolution.get('width')
3100 height = height or resolution.get('height')
3101 for lbl in labels:
3102 tbr = parse_bitrate(lbl)
3103 if tbr:
3104 break
3105 else:
3106 tbr = None
1ed45499 3107 f.update({
d493f15c
S
3108 'width': width,
3109 'height': height,
3110 'tbr': tbr,
3111 'format_id': s_attr.get('label') or s_attr.get('title'),
1ed45499 3112 })
520251c0
YCH
3113 f.update(formats[0])
3114 media_info['formats'].append(f)
3115 else:
3116 media_info['formats'].extend(formats)
59bbe491 3117 for track_tag in re.findall(r'<track[^>]+>', media_content):
3118 track_attributes = extract_attributes(track_tag)
3119 kind = track_attributes.get('kind')
5968d7d2 3120 if not kind or kind in ('subtitles', 'captions'):
f856816b 3121 src = strip_or_none(track_attributes.get('src'))
59bbe491 3122 if not src:
3123 continue
3124 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3125 media_info['subtitles'].setdefault(lang, []).append({
3126 'url': absolute_url(src),
3127 })
5e8e2fa5
S
3128 for f in media_info['formats']:
3129 f.setdefault('http_headers', {})['Referer'] = base_url
5968d7d2 3130 if media_info['formats'] or media_info['subtitles']:
59bbe491 3131 entries.append(media_info)
3132 return entries
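# A minimal sketch of how an extractor might use this helper (assuming `webpage` was
# already downloaded and contains a plain <video>/<audio> tag; names are hypothetical):
#
#   entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
#   if entries:
#       info = entries[0]
#       self._sort_formats(info['formats'])
#       info.update({'id': video_id, 'title': title})
#       return info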
3133
f6a1d69a
F
3134 def _extract_akamai_formats(self, *args, **kwargs):
3135 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3136 if subs:
b5ae35ee 3137 self._report_ignoring_subs('akamai')
f6a1d69a
F
3138 return fmts
3139
3140 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3141 signed = 'hdnea=' in manifest_url
3142 if not signed:
3143 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3144 manifest_url = re.sub(
3145 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3146 '', manifest_url).strip('?')
3147
c7c43a93 3148 formats = []
f6a1d69a 3149 subtitles = {}
70c5802b 3150
e71a4509 3151 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3152 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3153 hds_host = hosts.get('hds')
3154 if hds_host:
3155 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3156 if 'hdcore=' not in f4m_url:
3157 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3158 f4m_formats = self._extract_f4m_formats(
3159 f4m_url, video_id, f4m_id='hds', fatal=False)
3160 for entry in f4m_formats:
3161 entry.update({'extra_param_to_segment_url': hdcore_sign})
3162 formats.extend(f4m_formats)
70c5802b 3163
c4251b9a
RA
3164 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3165 hls_host = hosts.get('hls')
3166 if hls_host:
3167 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3168 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3169 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3170 m3u8_id='hls', fatal=False)
3171 formats.extend(m3u8_formats)
f6a1d69a 3172 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3173
3174 http_host = hosts.get('http')
29f7c58a 3175 if http_host and m3u8_formats and not signed:
3176 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3177 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3178 qualities_length = len(qualities)
29f7c58a 3179 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3180 i = 0
29f7c58a 3181 for f in m3u8_formats:
3182 if f['vcodec'] != 'none':
70c5802b 3183 for protocol in ('http', 'https'):
3184 http_f = f.copy()
3185 del http_f['manifest_url']
3186 http_url = re.sub(
29f7c58a 3187 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
70c5802b 3188 http_f.update({
3189 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3190 'url': http_url,
3191 'protocol': protocol,
3192 })
29f7c58a 3193 formats.append(http_f)
70c5802b 3194 i += 1
70c5802b 3195
f6a1d69a 3196 return formats, subtitles
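# Example invocation (hypothetical Akamai host names):
#
#   formats, subtitles = self._extract_akamai_formats_and_subtitles(
#       'https://example-i.akamaihd.net/i/video/master.m3u8', video_id,
#       hosts={'hls': 'example-hls.akamaized.net', 'http': 'example-vh.akamaihd.net'})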
c7c43a93 3197
6ad02195 3198 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 3199 query = compat_urlparse.urlparse(url).query
6ad02195 3200 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3201 mobj = re.search(
3202 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3203 url_base = mobj.group('url')
3204 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3205 formats = []
044eeb14
S
3206
3207 def manifest_url(manifest):
3208 m_url = '%s/%s' % (http_base_url, manifest)
3209 if query:
3210 m_url += '?%s' % query
3211 return m_url
3212
6ad02195
RA
3213 if 'm3u8' not in skip_protocols:
3214 formats.extend(self._extract_m3u8_formats(
044eeb14 3215 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3216 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3217 if 'f4m' not in skip_protocols:
3218 formats.extend(self._extract_f4m_formats(
044eeb14 3219 manifest_url('manifest.f4m'),
6ad02195 3220 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3221 if 'dash' not in skip_protocols:
3222 formats.extend(self._extract_mpd_formats(
044eeb14 3223 manifest_url('manifest.mpd'),
0384932e 3224 video_id, mpd_id='dash', fatal=False))
6ad02195 3225 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3226 if 'smil' not in skip_protocols:
3227 rtmp_formats = self._extract_smil_formats(
044eeb14 3228 manifest_url('jwplayer.smil'),
6ad02195
RA
3229 video_id, fatal=False)
3230 for rtmp_format in rtmp_formats:
3231 rtsp_format = rtmp_format.copy()
3232 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3233 del rtsp_format['play_path']
3234 del rtsp_format['ext']
3235 rtsp_format.update({
3236 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3237 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3238 'protocol': 'rtsp',
3239 })
3240 formats.extend([rtmp_format, rtsp_format])
3241 else:
3242 for protocol in ('rtmp', 'rtsp'):
3243 if protocol not in skip_protocols:
3244 formats.append({
f2e2f0c7 3245 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
3246 'format_id': protocol,
3247 'protocol': protocol,
3248 })
3249 return formats
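# Sketch (hypothetical Wowza URL): any of the manifest flavours may be passed in, since
# the suffix is stripped above and re-added per protocol:
#
#   formats = self._extract_wowza_formats(
#       'https://example.com:1935/vod/mp4:sample.mp4/playlist.m3u8', video_id,
#       skip_protocols=['f4m', 'rtmp', 'rtsp'])
#   self._sort_formats(formats)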
3250
c73e330e 3251 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3252 mobj = re.search(
ac9c69ac 3253 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3254 webpage)
3255 if mobj:
c73e330e
RU
3256 try:
3257 jwplayer_data = self._parse_json(mobj.group('options'),
3258 video_id=video_id,
3259 transform_source=transform_source)
3260 except ExtractorError:
3261 pass
3262 else:
3263 if isinstance(jwplayer_data, dict):
3264 return jwplayer_data
a4a554a7
YCH
3265
3266 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3267 jwplayer_data = self._find_jwplayer_data(
3268 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3269 return self._parse_jwplayer_data(
3270 jwplayer_data, video_id, *args, **kwargs)
3271
3272 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3273 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3274 # JWPlayer backward compatibility: flattened playlists
3275 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3276 if 'playlist' not in jwplayer_data:
3277 jwplayer_data = {'playlist': [jwplayer_data]}
3278
3279 entries = []
3280
3281 # JWPlayer backward compatibility: single playlist item
3282 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3283 if not isinstance(jwplayer_data['playlist'], list):
3284 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
3285
3286 for video_data in jwplayer_data['playlist']:
3287 # JWPlayer backward compatibility: flattened sources
3288 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3289 if 'sources' not in video_data:
3290 video_data['sources'] = [video_data]
3291
3292 this_video_id = video_id or video_data['mediaid']
3293
1a2192cb
S
3294 formats = self._parse_jwplayer_formats(
3295 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3296 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
3297
3298 subtitles = {}
3299 tracks = video_data.get('tracks')
3300 if tracks and isinstance(tracks, list):
3301 for track in tracks:
96a2daa1
S
3302 if not isinstance(track, dict):
3303 continue
f4b74272
S
3304 track_kind = track.get('kind')
3305 if not track_kind or not isinstance(track_kind, compat_str):
3306 continue
3307 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
3308 continue
3309 track_url = urljoin(base_url, track.get('file'))
3310 if not track_url:
3311 continue
3312 subtitles.setdefault(track.get('label') or 'en', []).append({
3313 'url': self._proto_relative_url(track_url)
3314 })
3315
50d808f5 3316 entry = {
a4a554a7 3317 'id': this_video_id,
50d808f5 3318 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
f81dd65b 3319 'description': clean_html(video_data.get('description')),
6945b9e7 3320 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
a4a554a7
YCH
3321 'timestamp': int_or_none(video_data.get('pubdate')),
3322 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3323 'subtitles': subtitles,
50d808f5
RA
3324 }
3325 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3326 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3327 entry.update({
3328 '_type': 'url_transparent',
3329 'url': formats[0]['url'],
3330 })
3331 else:
3332 self._sort_formats(formats)
3333 entry['formats'] = formats
3334 entries.append(entry)
a4a554a7
YCH
3335 if len(entries) == 1:
3336 return entries[0]
3337 else:
3338 return self.playlist_result(entries)
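# Minimal sketch of the usual entry point (assumes the embedding page defines its config
# via jwplayer(...).setup({...}); `webpage`, `video_id` and `title` are hypothetical):
#
#   info = self._extract_jwplayer_data(webpage, video_id, require_title=False)
#   info['title'] = title  # fill in anything the JWPlayer config lacked
#   return info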
3339
ed0cf9b3
S
3340 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3341 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
bf1b87cd 3342 urls = []
ed0cf9b3 3343 formats = []
1a2192cb 3344 for source in jwplayer_sources_data:
0a268c6e
S
3345 if not isinstance(source, dict):
3346 continue
6945b9e7
RA
3347 source_url = urljoin(
3348 base_url, self._proto_relative_url(source.get('file')))
3349 if not source_url or source_url in urls:
bf1b87cd
RA
3350 continue
3351 urls.append(source_url)
ed0cf9b3
S
3352 source_type = source.get('type') or ''
3353 ext = mimetype2ext(source_type) or determine_ext(source_url)
3354 if source_type == 'hls' or ext == 'm3u8':
3355 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
3356 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3357 m3u8_id=m3u8_id, fatal=False))
0d9c48de 3358 elif source_type == 'dash' or ext == 'mpd':
ed0cf9b3
S
3359 formats.extend(self._extract_mpd_formats(
3360 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
3361 elif ext == 'smil':
3362 formats.extend(self._extract_smil_formats(
3363 source_url, video_id, fatal=False))
ed0cf9b3 3364 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
3365 elif source_type.startswith('audio') or ext in (
3366 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
3367 formats.append({
3368 'url': source_url,
3369 'vcodec': 'none',
3370 'ext': ext,
3371 })
3372 else:
3373 height = int_or_none(source.get('height'))
3374 if height is None:
3375 # Often no height is provided, but there is a label in a
0236cd0d 3376 # format like "1080p", "720p SD", or 1080.
ed0cf9b3 3377 height = int_or_none(self._search_regex(
0236cd0d 3378 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
ed0cf9b3
S
3379 'height', default=None))
3380 a_format = {
3381 'url': source_url,
3382 'width': int_or_none(source.get('width')),
3383 'height': height,
0236cd0d 3384 'tbr': int_or_none(source.get('bitrate')),
ed0cf9b3
S
3385 'ext': ext,
3386 }
3387 if source_url.startswith('rtmp'):
3388 a_format['ext'] = 'flv'
ed0cf9b3
S
3389 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3390 # of jwplayer.flash.swf
3391 rtmp_url_parts = re.split(
3392 r'((?:mp4|mp3|flv):)', source_url, 1)
3393 if len(rtmp_url_parts) == 3:
3394 rtmp_url, prefix, play_path = rtmp_url_parts
3395 a_format.update({
3396 'url': rtmp_url,
3397 'play_path': prefix + play_path,
3398 })
3399 if rtmp_params:
3400 a_format.update(rtmp_params)
3401 formats.append(a_format)
3402 return formats
3403
f4b1c7ad
PH
3404 def _live_title(self, name):
3405 """ Generate the title for a live video """
3406 now = datetime.datetime.now()
611c1dd9 3407 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3408 return name + ' ' + now_str
3409
b14f3a4c
PH
3410 def _int(self, v, name, fatal=False, **kwargs):
3411 res = int_or_none(v, **kwargs)
3412 if 'get_attr' in kwargs:
3413 print(getattr(v, kwargs['get_attr']))
3414 if res is None:
3415 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3416 if fatal:
3417 raise ExtractorError(msg)
3418 else:
6a39ee13 3419 self.report_warning(msg)
b14f3a4c
PH
3420 return res
3421
3422 def _float(self, v, name, fatal=False, **kwargs):
3423 res = float_or_none(v, **kwargs)
3424 if res is None:
3425 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3426 if fatal:
3427 raise ExtractorError(msg)
3428 else:
6a39ee13 3429 self.report_warning(msg)
b14f3a4c
PH
3430 return res
3431
40e41780
TF
3432 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3433 path='/', secure=False, discard=False, rest={}, **kwargs):
6c22cee6 3434 cookie = compat_cookiejar_Cookie(
4ed2d7b7 3435 0, name, value, port, port is not None, domain, True,
40e41780
TF
3436 domain.startswith('.'), path, True, secure, expire_time,
3437 discard, None, None, rest)
42939b61
JMF
3438 self._downloader.cookiejar.set_cookie(cookie)
3439
799207e8 3440 def _get_cookies(self, url):
f7ad7160 3441 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
5c2266df 3442 req = sanitized_Request(url)
799207e8 3443 self._downloader.cookiejar.add_cookie_header(req)
f7ad7160 3444 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
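# Example (hypothetical cookie and URL): extractors typically pair these helpers like so:
#
#   self._set_cookie('example.com', 'session', 'abc123')
#   session = self._get_cookies('https://example.com/watch').get('session')
#   token = session.value if session else None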
799207e8 3445
e3c1266f 3446 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3447 """
3448 Apply first Set-Cookie header instead of the last. Experimental.
3449
3450 Some sites (e.g. [1-3]) may serve two cookies under the same name
3451 in the Set-Cookie header and expect the first (old) one to be set rather
3452 than the second (new) one. However, per RFC 6265 the newer cookie
3453 should be set into the cookie store, which is what actually happens.
3454 We work around this issue by manually resetting the cookie to
3455 the first one.
3456 1. https://new.vk.com/
3457 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3458 3. https://learning.oreilly.com/
3459 """
e3c1266f
S
3460 for header, cookies in url_handle.headers.items():
3461 if header.lower() != 'set-cookie':
3462 continue
3463 if sys.version_info[0] >= 3:
3464 cookies = cookies.encode('iso-8859-1')
3465 cookies = cookies.decode('utf-8')
3466 cookie_value = re.search(
3467 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3468 if cookie_value:
3469 value, domain = cookie_value.groups()
3470 self._set_cookie(domain, cookie, value)
3471 break
3472
05900629
PH
3473 def get_testcases(self, include_onlymatching=False):
3474 t = getattr(self, '_TEST', None)
3475 if t:
3476 assert not hasattr(self, '_TESTS'), \
3477 '%s has _TEST and _TESTS' % type(self).__name__
3478 tests = [t]
3479 else:
3480 tests = getattr(self, '_TESTS', [])
3481 for t in tests:
3482 if not include_onlymatching and t.get('only_matching', False):
3483 continue
3484 t['name'] = type(self).__name__[:-len('IE')]
3485 yield t
3486
3487 def is_suitable(self, age_limit):
3488 """ Test whether the extractor is generally suitable for the given
3489 age limit (i.e. pornographic sites are not, all others usually are) """
3490
3491 any_restricted = False
3492 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3493 if tc.get('playlist', []):
05900629
PH
3494 tc = tc['playlist'][0]
3495 is_restricted = age_restricted(
3496 tc.get('info_dict', {}).get('age_limit'), age_limit)
3497 if not is_restricted:
3498 return True
3499 any_restricted = any_restricted or is_restricted
3500 return not any_restricted
3501
a504ced0 3502 def extract_subtitles(self, *args, **kwargs):
a06916d9 3503 if (self.get_param('writesubtitles', False)
3504 or self.get_param('listsubtitles')):
9868ea49
JMF
3505 return self._get_subtitles(*args, **kwargs)
3506 return {}
a504ced0
JMF
3507
3508 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3509 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3510
a2160aa4 3511 def extract_comments(self, *args, **kwargs):
3512 if not self.get_param('getcomments'):
3513 return None
3514 generator = self._get_comments(*args, **kwargs)
3515
3516 def extractor():
3517 comments = []
3518 try:
3519 while True:
3520 comments.append(next(generator))
3521 except KeyboardInterrupt:
3522 interrupted = True
3523 self.to_screen('Interrupted by user')
3524 except StopIteration:
3525 interrupted = False
3526 comment_count = len(comments)
3527 self.to_screen(f'Extracted {comment_count} comments')
3528 return {
3529 'comments': comments,
3530 'comment_count': None if interrupted else comment_count
3531 }
3532 return extractor
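# A subclass would normally implement _get_comments as a generator; a rough sketch with
# a hypothetical JSON API could look like:
#
#   def _get_comments(self, video_id):
#       for page in itertools.count(1):
#           data = self._download_json(
#               'https://example.com/api/comments/%s?page=%d' % (video_id, page), video_id)
#           for comment in data.get('comments') or []:
#               yield {
#                   'id': comment.get('id'),
#                   'text': comment.get('text'),
#                   'author': comment.get('author'),
#               }
#           if not data.get('has_more'):
#               break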
3533
3534 def _get_comments(self, *args, **kwargs):
3535 raise NotImplementedError('This method must be implemented by subclasses')
3536
912e0b7e
YCH
3537 @staticmethod
3538 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3539 """ Merge subtitle items for one language. Items with duplicated URLs
3540 will be dropped. """
3541 list1_urls = set([item['url'] for item in subtitle_list1])
3542 ret = list(subtitle_list1)
3543 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3544 return ret
3545
3546 @classmethod
46890374 3547 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3548 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3549 if target is None:
3550 target = {}
3551 for d in dicts:
3552 for lang, subs in d.items():
3553 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3554 return target
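# e.g. merging {'en': [{'url': 'a.vtt'}]} with {'en': [{'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}
# gives {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}; items whose URL
# already exists for a language are dropped by _merge_subtitle_items above.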
912e0b7e 3555
360e1ca5 3556 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3557 if (self.get_param('writeautomaticsub', False)
3558 or self.get_param('listsubtitles')):
9868ea49
JMF
3559 return self._get_automatic_captions(*args, **kwargs)
3560 return {}
360e1ca5
JMF
3561
3562 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3563 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3564
d77ab8e2 3565 def mark_watched(self, *args, **kwargs):
1813a6cc 3566 if not self.get_param('mark_watched', False):
3567 return
3568 if (self._get_login_info()[0] is not None
3569 or self.get_param('cookiefile')
3570 or self.get_param('cookiesfrombrowser')):
d77ab8e2
S
3571 self._mark_watched(*args, **kwargs)
3572
3573 def _mark_watched(self, *args, **kwargs):
3574 raise NotImplementedError('This method must be implemented by subclasses')
3575
38cce791
YCH
3576 def geo_verification_headers(self):
3577 headers = {}
a06916d9 3578 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3579 if geo_verification_proxy:
3580 headers['Ytdl-request-proxy'] = geo_verification_proxy
3581 return headers
3582
98763ee3
YCH
3583 def _generic_id(self, url):
3584 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3585
3586 def _generic_title(self, url):
3587 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3588
c224251a 3589 @staticmethod
b0089e89 3590 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3591 all_known = all(map(
3592 lambda x: x is not None,
3593 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3594 return (
3595 'private' if is_private
3596 else 'premium_only' if needs_premium
3597 else 'subscriber_only' if needs_subscription
3598 else 'needs_auth' if needs_auth
3599 else 'unlisted' if is_unlisted
3600 else 'public' if all_known
3601 else None)
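# For example (hypothetical flags), an extractor that can determine all five inputs may set
#
#   'availability': self._availability(
#       is_private=False, needs_premium=is_premium, needs_subscription=False,
#       needs_auth=False, is_unlisted=not is_public),
#
# while leaving any of them as None prevents the 'public' fallback and may yield None.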
3602
4bb6b02f 3603 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3604 '''
3605 @returns A list of values for the extractor argument given by "key"
3606 or "default" if no such key is present
3607 @param default The default value to return when the key is not present (default: [])
3608 @param casesense When false, the values are converted to lower case
3609 '''
3610 val = traverse_obj(
5d3a0e79 3611 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
4bb6b02f 3612 if val is None:
3613 return [] if default is NO_DEFAULT else default
3614 return list(val) if casesense else [x.lower() for x in val]
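# For instance, with `--extractor-args "myext:lang=de,en"` (hypothetical extractor and key),
# self._configuration_arg('lang') inside MyExtIE returns ['de', 'en'], whereas
# self._configuration_arg('missing', ['fallback']) returns ['fallback'].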
5d3a0e79 3615
8dbe9899 3616
d6983cb4
PH
3617class SearchInfoExtractor(InfoExtractor):
3618 """
3619 Base class for paged search queries extractors.
10952eb2 3620 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
d6983cb4
PH
3621 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3622 """
3623
3624 @classmethod
3625 def _make_valid_url(cls):
3626 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3627
3628 @classmethod
3629 def suitable(cls, url):
3630 return re.match(cls._make_valid_url(), url) is not None
3631
3632 def _real_extract(self, query):
3633 mobj = re.match(self._make_valid_url(), query)
3634 if mobj is None:
f1a9d64e 3635 raise ExtractorError('Invalid search query "%s"' % query)
d6983cb4
PH
3636
3637 prefix = mobj.group('prefix')
3638 query = mobj.group('query')
3639 if prefix == '':
3640 return self._get_n_results(query, 1)
3641 elif prefix == 'all':
3642 return self._get_n_results(query, self._MAX_RESULTS)
3643 else:
3644 n = int(prefix)
3645 if n <= 0:
f1a9d64e 3646 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 3647 elif n > self._MAX_RESULTS:
6a39ee13 3648 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
3649 n = self._MAX_RESULTS
3650 return self._get_n_results(query, n)
3651
3652 def _get_n_results(self, query, n):
cc16383f 3653 """Get a specified number of results for a query.
3654 Either this function or _search_results must be overridden by subclasses """
3655 return self.playlist_result(
3656 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3657 query, query)
3658
3659 def _search_results(self, query):
3660 """Returns an iterator of search results"""
611c1dd9 3661 raise NotImplementedError('This method must be implemented by subclasses')
0f818663
PH
3662
3663 @property
3664 def SEARCH_KEY(self):
3665 return self._SEARCH_KEY
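# A minimal (hypothetical) subclass sketch:
#
#   class ExampleSearchIE(SearchInfoExtractor):
#       _SEARCH_KEY = 'examplesearch'
#       _MAX_RESULTS = float('inf')
#
#       def _search_results(self, query):
#           for page in itertools.count(1):
#               results = self._download_json(
#                   'https://example.com/api/search', query,
#                   query={'q': query, 'page': page})
#               for video in results.get('items') or []:
#                   yield self.url_result(video['url'])
#               if not results.get('has_next_page'):
#                   break
#
# so that e.g. "examplesearch5:kittens" downloads the first five results and
# "examplesearchall:kittens" downloads up to _MAX_RESULTS.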