]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[youtube] Improve signature function detection (#641)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
fd475508 1# coding: utf-8
6a3828fd 2from __future__ import unicode_literals
f1a9d64e 3
d6983cb4 4import base64
f4b1c7ad 5import datetime
3ec05685 6import hashlib
3d3538e4 7import json
4094b6e3 8import netrc
d6983cb4 9import os
773f291d 10import random
d6983cb4 11import re
d6983cb4 12import sys
4094b6e3 13import time
1bac3455 14import math
d6983cb4 15
8c25f81b 16from ..compat import (
6c22cee6 17 compat_cookiejar_Cookie,
f7ad7160 18 compat_cookies_SimpleCookie,
ee0ba927 19 compat_etree_Element,
e9c0cdd3 20 compat_etree_fromstring,
e64b7569 21 compat_getpass,
d6983cb4 22 compat_http_client,
e9c0cdd3
YCH
23 compat_os_name,
24 compat_str,
d6983cb4 25 compat_urllib_error,
98763ee3 26 compat_urllib_parse_unquote,
15707c7e 27 compat_urllib_parse_urlencode,
41d06b04 28 compat_urllib_request,
f0b5d6af 29 compat_urlparse,
e01c3d2e 30 compat_xml_parse_error,
8c25f81b 31)
eb8a4433 32from ..downloader import FileDownloader
48107c19
S
33from ..downloader.f4m import (
34 get_base_url,
35 remove_encrypted_media,
36)
8c25f81b 37from ..utils import (
05900629 38 age_restricted,
02dc0a36 39 base_url,
08f2a92c 40 bug_reports_message,
d6983cb4
PH
41 clean_html,
42 compiled_regex_type,
70f0f5a8 43 determine_ext,
46b18f23 44 determine_protocol,
d493f15c 45 dict_get,
9b9c5355 46 error_to_compat_str,
46b18f23 47 extract_attributes,
b868936c 48 ExtractorError,
97f4aecf 49 fix_xml_ampersands,
b14f3a4c 50 float_or_none,
b868936c 51 format_field,
773f291d
S
52 GeoRestrictedError,
53 GeoUtils,
31bb8d3f 54 int_or_none,
a4a554a7 55 js_to_json,
0685d972 56 JSON_LD_RE,
46b18f23 57 mimetype2ext,
3158150c 58 network_exceptions,
b868936c 59 NO_DEFAULT,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
46b18f23
JH
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
55b3e45b 67 RegexNotFoundError,
46b18f23 68 sanitize_filename,
b868936c 69 sanitized_Request,
d493f15c 70 str_or_none,
ce5b9040 71 str_to_int,
f856816b 72 strip_or_none,
5d3a0e79 73 traverse_obj,
f38de77f 74 unescapeHTML,
647eab45 75 unified_strdate,
6b3a3098 76 unified_timestamp,
46b18f23
JH
77 update_Request,
78 update_url_query,
a107193e 79 url_basename,
bebef109 80 url_or_none,
b868936c 81 urljoin,
6606817a 82 variadic,
a6571f10 83 xpath_element,
8d6765cf
S
84 xpath_text,
85 xpath_with_ns,
d6983cb4 86)
c342041f 87
d6983cb4
PH
88
89class InfoExtractor(object):
90 """Information Extractor class.
91
92 Information extractors are the classes that, given a URL, extract
93 information about the video (or videos) the URL refers to. This
94 information includes the real video URL, the video title, author and
95 others. The information is stored in a dictionary which is then
5d380852 96 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
97 information possibly downloading the video to the file system, among
98 other possible outcomes.
99
cf0649f8 100 The type field determines the type of the result.
fed5d032
PH
101 By far the most common value (and the default if _type is missing) is
102 "video", which indicates a single video.
103
104 For a video, the dictionaries must include the following fields:
d6983cb4
PH
105
106 id: Video identifier.
d6983cb4 107 title: Video title, unescaped.
d67b0b15 108
f49d89ee 109 Additionally, it must contain either a formats entry or a url one:
d67b0b15 110
f49d89ee
PH
111 formats: A list of dictionaries for each format available, ordered
112 from worst to best quality.
113
114 Potential fields:
c790e93a
S
115 * url The mandatory URL representing the media:
116 for plain file media - HTTP URL of this file,
117 for RTMP - RTMP URL,
118 for HLS - URL of the M3U8 media playlist,
119 for HDS - URL of the F4M manifest,
79d2077e
S
120 for DASH
121 - HTTP URL to plain file media (in case of
122 unfragmented media)
123 - URL of the MPD manifest or base URL
124 representing the media if MPD manifest
8ed7a233 125 is parsed from a string (in case of
79d2077e 126 fragmented media)
c790e93a 127 for MSS - URL of the ISM manifest.
86f4d14f
S
128 * manifest_url
129 The URL of the manifest file in case of
c790e93a
S
130 fragmented media:
131 for HLS - URL of the M3U8 master playlist,
132 for HDS - URL of the F4M manifest,
133 for DASH - URL of the MPD manifest,
134 for MSS - URL of the ISM manifest.
10952eb2 135 * ext Will be calculated from URL if missing
d67b0b15
PH
136 * format A human-readable description of the format
137 ("mp4 container with h264/opus").
138 Calculated from the format_id, width, height.
139 and format_note fields if missing.
140 * format_id A short description of the format
5d4f3985
PH
141 ("mp4_h264_opus" or "19").
142 Technically optional, but strongly recommended.
d67b0b15
PH
143 * format_note Additional info about the format
144 ("3D" or "DASH video")
145 * width Width of the video, if known
146 * height Height of the video, if known
f49d89ee 147 * resolution Textual description of width and height
7217e148 148 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
149 * abr Average audio bitrate in KBit/s
150 * acodec Name of the audio codec in use
dd27fd17 151 * asr Audio sampling rate in Hertz
d67b0b15 152 * vbr Average video bitrate in KBit/s
fbb21cf5 153 * fps Frame rate
d67b0b15 154 * vcodec Name of the video codec in use
1394ce65 155 * container Name of the container format
d67b0b15 156 * filesize The number of bytes, if known in advance
9732d77e 157 * filesize_approx An estimate for the number of bytes
d67b0b15 158 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
159 * protocol The protocol that will be used for the actual
160 download, lower-case.
0fa9a1e2 161 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
af7d5a63 162 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
163 * fragment_base_url
164 Base URL for fragments. Each fragment's path
165 value (if present) will be relative to
166 this URL.
167 * fragments A list of fragments of a fragmented media.
168 Each fragment entry must contain either an url
169 or a path. If an url is present it should be
170 considered by a client. Otherwise both path and
171 fragment_base_url must be present. Here is
172 the list of all potential fields:
173 * "url" - fragment's URL
174 * "path" - fragment's path relative to
175 fragment_base_url
a0d5077c
S
176 * "duration" (optional, int or float)
177 * "filesize" (optional, int)
f49d89ee 178 * preference Order number of this format. If this field is
08d13955 179 present and not None, the formats get sorted
38d63d84 180 by this field, regardless of all other values.
f49d89ee
PH
181 -1 for default (order by other properties),
182 -2 or smaller for less than default.
e65566a9
PH
183 < -1000 to hide the format (if there is
184 another one which is strictly better)
32f90364
PH
185 * language Language code, e.g. "de" or "en-US".
186 * language_preference Is this in the language mentioned in
187 the URL?
aff2f4f4
PH
188 10 if it's what the URL is about,
189 -1 for default (don't know),
190 -10 otherwise, other values reserved for now.
5d73273f
PH
191 * quality Order number of the video quality of this
192 format, irrespective of the file format.
193 -1 for default (order by other properties),
194 -2 or smaller for less than default.
c64ed2a3
PH
195 * source_preference Order number for this video source
196 (quality takes higher priority)
197 -1 for default (order by other properties),
198 -2 or smaller for less than default.
d769be6c
PH
199 * http_headers A dictionary of additional HTTP headers
200 to add to the request.
6271f1ca 201 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
202 video's pixels are not square.
203 width : height ratio as float.
204 * no_resume The server does not support resuming the
205 (HTTP or RTMP) download. Boolean.
00c97e3e
S
206 * downloader_options A dictionary of downloader options as
207 described in FileDownloader
3b1fe47d 208 RTMP formats can also have the additional fields: page_url,
209 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
210 rtmp_protocol, rtmp_real_time
3dee7826 211
c0ba0f48 212 url: Final video URL.
d6983cb4 213 ext: Video filename extension.
d67b0b15
PH
214 format: The video format, defaults to ext (used for --get-format)
215 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 216
d6983cb4
PH
217 The following fields are optional:
218
f5e43bc6 219 alt_title: A secondary title of the video.
0afef30b
PH
220 display_id An alternative identifier for the video, not necessarily
221 unique, but available before title. Typically, id is
222 something like "4234987", title "Dancing naked mole rats",
223 and display_id "dancing-naked-mole-rats"
d5519808 224 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 225 * "id" (optional, string) - Thumbnail format ID
d5519808 226 * "url"
cfb56d1a 227 * "preference" (optional, int) - quality of the image
d5519808
PH
228 * "width" (optional, int)
229 * "height" (optional, int)
5e1c39ac 230 * "resolution" (optional, string "{width}x{height}",
d5519808 231 deprecated)
2de624fd 232 * "filesize" (optional, int)
0ba692ac 233 * "_test_url" (optional, bool) - If true, test the URL
d6983cb4 234 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 235 description: Full video description.
d6983cb4 236 uploader: Full name of the video uploader.
2bc0c46f 237 license: License name the video is licensed under.
8a92e51c 238 creator: The creator of the video.
10db0d2f 239 release_timestamp: UNIX timestamp of the moment the video was released.
8aab976b 240 release_date: The date (YYYYMMDD) when the video was released.
10db0d2f 241 timestamp: UNIX timestamp of the moment the video was uploaded
d6983cb4 242 upload_date: Video upload date (YYYYMMDD).
955c4514 243 If not explicitly set, calculated from timestamp.
d6983cb4 244 uploader_id: Nickname or id of the video uploader.
7bcd2830 245 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 246 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 247 Note that channel fields may or may not repeat uploader
6f1f59f3
S
248 fields. This depends on a particular extractor.
249 channel_id: Id of the channel.
250 channel_url: Full URL to a channel webpage.
da9ec3b9 251 location: Physical location where the video was filmed.
a504ced0 252 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
253 {tag: subformats}. "tag" is usually a language code, and
254 "subformats" is a list sorted from lower to higher
255 preference, each element is a dictionary with the "ext"
256 entry and one of:
a504ced0 257 * "data": The subtitles file contents
10952eb2 258 * "url": A URL pointing to the subtitles file
2412044c 259 It can optionally also have:
260 * "name": Name or description of the subtitles
4bba3716 261 "ext" will be calculated from URL if missing
e167860c 262 automatic_captions: Like 'subtitles'; contains automatically generated
263 captions instead of normal subtitles
62d231c0 264 duration: Length of the video in seconds, as an integer or float.
f3d29461 265 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
266 like_count: Number of positive ratings of the video
267 dislike_count: Number of negative ratings of the video
02835c6b 268 repost_count: Number of reposts of the video
2d30521a 269 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 270 comment_count: Number of comments on the video
dd622d7c
PH
271 comments: A list of comments, each with one or more of the following
272 properties (all but one of text or html optional):
273 * "author" - human-readable name of the comment author
274 * "author_id" - user ID of the comment author
a1c5d2ca 275 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
276 * "id" - Comment ID
277 * "html" - Comment as HTML
278 * "text" - Plain text of the comment
279 * "timestamp" - UNIX timestamp of comment
280 * "parent" - ID of the comment this one is replying to.
281 Set to "root" to indicate that this is a
282 comment to the original video.
a1c5d2ca
M
283 * "like_count" - Number of positive ratings of the comment
284 * "dislike_count" - Number of negative ratings of the comment
285 * "is_favorited" - Whether the comment is marked as
286 favorite by the video uploader
287 * "author_is_uploader" - Whether the comment is made by
288 the video uploader
8dbe9899 289 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 290 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
291 should allow to get the same result again. (It will be set
292 by YoutubeDL if it's missing)
ad3bc6ac
PH
293 categories: A list of categories that the video falls in, for example
294 ["Sports", "Berlin"]
864f24bd 295 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
d0fb4bd1 296 cast: A list of the video cast
7267bd53
PH
297 is_live: True, False, or None (=unknown). Whether this video is a
298 live stream that goes on instead of a fixed-length video.
f76ede8e 299 was_live: True, False, or None (=unknown). Whether this video was
300 originally a live stream.
3dbb2a9d 301 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
ae30b840 302 If absent, automatically set from is_live, was_live
7c80519c 303 start_time: Time in seconds where the reproduction should start, as
10952eb2 304 specified in the URL.
297a564b 305 end_time: Time in seconds where the reproduction should end, as
10952eb2 306 specified in the URL.
55949fed 307 chapters: A list of dictionaries, with the following entries:
308 * "start_time" - The start time of the chapter in seconds
309 * "end_time" - The end time of the chapter in seconds
310 * "title" (optional, string)
6cfda058 311 playable_in_embed: Whether this video is allowed to play in embedded
312 players on other sites. Can be True (=always allowed),
313 False (=never allowed), None (=unknown), or a string
c224251a
M
314 specifying the criteria for embedability (Eg: 'whitelist')
315 availability: Under what condition the video is available. One of
316 'private', 'premium_only', 'subscriber_only', 'needs_auth',
317 'unlisted' or 'public'. Use 'InfoExtractor._availability'
318 to set it
277d6ff5 319 __post_extractor: A function to be called just before the metadata is
320 written to either disk, logger or console. The function
321 must return a dict which will be added to the info_dict.
322 This is useful for additional information that is
323 time-consuming to extract. Note that the fields thus
324 extracted will not be available to output template and
325 match_filter. So, only "comments" and "comment_count" are
326 currently allowed to be extracted via this method.
d6983cb4 327
7109903e
S
328 The following fields should only be used when the video belongs to some logical
329 chapter or section:
330
331 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
332 chapter_number: Number of the chapter the video belongs to, as an integer.
333 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
334
335 The following fields should only be used when the video is an episode of some
8d76bdf1 336 series, programme or podcast:
7109903e
S
337
338 series: Title of the series or programme the video episode belongs to.
339 season: Title of the season the video episode belongs to.
27bfd4e5
S
340 season_number: Number of the season the video episode belongs to, as an integer.
341 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
342 episode: Title of the video episode. Unlike mandatory video title field,
343 this field should denote the exact title of the video episode
344 without any kind of decoration.
27bfd4e5
S
345 episode_number: Number of the video episode within a season, as an integer.
346 episode_id: Id of the video episode, as a unicode string.
7109903e 347
7a93ab5f
S
348 The following fields should only be used when the media is a track or a part of
349 a music album:
350
351 track: Title of the track.
352 track_number: Number of the track within an album or a disc, as an integer.
353 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
354 as a unicode string.
355 artist: Artist(s) of the track.
356 genre: Genre(s) of the track.
357 album: Title of the album the track belongs to.
358 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
359 album_artist: List of all artists appeared on the album (e.g.
360 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
361 and compilations).
362 disc_number: Number of the disc or other physical medium the track belongs to,
363 as an integer.
364 release_year: Year (YYYY) when the album was released.
365
deefc05b 366 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 367
d838b1bd
PH
368 Unless mentioned otherwise, None is equivalent to absence of information.
369
fed5d032
PH
370
371 _type "playlist" indicates multiple videos.
b82f815f
PH
372 There must be a key "entries", which is a list, an iterable, or a PagedList
373 object, each element of which is a valid dictionary by this specification.
fed5d032 374
b60419c5 375 Additionally, playlists can have "id", "title", and any other relevant
376 attributes with the same semantics as videos (see above).
fed5d032
PH
377
378
379 _type "multi_video" indicates that there are multiple videos that
380 form a single show, for example multiple acts of an opera or TV episode.
381 It must have an entries key like a playlist and contain all the keys
382 required for a video at the same time.
383
384
385 _type "url" indicates that the video must be extracted from another
386 location, possibly by a different extractor. Its only required key is:
387 "url" - the next URL to extract.
f58766ce
PH
388 The key "ie_key" can be set to the class name (minus the trailing "IE",
389 e.g. "Youtube") if the extractor class is known in advance.
390 Additionally, the dictionary may have any properties of the resolved entity
391 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
392 known ahead of time.
393
394
395 _type "url_transparent" entities have the same specification as "url", but
396 indicate that the given additional information is more precise than the one
397 associated with the resolved URL.
398 This is useful when a site employs a video service that hosts the video and
399 its technical metadata, but that video service does not embed a useful
400 title, description etc.
401
402
d6983cb4
PH
403 Subclasses of this one should re-define the _real_initialize() and
404 _real_extract() methods and define a _VALID_URL regexp.
405 Probably, they should also be added to the list of extractors.
406
4248dad9 407 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
408 geo restriction bypass mechanisms for a particular extractor.
409 Though it won't disable explicit geo restriction bypass based on
504f20dd 410 country code provided with geo_bypass_country.
4248dad9
S
411
412 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
413 countries for this extractor. One of these countries will be used by
414 geo restriction bypass mechanism right away in order to bypass
504f20dd 415 geo restriction, of course, if the mechanism is not disabled.
773f291d 416
5f95927a
S
417 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
418 IP blocks in CIDR notation for this extractor. One of these IP blocks
419 will be used by geo restriction bypass mechanism similarly
504f20dd 420 to _GEO_COUNTRIES.
3ccdde8c 421
d6983cb4
PH
422 Finally, the _WORKING attribute should be set to False for broken IEs
423 in order to warn the users and skip the tests.
424 """
425
426 _ready = False
427 _downloader = None
773f291d 428 _x_forwarded_for_ip = None
4248dad9
S
429 _GEO_BYPASS = True
430 _GEO_COUNTRIES = None
5f95927a 431 _GEO_IP_BLOCKS = None
d6983cb4
PH
432 _WORKING = True
433
9d5d4d64 434 _LOGIN_HINTS = {
435 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
436 'cookies': (
437 'Use --cookies for the authentication. '
438 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
439 'password': 'Use --username and --password or --netrc to provide account credentials',
440 }
441
d6983cb4
PH
442 def __init__(self, downloader=None):
443 """Constructor. Receives an optional downloader."""
444 self._ready = False
773f291d 445 self._x_forwarded_for_ip = None
28f436ba 446 self._printed_messages = set()
d6983cb4
PH
447 self.set_downloader(downloader)
448
449 @classmethod
450 def suitable(cls, url):
451 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
452
453 # This does not use has/getattr intentionally - we want to know whether
454 # we have cached the regexp for *this* class, whereas getattr would also
455 # match the superclass
456 if '_VALID_URL_RE' not in cls.__dict__:
457 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
458 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 459
ed9266db
PH
460 @classmethod
461 def _match_id(cls, url):
462 if '_VALID_URL_RE' not in cls.__dict__:
463 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
464 m = cls._VALID_URL_RE.match(url)
465 assert m
1afd0b0d 466 return compat_str(m.group('id'))
ed9266db 467
d6983cb4
PH
468 @classmethod
469 def working(cls):
470 """Getter method for _WORKING."""
471 return cls._WORKING
472
473 def initialize(self):
474 """Initializes an instance (authentication, etc)."""
28f436ba 475 self._printed_messages = set()
5f95927a
S
476 self._initialize_geo_bypass({
477 'countries': self._GEO_COUNTRIES,
478 'ip_blocks': self._GEO_IP_BLOCKS,
479 })
4248dad9
S
480 if not self._ready:
481 self._real_initialize()
482 self._ready = True
483
5f95927a 484 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
485 """
486 Initialize geo restriction bypass mechanism.
487
488 This method is used to initialize geo bypass mechanism based on faking
489 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 490 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
491 IP will be passed as X-Forwarded-For HTTP header in all subsequent
492 HTTP requests.
e39b5d4a
S
493
494 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
495 during the instance initialization with _GEO_COUNTRIES and
496 _GEO_IP_BLOCKS.
e39b5d4a 497
5f95927a 498 You may also manually call it from extractor's code if geo bypass
e39b5d4a 499 information is not available beforehand (e.g. obtained during
5f95927a
S
500 extraction) or due to some other reason. In this case you should pass
501 this information in geo bypass context passed as first argument. It may
502 contain following fields:
503
504 countries: List of geo unrestricted countries (similar
505 to _GEO_COUNTRIES)
506 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
507 (similar to _GEO_IP_BLOCKS)
508
e39b5d4a 509 """
773f291d 510 if not self._x_forwarded_for_ip:
5f95927a
S
511
512 # Geo bypass mechanism is explicitly disabled by user
a06916d9 513 if not self.get_param('geo_bypass', True):
5f95927a
S
514 return
515
516 if not geo_bypass_context:
517 geo_bypass_context = {}
518
519 # Backward compatibility: previously _initialize_geo_bypass
520 # expected a list of countries, some 3rd party code may still use
521 # it this way
522 if isinstance(geo_bypass_context, (list, tuple)):
523 geo_bypass_context = {
524 'countries': geo_bypass_context,
525 }
526
527 # The whole point of geo bypass mechanism is to fake IP
528 # as X-Forwarded-For HTTP header based on some IP block or
529 # country code.
530
531 # Path 1: bypassing based on IP block in CIDR notation
532
533 # Explicit IP block specified by user, use it right away
534 # regardless of whether extractor is geo bypassable or not
a06916d9 535 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
536
537 # Otherwise use random IP block from geo bypass context but only
538 # if extractor is known as geo bypassable
539 if not ip_block:
540 ip_blocks = geo_bypass_context.get('ip_blocks')
541 if self._GEO_BYPASS and ip_blocks:
542 ip_block = random.choice(ip_blocks)
543
544 if ip_block:
545 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
0760b0a7 546 self._downloader.write_debug(
547 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
5f95927a
S
548 return
549
550 # Path 2: bypassing based on country code
551
552 # Explicit country code specified by user, use it right away
553 # regardless of whether extractor is geo bypassable or not
a06916d9 554 country = self.get_param('geo_bypass_country', None)
5f95927a
S
555
556 # Otherwise use random country code from geo bypass context but
557 # only if extractor is known as geo bypassable
558 if not country:
559 countries = geo_bypass_context.get('countries')
560 if self._GEO_BYPASS and countries:
561 country = random.choice(countries)
562
563 if country:
564 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 565 self._downloader.write_debug(
566 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
d6983cb4
PH
567
568 def extract(self, url):
569 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 570 try:
773f291d
S
571 for _ in range(2):
572 try:
573 self.initialize()
a06916d9 574 self.write_debug('Extracting URL: %s' % url)
0016b84e 575 ie_result = self._real_extract(url)
07cce701 576 if ie_result is None:
577 return None
0016b84e
S
578 if self._x_forwarded_for_ip:
579 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
53ed7066 580 subtitles = ie_result.get('subtitles')
581 if (subtitles and 'live_chat' in subtitles
a06916d9 582 and 'no-live-chat' in self.get_param('compat_opts', [])):
53ed7066 583 del subtitles['live_chat']
0016b84e 584 return ie_result
773f291d 585 except GeoRestrictedError as e:
4248dad9
S
586 if self.__maybe_fake_ip_and_retry(e.countries):
587 continue
773f291d 588 raise
3a5bcd03
PH
589 except ExtractorError:
590 raise
591 except compat_http_client.IncompleteRead as e:
dfb1b146 592 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 593 except (KeyError, StopIteration) as e:
dfb1b146 594 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 595
4248dad9 596 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 597 if (not self.get_param('geo_bypass_country', None)
3089bc74 598 and self._GEO_BYPASS
a06916d9 599 and self.get_param('geo_bypass', True)
3089bc74
S
600 and not self._x_forwarded_for_ip
601 and countries):
eea0716c
S
602 country_code = random.choice(countries)
603 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
604 if self._x_forwarded_for_ip:
605 self.report_warning(
eea0716c
S
606 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
607 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
608 return True
609 return False
610
d6983cb4
PH
611 def set_downloader(self, downloader):
612 """Sets the downloader for this IE."""
613 self._downloader = downloader
614
615 def _real_initialize(self):
616 """Real initialization process. Redefine in subclasses."""
617 pass
618
619 def _real_extract(self, url):
620 """Real extraction process. Redefine in subclasses."""
621 pass
622
56c73665
JMF
623 @classmethod
624 def ie_key(cls):
625 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 626 return compat_str(cls.__name__[:-2])
56c73665 627
d6983cb4
PH
628 @property
629 def IE_NAME(self):
dc519b54 630 return compat_str(type(self).__name__[:-2])
d6983cb4 631
d391b7e2
S
632 @staticmethod
633 def __can_accept_status_code(err, expected_status):
634 assert isinstance(err, compat_urllib_error.HTTPError)
635 if expected_status is None:
636 return False
d391b7e2
S
637 elif callable(expected_status):
638 return expected_status(err.code) is True
639 else:
6606817a 640 return err.code in variadic(expected_status)
d391b7e2
S
641
642 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
643 """
644 Return the response handle.
645
646 See _download_webpage docstring for arguments specification.
647 """
1cf376f5 648 if not self._downloader._first_webpage_request:
a06916d9 649 sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
1cf376f5 650 if sleep_interval > 0:
5ef7d9bd 651 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 652 time.sleep(sleep_interval)
653 else:
654 self._downloader._first_webpage_request = False
655
d6983cb4
PH
656 if note is None:
657 self.report_download_webpage(video_id)
658 elif note is not False:
7cc3570e 659 if video_id is None:
f1a9d64e 660 self.to_screen('%s' % (note,))
7cc3570e 661 else:
f1a9d64e 662 self.to_screen('%s: %s' % (video_id, note))
2132edaa
S
663
664 # Some sites check X-Forwarded-For HTTP header in order to figure out
665 # the origin of the client behind proxy. This allows bypassing geo
666 # restriction by faking this header's value to IP that belongs to some
667 # geo unrestricted country. We will do so once we encounter any
668 # geo restriction error.
669 if self._x_forwarded_for_ip:
670 if 'X-Forwarded-For' not in headers:
671 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
672
41d06b04
S
673 if isinstance(url_or_request, compat_urllib_request.Request):
674 url_or_request = update_Request(
675 url_or_request, data=data, headers=headers, query=query)
676 else:
cdfee168 677 if query:
678 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 679 if data is not None or headers:
41d06b04 680 url_or_request = sanitized_Request(url_or_request, data, headers)
d6983cb4 681 try:
dca08720 682 return self._downloader.urlopen(url_or_request)
3158150c 683 except network_exceptions as err:
d391b7e2
S
684 if isinstance(err, compat_urllib_error.HTTPError):
685 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
686 # Retain reference to error to prevent file object from
687 # being closed before it can be read. Works around the
688 # effects of <https://bugs.python.org/issue15002>
689 # introduced in Python 3.4.1.
690 err.fp._error = err
d391b7e2
S
691 return err.fp
692
aa94a6d3
PH
693 if errnote is False:
694 return False
d6983cb4 695 if errnote is None:
f1a9d64e 696 errnote = 'Unable to download webpage'
7f8b2714 697
9b9c5355 698 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
699 if fatal:
700 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
701 else:
6a39ee13 702 self.report_warning(errmsg)
7cc3570e 703 return False
d6983cb4 704
d391b7e2
S
705 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
706 """
707 Return a tuple (page content as string, URL handle).
708
709 See _download_webpage docstring for arguments specification.
710 """
b9d3e163
PH
711 # Strip hashes from the URL (#1038)
712 if isinstance(url_or_request, (compat_str, str)):
713 url_or_request = url_or_request.partition('#')[0]
714
d391b7e2 715 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
716 if urlh is False:
717 assert not fatal
718 return False
c9a77969 719 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
720 return (content, urlh)
721
c9a77969
YCH
722 @staticmethod
723 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
724 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
725 if m:
726 encoding = m.group(1)
727 else:
0d75ae2c 728 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
729 webpage_bytes[:1024])
730 if m:
731 encoding = m.group(1).decode('ascii')
b60016e8
PH
732 elif webpage_bytes.startswith(b'\xff\xfe'):
733 encoding = 'utf-16'
f143d86a
PH
734 else:
735 encoding = 'utf-8'
c9a77969
YCH
736
737 return encoding
738
4457823d
S
    def __check_blocked(self, content):
        """
        Raise ExtractorError when `content` is a censorship / filtering
        interstitial (Websense, Indian DoT, or Russian RKN block page)
        rather than the requested page.  Returns None otherwise.
        """
        first_block = content[:512]
        # Websense corporate filtering appliance
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian RKN (blocklist.rkn.gov.ru) block page
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
766
c9a77969
YCH
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read and decode the body of an open URL handle.

        prefix   -- optional bytes prepended before decoding
        encoding -- decoding override; guessed from headers/content if None
        Honors the `dump_intermediate_pages` and `write_pages` options and
        raises via __check_blocked on known censorship interstitials.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps arbitrary bytes printable on any terminal
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Shorten over-long names deterministically via an md5 suffix
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name from the page/header -- fall back to utf-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
d6983cb4 803
d391b7e2
S
804 def _download_webpage(
805 self, url_or_request, video_id, note=None, errnote=None,
806 fatal=True, tries=1, timeout=5, encoding=None, data=None,
807 headers={}, query={}, expected_status=None):
808 """
809 Return the data of the page as a string.
810
811 Arguments:
812 url_or_request -- plain text URL as a string or
813 a compat_urllib_request.Requestobject
814 video_id -- Video/playlist/item identifier (string)
815
816 Keyword arguments:
817 note -- note printed before downloading (string)
818 errnote -- note printed in case of an error (string)
819 fatal -- flag denoting whether error should be considered fatal,
820 i.e. whether it should cause ExtractionError to be raised,
821 otherwise a warning will be reported and extraction continued
822 tries -- number of tries
823 timeout -- sleep interval between tries
824 encoding -- encoding for a page content decoding, guessed automatically
825 when not explicitly specified
826 data -- POST data (bytes)
827 headers -- HTTP headers (dict)
828 query -- URL query (dict)
829 expected_status -- allows to accept failed HTTP requests (non 2xx
830 status code) by explicitly specifying a set of accepted status
831 codes. Can be any of the following entities:
832 - an integer type specifying an exact failed status code to
833 accept
834 - a list or a tuple of integer types specifying a list of
835 failed status codes to accept
836 - a callable accepting an actual failed status code and
837 returning True if it should be accepted
838 Note that this argument does not affect success status codes (2xx)
839 which are always accepted.
840 """
841
995ad69c
TF
842 success = False
843 try_count = 0
844 while success is False:
845 try:
d391b7e2
S
846 res = self._download_webpage_handle(
847 url_or_request, video_id, note, errnote, fatal,
848 encoding=encoding, data=data, headers=headers, query=query,
849 expected_status=expected_status)
995ad69c
TF
850 success = True
851 except compat_http_client.IncompleteRead as e:
852 try_count += 1
853 if try_count >= tries:
854 raise e
855 self._sleep(timeout, video_id)
7cc3570e
PH
856 if res is False:
857 return res
858 else:
859 content, _ = res
860 return content
d6983cb4 861
e0d198c1
S
862 def _download_xml_handle(
863 self, url_or_request, video_id, note='Downloading XML',
864 errnote='Unable to download XML', transform_source=None,
d391b7e2
S
865 fatal=True, encoding=None, data=None, headers={}, query={},
866 expected_status=None):
867 """
ee0ba927 868 Return a tuple (xml as an compat_etree_Element, URL handle).
d391b7e2
S
869
870 See _download_webpage docstring for arguments specification.
871 """
e0d198c1
S
872 res = self._download_webpage_handle(
873 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
874 encoding=encoding, data=data, headers=headers, query=query,
875 expected_status=expected_status)
e0d198c1
S
876 if res is False:
877 return res
878 xml_string, urlh = res
879 return self._parse_xml(
880 xml_string, video_id, transform_source=transform_source,
881 fatal=fatal), urlh
882
d391b7e2
S
883 def _download_xml(
884 self, url_or_request, video_id,
885 note='Downloading XML', errnote='Unable to download XML',
886 transform_source=None, fatal=True, encoding=None,
887 data=None, headers={}, query={}, expected_status=None):
888 """
ee0ba927 889 Return the xml as an compat_etree_Element.
d391b7e2
S
890
891 See _download_webpage docstring for arguments specification.
892 """
e0d198c1
S
893 res = self._download_xml_handle(
894 url_or_request, video_id, note=note, errnote=errnote,
895 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
896 data=data, headers=headers, query=query,
897 expected_status=expected_status)
e0d198c1 898 return res if res is False else res[0]
e01c3d2e
S
899
900 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
e2b38da9
PH
901 if transform_source:
902 xml_string = transform_source(xml_string)
e01c3d2e
S
903 try:
904 return compat_etree_fromstring(xml_string.encode('utf-8'))
905 except compat_xml_parse_error as ve:
906 errmsg = '%s: Failed to parse XML ' % video_id
907 if fatal:
908 raise ExtractorError(errmsg, cause=ve)
909 else:
910 self.report_warning(errmsg + str(ve))
267ed0c5 911
0fe7783e
S
912 def _download_json_handle(
913 self, url_or_request, video_id, note='Downloading JSON metadata',
914 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
915 fatal=True, encoding=None, data=None, headers={}, query={},
916 expected_status=None):
917 """
918 Return a tuple (JSON object, URL handle).
919
920 See _download_webpage docstring for arguments specification.
921 """
0fe7783e 922 res = self._download_webpage_handle(
c9a77969 923 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
924 encoding=encoding, data=data, headers=headers, query=query,
925 expected_status=expected_status)
0fe7783e
S
926 if res is False:
927 return res
928 json_string, urlh = res
ebb64199 929 return self._parse_json(
0fe7783e
S
930 json_string, video_id, transform_source=transform_source,
931 fatal=fatal), urlh
932
933 def _download_json(
934 self, url_or_request, video_id, note='Downloading JSON metadata',
935 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
936 fatal=True, encoding=None, data=None, headers={}, query={},
937 expected_status=None):
938 """
939 Return the JSON object as a dict.
940
941 See _download_webpage docstring for arguments specification.
942 """
0fe7783e
S
943 res = self._download_json_handle(
944 url_or_request, video_id, note=note, errnote=errnote,
945 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
946 data=data, headers=headers, query=query,
947 expected_status=expected_status)
0fe7783e 948 return res if res is False else res[0]
ebb64199
TF
949
950 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
951 if transform_source:
952 json_string = transform_source(json_string)
3d3538e4
PH
953 try:
954 return json.loads(json_string)
955 except ValueError as ve:
e7b6d122
PH
956 errmsg = '%s: Failed to parse JSON ' % video_id
957 if fatal:
958 raise ExtractorError(errmsg, cause=ve)
959 else:
960 self.report_warning(errmsg + str(ve))
3d3538e4 961
adddc50c 962 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
963 return self._parse_json(
964 data[data.find('{'):data.rfind('}') + 1],
965 video_id, transform_source, fatal)
966
967 def _download_socket_json_handle(
968 self, url_or_request, video_id, note='Polling socket',
969 errnote='Unable to poll socket', transform_source=None,
970 fatal=True, encoding=None, data=None, headers={}, query={},
971 expected_status=None):
972 """
973 Return a tuple (JSON object, URL handle).
974
975 See _download_webpage docstring for arguments specification.
976 """
977 res = self._download_webpage_handle(
978 url_or_request, video_id, note, errnote, fatal=fatal,
979 encoding=encoding, data=data, headers=headers, query=query,
980 expected_status=expected_status)
981 if res is False:
982 return res
983 webpage, urlh = res
984 return self._parse_socket_response_as_json(
985 webpage, video_id, transform_source=transform_source,
986 fatal=fatal), urlh
987
988 def _download_socket_json(
989 self, url_or_request, video_id, note='Polling socket',
990 errnote='Unable to poll socket', transform_source=None,
991 fatal=True, encoding=None, data=None, headers={}, query={},
992 expected_status=None):
993 """
994 Return the JSON object as a dict.
995
996 See _download_webpage docstring for arguments specification.
997 """
998 res = self._download_socket_json_handle(
999 url_or_request, video_id, note=note, errnote=errnote,
1000 transform_source=transform_source, fatal=fatal, encoding=encoding,
1001 data=data, headers=headers, query=query,
1002 expected_status=expected_status)
1003 return res if res is False else res[0]
1004
28f436ba 1005 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
b868936c 1006 idstr = format_field(video_id, template='%s: ')
28f436ba 1007 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1008 if only_once:
1009 if f'WARNING: {msg}' in self._printed_messages:
1010 return
1011 self._printed_messages.add(f'WARNING: {msg}')
1012 self._downloader.report_warning(msg, *args, **kwargs)
f45f96f8 1013
a06916d9 1014 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1015 """Print msg to screen, prefixing it with '[ie_name]'"""
a06916d9 1016 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1017
1018 def write_debug(self, msg, *args, **kwargs):
1019 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1020
1021 def get_param(self, name, default=None, *args, **kwargs):
1022 if self._downloader:
1023 return self._downloader.params.get(name, default, *args, **kwargs)
1024 return default
d6983cb4
PH
1025
1026 def report_extraction(self, id_or_name):
1027 """Report information extraction."""
f1a9d64e 1028 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1029
1030 def report_download_webpage(self, video_id):
1031 """Report webpage download."""
f1a9d64e 1032 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1033
1034 def report_age_confirmation(self):
1035 """Report attempt to confirm age."""
f1a9d64e 1036 self.to_screen('Confirming age')
d6983cb4 1037
fc79158d
JMF
1038 def report_login(self):
1039 """Report attempt to log in."""
f1a9d64e 1040 self.to_screen('Logging in')
fc79158d 1041
b7da73eb 1042 def raise_login_required(
9d5d4d64 1043 self, msg='This video is only available for registered users',
1044 metadata_available=False, method='any'):
a06916d9 1045 if metadata_available and self.get_param('ignore_no_formats_error'):
b7da73eb 1046 self.report_warning(msg)
46890374 1047 if method is not None:
1048 msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1049 raise ExtractorError(msg, expected=True)
43e7d3c9 1050
b7da73eb 1051 def raise_geo_restricted(
1052 self, msg='This video is not available from your location due to geo restriction',
1053 countries=None, metadata_available=False):
a06916d9 1054 if metadata_available and self.get_param('ignore_no_formats_error'):
b7da73eb 1055 self.report_warning(msg)
1056 else:
1057 raise GeoRestrictedError(msg, countries=countries)
1058
1059 def raise_no_formats(self, msg, expected=False, video_id=None):
a06916d9 1060 if expected and self.get_param('ignore_no_formats_error'):
b7da73eb 1061 self.report_warning(msg, video_id)
68f5867c
L
1062 elif isinstance(msg, ExtractorError):
1063 raise msg
b7da73eb 1064 else:
1065 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1066
5f6a1245 1067 # Methods for following #608
c0d0b01f 1068 @staticmethod
830d53bf 1069 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 1070 """Returns a URL that points to a page that should be processed"""
5f6a1245 1071 # TODO: ie should be the class used for getting the info
d6983cb4
PH
1072 video_info = {'_type': 'url',
1073 'url': url,
1074 'ie_key': ie}
7012b23c
PH
1075 if video_id is not None:
1076 video_info['id'] = video_id
830d53bf
S
1077 if video_title is not None:
1078 video_info['title'] = video_title
d6983cb4 1079 return video_info
5f6a1245 1080
749ca5ec
S
1081 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1082 urls = orderedSet(
46b18f23
JH
1083 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1084 for m in matches)
1085 return self.playlist_result(
749ca5ec 1086 urls, playlist_id=playlist_id, playlist_title=playlist_title)
46b18f23 1087
c0d0b01f 1088 @staticmethod
b60419c5 1089 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
d6983cb4
PH
1090 """Returns a playlist"""
1091 video_info = {'_type': 'playlist',
1092 'entries': entries}
b60419c5 1093 video_info.update(kwargs)
d6983cb4
PH
1094 if playlist_id:
1095 video_info['id'] = playlist_id
1096 if playlist_title:
1097 video_info['title'] = playlist_title
ecc97af3 1098 if playlist_description is not None:
acf5cbfe 1099 video_info['description'] = playlist_description
d6983cb4
PH
1100 return video_info
1101
c342041f 1102 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1103 """
1104 Perform a regex search on the given string, using a single or a list of
1105 patterns returning the first matching group.
1106 In case of failure return a default value or raise a WARNING or a
55b3e45b 1107 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4
PH
1108 """
1109 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1110 mobj = re.search(pattern, string, flags)
1111 else:
1112 for p in pattern:
1113 mobj = re.search(p, string, flags)
c3415d1b
PH
1114 if mobj:
1115 break
d6983cb4 1116
a06916d9 1117 if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
f1a9d64e 1118 _name = '\033[0;34m%s\033[0m' % name
d6983cb4
PH
1119 else:
1120 _name = name
1121
1122 if mobj:
711ede6e
PH
1123 if group is None:
1124 # return the first matching group
1125 return next(g for g in mobj.groups() if g is not None)
198f7ea8 1126 elif isinstance(group, (list, tuple)):
1127 return tuple(mobj.group(g) for g in group)
711ede6e
PH
1128 else:
1129 return mobj.group(group)
c342041f 1130 elif default is not NO_DEFAULT:
d6983cb4
PH
1131 return default
1132 elif fatal:
f1a9d64e 1133 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 1134 else:
6a39ee13 1135 self.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
1136 return None
1137
c342041f 1138 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1139 """
1140 Like _search_regex, but strips HTML tags and unescapes entities.
1141 """
711ede6e 1142 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1143 if res:
1144 return clean_html(res).strip()
1145 else:
1146 return res
1147
2118fdd1
RA
1148 def _get_netrc_login_info(self, netrc_machine=None):
1149 username = None
1150 password = None
1151 netrc_machine = netrc_machine or self._NETRC_MACHINE
1152
a06916d9 1153 if self.get_param('usenetrc', False):
2118fdd1
RA
1154 try:
1155 info = netrc.netrc().authenticators(netrc_machine)
1156 if info is not None:
1157 username = info[0]
1158 password = info[2]
1159 else:
dcce092e
S
1160 raise netrc.NetrcParseError(
1161 'No authenticators for %s' % netrc_machine)
2118fdd1 1162 except (IOError, netrc.NetrcParseError) as err:
6a39ee13 1163 self.report_warning(
dcce092e 1164 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1165
dcce092e 1166 return username, password
2118fdd1 1167
1b6712ab 1168 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1169 """
cf0649f8 1170 Get the login info as (username, password)
32443dd3
S
1171 First look for the manually specified credentials using username_option
1172 and password_option as keys in params dictionary. If no such credentials
1173 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1174 value.
fc79158d
JMF
1175 If there's no info available, return (None, None)
1176 """
fc79158d
JMF
1177
1178 # Attempt to use provided username and password or .netrc data
a06916d9 1179 username = self.get_param(username_option)
1180 if username is not None:
1181 password = self.get_param(password_option)
2118fdd1 1182 else:
1b6712ab 1183 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1184
2133565c 1185 return username, password
fc79158d 1186
e64b7569 1187 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1188 """
1189 Get the two-factor authentication info
1190 TODO - asking the user will be required for sms/phone verify
1191 currently just uses the command line option
1192 If there's no info available, return None
1193 """
83317f69 1194
a06916d9 1195 tfa = self.get_param('twofactor')
1196 if tfa is not None:
1197 return tfa
83317f69 1198
e64b7569 1199 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1200
46720279
JMF
1201 # Helper functions for extracting OpenGraph info
1202 @staticmethod
ab2d5247 1203 def _og_regexes(prop):
448ef1f3 1204 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
22f5f5c6 1205 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
7a6d76a6 1206 % {'prop': re.escape(prop)})
78fb87b2 1207 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1208 return [
78fb87b2
JMF
1209 template % (property_re, content_re),
1210 template % (content_re, property_re),
ab2d5247 1211 ]
46720279 1212
864f24bd
S
1213 @staticmethod
1214 def _meta_regex(prop):
1215 return r'''(?isx)<meta
8b9848ac 1216 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1217 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1218
3c4e6d83 1219 def _og_search_property(self, prop, html, name=None, **kargs):
6606817a 1220 prop = variadic(prop)
46720279 1221 if name is None:
b070564e
S
1222 name = 'OpenGraph %s' % prop[0]
1223 og_regexes = []
1224 for p in prop:
1225 og_regexes.extend(self._og_regexes(p))
1226 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1227 if escaped is None:
1228 return None
1229 return unescapeHTML(escaped)
46720279
JMF
1230
1231 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1232 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1233
1234 def _og_search_description(self, html, **kargs):
1235 return self._og_search_property('description', html, fatal=False, **kargs)
1236
1237 def _og_search_title(self, html, **kargs):
1238 return self._og_search_property('title', html, **kargs)
1239
8ffa13e0 1240 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1241 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1242 if secure:
1243 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1244 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1245
78338f71
JMF
1246 def _og_search_url(self, html, **kargs):
1247 return self._og_search_property('url', html, **kargs)
1248
40c696e5 1249 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
6606817a 1250 name = variadic(name)
59040888 1251 if display_name is None:
88d9f6c0 1252 display_name = name[0]
59040888 1253 return self._html_search_regex(
88d9f6c0 1254 [self._meta_regex(n) for n in name],
711ede6e 1255 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1256
1257 def _dc_search_uploader(self, html):
1258 return self._html_search_meta('dc.creator', html, 'uploader')
1259
8dbe9899
PH
1260 def _rta_search(self, html):
1261 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1262 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1263 r' content="RTA-5042-1996-1400-1577-RTA"',
1264 html):
1265 return 18
1266 return 0
1267
59040888
PH
1268 def _media_rating_search(self, html):
1269 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1270 rating = self._html_search_meta('rating', html)
1271
1272 if not rating:
1273 return None
1274
1275 RATING_TABLE = {
1276 'safe for kids': 0,
1277 'general': 8,
1278 '14 years': 14,
1279 'mature': 17,
1280 'restricted': 19,
1281 }
d800609c 1282 return RATING_TABLE.get(rating.lower())
59040888 1283
69319969 1284 def _family_friendly_search(self, html):
6ca7732d 1285 # See http://schema.org/VideoObject
ac8491fc
S
1286 family_friendly = self._html_search_meta(
1287 'isFamilyFriendly', html, default=None)
69319969
NJ
1288
1289 if not family_friendly:
1290 return None
1291
1292 RATING_TABLE = {
1293 '1': 0,
1294 'true': 0,
1295 '0': 18,
1296 'false': 18,
1297 }
d800609c 1298 return RATING_TABLE.get(family_friendly.lower())
69319969 1299
0c708f11
JMF
1300 def _twitter_search_player(self, html):
1301 return self._html_search_meta('twitter:player', html,
9e1a5b84 1302 'twitter card player')
0c708f11 1303
95b31e26 1304 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
4433bb02 1305 json_ld_list = list(re.finditer(JSON_LD_RE, html))
321b5e08 1306 default = kwargs.get('default', NO_DEFAULT)
321b5e08
S
1307 # JSON-LD may be malformed and thus `fatal` should be respected.
1308 # At the same time `default` may be passed that assumes `fatal=False`
1309 # for _search_regex. Let's simulate the same behavior here as well.
dbf5416a 1310 fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
4433bb02
S
1311 json_ld = []
1312 for mobj in json_ld_list:
1313 json_ld_item = self._parse_json(
1314 mobj.group('json_ld'), video_id, fatal=fatal)
1315 if not json_ld_item:
1316 continue
1317 if isinstance(json_ld_item, dict):
1318 json_ld.append(json_ld_item)
1319 elif isinstance(json_ld_item, (list, tuple)):
1320 json_ld.extend(json_ld_item)
1321 if json_ld:
1322 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1323 if json_ld:
1324 return json_ld
1325 if default is not NO_DEFAULT:
1326 return default
1327 elif fatal:
1328 raise RegexNotFoundError('Unable to extract JSON-LD')
1329 else:
6a39ee13 1330 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
4433bb02 1331 return {}
4ca2a3cf 1332
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """
        Merge one or more JSON-LD objects into an info dict.

        json_ld       -- a JSON-LD string, dict, or list/tuple of dicts
        expected_type -- when given, only entries of that @type are used and
                         processing stops after the first match
        Returns a dict with all None values stripped.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interaction @type suffixes to info-dict count kinds
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a {'@type': ...} dict
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill *_count fields in `info` from InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                # Never overwrite a count that was already extracted
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Populate `info` from a schema.org VideoObject entry
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only top-level JSON-LD entries declare @context
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected_type, keep scanning further entries
                    if expected_type is None:
                        continue
                    else:
                        break
                # Some entries nest the VideoObject under a 'video' key
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
1462
27713812 1463 @staticmethod
f8da79f8 1464 def _hidden_inputs(html):
586f1cc5 1465 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1466 hidden_inputs = {}
c8498368
S
1467 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1468 attrs = extract_attributes(input)
1469 if not input:
201ea3ee 1470 continue
c8498368 1471 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1472 continue
c8498368
S
1473 name = attrs.get('name') or attrs.get('id')
1474 value = attrs.get('value')
1475 if name and value is not None:
1476 hidden_inputs[name] = value
201ea3ee 1477 return hidden_inputs
27713812 1478
cf61d96d
S
1479 def _form_hidden_inputs(self, form_id, html):
1480 form = self._search_regex(
73eb13df 1481 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1482 html, '%s form' % form_id, group='form')
1483 return self._hidden_inputs(form)
1484
    class FormatSort:
        """Computes a sort key for format dicts according to user- and
        extractor-supplied 'format_sort' criteria.

        Each criterion is a string matched by `regex`: an optional '+'
        (reverse), a field name, and an optional ':limit' or '~limit'
        ('~' selects closest-to-limit ordering) suffix. Fields and their
        behavior are declared in `settings`; `evaluate_params` parses the
        criteria and `calculate_preference` turns a format dict into a
        sortable tuple.
        """
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        # Default sort order. These names must be real fields, not aliases,
        # because evaluate_params queries their 'forced'/'priority' settings.
        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases

        # Sort order emulating youtube-dl's behavior (kept for compatibility)
        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'format_id')

        # Per-field configuration. 'type' is one of: 'ordered' (ranked by
        # position in 'order'), 'boolean', 'extractor', 'combined' (first
        # non-None of several fields), 'multiple' (custom 'function' over
        # several fields), 'alias', or the implicit default 'field'.
        # 'convert' controls value normalization in _resolve_field_value.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        # Fields in active sort order, filled by evaluate_params.
        # NOTE(review): declared at class level, so it is shared between
        # instances unless evaluate_params is the sole writer — confirm.
        _order = []

        def _get_field_setting(self, field, key):
            """Return settings[field][key], lazily filling in the documented
            default for `key` (which depends on the field's 'type')."""
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                # `type` intentionally mirrors the settings key name; it
                # shadows the builtin within this method only.
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Normalize a raw field value according to the field's 'convert'
            setting ('ignore'/'string'/'float_none'/'bytes'/'order'/...).

            For 'order', returns a rank: higher means earlier in the
            field's order list. With convertNone=False a None value is
            returned as None unconverted.
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                # 'order_free' takes precedence when prefer_free_formats is set
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # 'float_string': numeric strings become floats; anything else
                # permanently switches the field's conversion to 'string'
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Parse user (params['format_sort']) and extractor sort strings
            into self._order and per-field reverse/closest/limit settings.

            Raises ExtractorError on a string not matching `regex`.
            """
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Register one concrete (non-combined) field; first
                # occurrence wins — later duplicates are ignored.
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # forced defaults always apply; priority defaults apply unless
            # 'format_sort_force' is set; then user, extractor, and the
            # remaining defaults, in that precedence order
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    field = self._get_field_setting(field, 'field')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # A 'combined' field fans out into its sub-fields; limits are
                # split per sub-field ('a:b:c') unless 'same_limit' is set
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Emit the resolved sort order (user, extractor, final) through
            the supplied write_debug callable."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Turn one field's value into a comparable tuple, honoring the
            field's reverse/closest/limit settings."""
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Fetch the field's value(s) from `format` and delegate to
            _calculate_field_preference_from_value."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the sort key tuple for a format dict.

            Side effect: fills in missing 'protocol', 'ext',
            'video_ext'/'audio_ext' and missing bitrates ('tbr'/'vbr'/'abr')
            on the format dict before computing the key.
            """
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != "none" and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != "none" and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
1752 def _sort_formats(self, formats, field_preference=[]):
1753 if not formats:
a06916d9 1754 if self.get_param('ignore_no_formats_error'):
b7da73eb 1755 return
eb8a4433 1756 raise ExtractorError('No video formats found')
1757 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1758 format_sort.evaluate_params(self._downloader.params, field_preference)
a06916d9 1759 if self.get_param('verbose', False):
0760b0a7 1760 format_sort.print_verbose_info(self._downloader.write_debug)
eb8a4433 1761 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1762
96a53167
S
1763 def _check_formats(self, formats, video_id):
1764 if formats:
1765 formats[:] = filter(
1766 lambda f: self._is_valid_url(
1767 f['url'], video_id,
1768 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1769 formats)
1770
f5bdb444
S
1771 @staticmethod
1772 def _remove_duplicate_formats(formats):
1773 format_urls = set()
1774 unique_formats = []
1775 for f in formats:
1776 if f['url'] not in format_urls:
1777 format_urls.add(f['url'])
1778 unique_formats.append(f)
1779 formats[:] = unique_formats
1780
45024183 1781 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1782 url = self._proto_relative_url(url, scheme='http:')
1783 # For now assume non HTTP(S) URLs always valid
1784 if not (url.startswith('http://') or url.startswith('https://')):
1785 return True
96a53167 1786 try:
45024183 1787 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1788 return True
8bdd16b4 1789 except ExtractorError as e:
25e911a9 1790 self.to_screen(
8bdd16b4 1791 '%s: %s URL is invalid, skipping: %s'
1792 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1793 return False
96a53167 1794
20991253 1795 def http_scheme(self):
1ede5b24 1796 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1797 return (
1798 'http:'
a06916d9 1799 if self.get_param('prefer_insecure', False)
20991253
PH
1800 else 'https:')
1801
57c7411f
PH
1802 def _proto_relative_url(self, url, scheme=None):
1803 if url is None:
1804 return url
1805 if url.startswith('//'):
1806 if scheme is None:
1807 scheme = self.http_scheme()
1808 return scheme + url
1809 else:
1810 return url
1811
4094b6e3
PH
1812 def _sleep(self, timeout, video_id, msg_template=None):
1813 if msg_template is None:
f1a9d64e 1814 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1815 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1816 self.to_screen(msg)
1817 time.sleep(timeout)
1818
f983b875 1819 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1820 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1821 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
f036a632
JMF
1822 manifest = self._download_xml(
1823 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1824 'Unable to download f4m manifest',
1825 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1826 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1827 transform_source=transform_source,
7360c06f 1828 fatal=fatal, data=data, headers=headers, query=query)
4de61310
S
1829
1830 if manifest is False:
8d29e47f 1831 return []
31bb8d3f 1832
0fdbb332 1833 return self._parse_f4m_formats(
f983b875 1834 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1835 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1836
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (Flash Media Manifest) XML element into format dicts.

        `manifest` is the parsed XML element; `manifest_url` is used to
        resolve relative media URLs. Nested f4m/m3u8 manifests referenced by
        set-level manifests are extracted recursively. Returns a list of
        format dicts (possibly empty, e.g. for DRM-protected content).
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        # f4m 1.0 and 2.0 use different XML namespaces for <media> nodes
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* mimeType marks the whole manifest as audio-only
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # format_id falls back to the node index when bitrate is missing
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # Resolve relative URLs against the manifest's base URL, or
                # against the directory of manifest_url as a last resort
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
1938
f983b875 1939 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1940 return {
f207019c 1941 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1942 'url': m3u8_url,
1943 'ext': ext,
1944 'protocol': 'm3u8',
37768f92 1945 'preference': preference - 100 if preference else -100,
f983b875 1946 'quality': quality,
704df56d
PH
1947 'resolution': 'multiple',
1948 'format_note': 'Quality selection URL',
16da9bbc
YCH
1949 }
1950
a0c3b2d5
F
1951 def _extract_m3u8_formats(self, *args, **kwargs):
1952 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1953 if subs:
1954 self.report_warning(bug_reports_message(
1955 "Ignoring subtitle tracks found in the HLS manifest; "
1956 "if any subtitle tracks are missing,"
28f436ba 1957 ), only_once=True)
a0c3b2d5
F
1958 return fmts
1959
1960 def _extract_m3u8_formats_and_subtitles(
177877c5 1961 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1962 preference=None, quality=None, m3u8_id=None, note=None,
1963 errnote=None, fatal=True, live=False, data=None, headers={},
1964 query={}):
1965
dbd82a1d 1966 res = self._download_webpage_handle(
81515ad9 1967 m3u8_url, video_id,
37a3bb66 1968 note='Downloading m3u8 information' if note is None else note,
1969 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1970 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1971
dbd82a1d 1972 if res is False:
a0c3b2d5 1973 return [], {}
cb252080 1974
dbd82a1d 1975 m3u8_doc, urlh = res
37113045 1976 m3u8_url = urlh.geturl()
9cdffeeb 1977
a0c3b2d5 1978 return self._parse_m3u8_formats_and_subtitles(
cb252080 1979 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1980 preference=preference, quality=quality, m3u8_id=m3u8_id,
1981 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1982 headers=headers, query=query, video_id=video_id)
cb252080 1983
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse an HLS playlist document into (formats, subtitles).

        Media playlists (those with #EXT-X-TARGETDURATION) produce a single
        format entry per discontinuity index; master playlists are parsed
        tag by tag (EXT-X-MEDIA renditions, then EXT-X-STREAM-INF variants).
        Returns ([], {}) for DRM-protected playlists (Adobe Flash Access, or
        Apple FairPlay unless 'allow_unplayable_formats' is set).
        """
        formats, subtitles = [], {}

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return formats, subtitles

        if (not self.get_param('allow_unplayable_formats')
                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
            return formats, subtitles

        def format_url(url):
            # Resolve relative URLs against the playlist URL
            return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                # One index per #EXT-X-DISCONTINUITY-separated section of the
                # referenced media playlist (downloaded here if not supplied)
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                # Splitting disabled: a single unindexed entry per playlist
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
                'format_index': idx,
                'url': m3u8_url,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        # group_id -> list of EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; applies to the
        # next URI line
        last_stream_inf = {}

        def extract_media(x_media_line):
            """Process one #EXT-X-MEDIA: tag, appending to groups and to
            formats/subtitles as appropriate."""
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            """Best-effort human-readable name for the current variant."""
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment, non-blank line is the URI for the preceding
                # EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
                    f = {
                        'format_id': '-'.join(map(str, filter(None, format_id))),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
704df56d 2205
a107193e
S
2206 @staticmethod
2207 def _xpath_ns(path, namespace=None):
2208 if not namespace:
2209 return path
2210 out = []
2211 for c in path.split('/'):
2212 if not c or c == '.':
2213 out.append(c)
2214 else:
2215 out.append('{%s}%s' % (namespace, c))
2216 return '/'.join(out)
2217
da1c94ee 2218 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
09f572fb 2219 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2220
995029a1
PH
2221 if smil is False:
2222 assert not fatal
2223 return []
e89a2aab 2224
17712eeb 2225 namespace = self._parse_smil_namespace(smil)
a107193e 2226
da1c94ee 2227 fmts = self._parse_smil_formats(
a107193e 2228 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
da1c94ee
F
2229 subs = self._parse_smil_subtitles(
2230 smil, namespace=namespace)
2231
2232 return fmts, subs
2233
2234 def _extract_smil_formats(self, *args, **kwargs):
2235 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2236 if subs:
2237 self.report_warning(bug_reports_message(
2238 "Ignoring subtitle tracks found in the SMIL manifest; "
2239 "if any subtitle tracks are missing,"
28f436ba 2240 ), only_once=True)
da1c94ee 2241 return fmts
a107193e
S
2242
2243 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2244 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2245 if smil is False:
2246 return {}
2247 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2248
09f572fb 2249 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2250 return self._download_xml(
2251 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2252 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2253
2254 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2255 namespace = self._parse_smil_namespace(smil)
a107193e
S
2256
2257 formats = self._parse_smil_formats(
2258 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2259 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2260
2261 video_id = os.path.splitext(url_basename(smil_url))[0]
2262 title = None
2263 description = None
647eab45 2264 upload_date = None
a107193e
S
2265 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2266 name = meta.attrib.get('name')
2267 content = meta.attrib.get('content')
2268 if not name or not content:
2269 continue
2270 if not title and name == 'title':
2271 title = content
2272 elif not description and name in ('description', 'abstract'):
2273 description = content
647eab45
S
2274 elif not upload_date and name == 'date':
2275 upload_date = unified_strdate(content)
a107193e 2276
1e5bcdec
S
2277 thumbnails = [{
2278 'id': image.get('type'),
2279 'url': image.get('src'),
2280 'width': int_or_none(image.get('width')),
2281 'height': int_or_none(image.get('height')),
2282 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2283
a107193e
S
2284 return {
2285 'id': video_id,
2286 'title': title or video_id,
2287 'description': description,
647eab45 2288 'upload_date': upload_date,
1e5bcdec 2289 'thumbnails': thumbnails,
a107193e
S
2290 'formats': formats,
2291 'subtitles': subtitles,
2292 }
2293
17712eeb
S
    def _parse_smil_namespace(self, smil):
        # Extract the XML namespace URI from the root tag of a parsed SMIL
        # document (ElementTree renders namespaced tags as '{uri}smil').
        # Returns None when the document is not namespaced.
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2297
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from the <video>/<audio> elements of a parsed SMIL
        document.

        f4m_params: query parameters appended to HDS (f4m) manifest URLs;
            defaulted on first use and reused for subsequent f4m sources.
        transform_rtmp_url: optional callable (streamer, play_path) ->
            (streamer, play_path) applied to RTMP formats after creation.
        """
        # Resolve the document-level base URL: first <meta base=...> (or
        # httpBase) under <head> wins; falls back to the manifest URL itself.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []  # de-duplicates repeated src attributes
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # Attribute names vary between SMIL dialects (hyphenated vs camelCase).
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    # %d truncates a float bitrate; falls back to a counter
                    # when the bitrate is unknown.
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-variant playlist inherits this medium's metadata.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # NB: assigning the parameter makes this default persist
                    # for every following f4m source in the same manifest.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                # NOTE(review): validity is probed with the raw `src`, not the
                # resolved `src_url` -- confirm this is intended when `src`
                # is a relative path.
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
2392
ce00af87 2393 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2394 urls = []
a107193e
S
2395 subtitles = {}
2396 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2397 src = textstream.get('src')
d413095f 2398 if not src or src in urls:
a107193e 2399 continue
d413095f 2400 urls.append(src)
df634be2 2401 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2402 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2403 subtitles.setdefault(lang, []).append({
2404 'url': src,
2405 'ext': ext,
2406 })
2407 return subtitles
63757032 2408
47a5cb77 2409 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2410 xspf = self._download_xml(
47a5cb77 2411 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2412 'Unable to download xspf manifest', fatal=fatal)
2413 if xspf is False:
2414 return []
47a5cb77
S
2415 return self._parse_xspf(
2416 xspf, playlist_id, xspf_url=xspf_url,
2417 xspf_base_url=base_url(xspf_url))
8d6765cf 2418
47a5cb77 2419 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2420 NS_MAP = {
2421 'xspf': 'http://xspf.org/ns/0/',
2422 's1': 'http://static.streamone.nl/player/ns/0',
2423 }
2424
2425 entries = []
47a5cb77 2426 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2427 title = xpath_text(
98044462 2428 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2429 description = xpath_text(
2430 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2431 thumbnail = xpath_text(
2432 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2433 duration = float_or_none(
2434 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2435
47a5cb77
S
2436 formats = []
2437 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2438 format_url = urljoin(xspf_base_url, location.text)
2439 if not format_url:
2440 continue
2441 formats.append({
2442 'url': format_url,
2443 'manifest_url': xspf_url,
2444 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2445 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2446 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2447 })
8d6765cf
S
2448 self._sort_formats(formats)
2449
2450 entries.append({
2451 'id': playlist_id,
2452 'title': title,
2453 'description': description,
2454 'thumbnail': thumbnail,
2455 'duration': duration,
2456 'formats': formats,
2457 })
2458 return entries
2459
171e59ed
F
2460 def _extract_mpd_formats(self, *args, **kwargs):
2461 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2462 if subs:
2463 self.report_warning(bug_reports_message(
2464 "Ignoring subtitle tracks found in the DASH manifest; "
2465 "if any subtitle tracks are missing,"
28f436ba 2466 ), only_once=True)
171e59ed
F
2467 return fmts
2468
2469 def _extract_mpd_formats_and_subtitles(
2470 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2471 fatal=True, data=None, headers={}, query={}):
47a5cb77 2472 res = self._download_xml_handle(
1bac3455 2473 mpd_url, video_id,
37a3bb66 2474 note='Downloading MPD manifest' if note is None else note,
2475 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2476 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2477 if res is False:
171e59ed 2478 return [], {}
47a5cb77 2479 mpd_doc, urlh = res
c25720ef 2480 if mpd_doc is None:
171e59ed 2481 return [], {}
02dc0a36 2482 mpd_base_url = base_url(urlh.geturl())
1bac3455 2483
171e59ed 2484 return self._parse_mpd_formats_and_subtitles(
545cc85d 2485 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2486
171e59ed
F
2487 def _parse_mpd_formats(self, *args, **kwargs):
2488 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2489 if subs:
2490 self.report_warning(bug_reports_message(
2491 "Ignoring subtitle tracks found in the DASH manifest; "
2492 "if any subtitle tracks are missing,"
28f436ba 2493 ), only_once=True)
171e59ed
F
2494 return fmts
2495
2496 def _parse_mpd_formats_and_subtitles(
2497 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2498 """
2499 Parse formats from MPD manifest.
2500 References:
2501 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2502 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2503 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2504 """
a06916d9 2505 if not self.get_param('dynamic_mpd', True):
78895bd3 2506 if mpd_doc.get('type') == 'dynamic':
171e59ed 2507 return [], {}
2d2fa82d 2508
91cb6b50 2509 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2510
2511 def _add_ns(path):
2512 return self._xpath_ns(path, namespace)
2513
675d0016 2514 def is_drm_protected(element):
2515 return element.find(_add_ns('ContentProtection')) is not None
2516
1bac3455 2517 def extract_multisegment_info(element, ms_parent_info):
2518 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2519
2520 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2521 # common attributes and elements. We will only extract relevant
2522 # for us.
2523 def extract_common(source):
2524 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2525 if segment_timeline is not None:
2526 s_e = segment_timeline.findall(_add_ns('S'))
2527 if s_e:
2528 ms_info['total_number'] = 0
2529 ms_info['s'] = []
2530 for s in s_e:
2531 r = int(s.get('r', 0))
2532 ms_info['total_number'] += 1 + r
2533 ms_info['s'].append({
2534 't': int(s.get('t', 0)),
2535 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2536 'd': int(s.attrib['d']),
2537 'r': r,
2538 })
2539 start_number = source.get('startNumber')
2540 if start_number:
2541 ms_info['start_number'] = int(start_number)
2542 timescale = source.get('timescale')
2543 if timescale:
2544 ms_info['timescale'] = int(timescale)
2545 segment_duration = source.get('duration')
2546 if segment_duration:
48504785 2547 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2548
2549 def extract_Initialization(source):
2550 initialization = source.find(_add_ns('Initialization'))
2551 if initialization is not None:
2552 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2553
f14be228 2554 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2555 if segment_list is not None:
b4c1d6e8
S
2556 extract_common(segment_list)
2557 extract_Initialization(segment_list)
f14be228 2558 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2559 if segment_urls_e:
2560 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2561 else:
f14be228 2562 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2563 if segment_template is not None:
b4c1d6e8 2564 extract_common(segment_template)
e228616c
S
2565 media = segment_template.get('media')
2566 if media:
2567 ms_info['media'] = media
1bac3455 2568 initialization = segment_template.get('initialization')
2569 if initialization:
e228616c 2570 ms_info['initialization'] = initialization
1bac3455 2571 else:
b4c1d6e8 2572 extract_Initialization(segment_template)
1bac3455 2573 return ms_info
b323e170 2574
a06916d9 2575 skip_unplayable = not self.get_param('allow_unplayable_formats')
63ad4d43 2576
1bac3455 2577 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 2578 formats = []
171e59ed 2579 subtitles = {}
f14be228 2580 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2581 period_duration = parse_duration(period.get('duration')) or mpd_duration
2582 period_ms_info = extract_multisegment_info(period, {
2583 'start_number': 1,
2584 'timescale': 1,
2585 })
f14be228 2586 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
06869367 2587 if skip_unplayable and is_drm_protected(adaptation_set):
675d0016 2588 continue
1bac3455 2589 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2590 for representation in adaptation_set.findall(_add_ns('Representation')):
06869367 2591 if skip_unplayable and is_drm_protected(representation):
675d0016 2592 continue
1bac3455 2593 representation_attrib = adaptation_set.attrib.copy()
2594 representation_attrib.update(representation.attrib)
f0948348 2595 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2596 mime_type = representation_attrib['mimeType']
171e59ed
F
2597 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2598
cdb19aa4 2599 if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg':
1bac3455 2600 base_url = ''
2601 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 2602 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 2603 if base_url_e is not None:
2604 base_url = base_url_e.text + base_url
2605 if re.match(r'^https?://', base_url):
2606 break
bb20526b
S
2607 if mpd_base_url and not re.match(r'^https?://', base_url):
2608 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2609 mpd_base_url += '/'
1bac3455 2610 base_url = mpd_base_url + base_url
2611 representation_id = representation_attrib.get('id')
d577c796 2612 lang = representation_attrib.get('lang')
51e9094f 2613 url_el = representation.find(_add_ns('BaseURL'))
2614 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2615 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
cdb19aa4 2616 if representation_id is not None:
2617 format_id = representation_id
2618 else:
2619 format_id = content_type
2620 if mpd_id:
2621 format_id = mpd_id + '-' + format_id
171e59ed
F
2622 if content_type in ('video', 'audio'):
2623 f = {
cdb19aa4 2624 'format_id': format_id,
171e59ed
F
2625 'manifest_url': mpd_url,
2626 'ext': mimetype2ext(mime_type),
2627 'width': int_or_none(representation_attrib.get('width')),
2628 'height': int_or_none(representation_attrib.get('height')),
2629 'tbr': float_or_none(bandwidth, 1000),
2630 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2631 'fps': int_or_none(representation_attrib.get('frameRate')),
2632 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2633 'format_note': 'DASH %s' % content_type,
2634 'filesize': filesize,
2635 'container': mimetype2ext(mime_type) + '_dash',
2636 }
2637 f.update(parse_codecs(representation_attrib.get('codecs')))
2638 elif content_type == 'text':
2639 f = {
2640 'ext': mimetype2ext(mime_type),
2641 'manifest_url': mpd_url,
2642 'filesize': filesize,
2643 }
cdb19aa4 2644 elif mime_type == 'image/jpeg':
2645 # See test case in VikiIE
2646 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2647 f = {
2648 'format_id': format_id,
2649 'ext': 'mhtml',
2650 'manifest_url': mpd_url,
2651 'format_note': 'DASH storyboards (jpeg)',
2652 'acodec': 'none',
2653 'vcodec': 'none',
2654 }
1bac3455 2655 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2656
e228616c 2657 def prepare_template(template_name, identifiers):
eca1f0d1
S
2658 tmpl = representation_ms_info[template_name]
2659 # First of, % characters outside $...$ templates
2660 # must be escaped by doubling for proper processing
2661 # by % operator string formatting used further (see
067aa17e 2662 # https://github.com/ytdl-org/youtube-dl/issues/16867).
eca1f0d1
S
2663 t = ''
2664 in_template = False
2665 for c in tmpl:
2666 t += c
2667 if c == '$':
2668 in_template = not in_template
2669 elif c == '%' and not in_template:
2670 t += c
2671 # Next, $...$ templates are translated to their
2672 # %(...) counterparts to be used with % operator
cdb19aa4 2673 if representation_id is not None:
2674 t = t.replace('$RepresentationID$', representation_id)
e228616c
S
2675 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2676 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2677 t.replace('$$', '$')
2678 return t
2679
2680 # @initialization is a regular template like @media one
2681 # so it should be handled just the same way (see
067aa17e 2682 # https://github.com/ytdl-org/youtube-dl/issues/11605)
e228616c
S
2683 if 'initialization' in representation_ms_info:
2684 initialization_template = prepare_template(
2685 'initialization',
2686 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2687 # $Time$ shall not be included for @initialization thus
2688 # only $Bandwidth$ remains
2689 ('Bandwidth', ))
2690 representation_ms_info['initialization_url'] = initialization_template % {
2691 'Bandwidth': bandwidth,
2692 }
2693
1141e910
S
2694 def location_key(location):
2695 return 'url' if re.match(r'^https?://', location) else 'path'
2696
e228616c
S
2697 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2698
2699 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2700 media_location_key = location_key(media_template)
f0948348
S
2701
2702 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2703 # can't be used at the same time
b4c1d6e8
S
2704 if '%(Number' in media_template and 's' not in representation_ms_info:
2705 segment_duration = None
c110944f 2706 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2707 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2708 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2709 representation_ms_info['fragments'] = [{
1141e910 2710 media_location_key: media_template % {
b4c1d6e8 2711 'Number': segment_number,
e228616c 2712 'Bandwidth': bandwidth,
b4c1d6e8
S
2713 },
2714 'duration': segment_duration,
2715 } for segment_number in range(
2716 representation_ms_info['start_number'],
2717 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2718 else:
b4c1d6e8
S
2719 # $Number*$ or $Time$ in media template with S list available
2720 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2721 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
b4c1d6e8 2722 representation_ms_info['fragments'] = []
f0948348 2723 segment_time = 0
b4c1d6e8
S
2724 segment_d = None
2725 segment_number = representation_ms_info['start_number']
f0948348
S
2726
2727 def add_segment_url():
b4c1d6e8
S
2728 segment_url = media_template % {
2729 'Time': segment_time,
e228616c 2730 'Bandwidth': bandwidth,
b4c1d6e8
S
2731 'Number': segment_number,
2732 }
b4c1d6e8 2733 representation_ms_info['fragments'].append({
1141e910 2734 media_location_key: segment_url,
b4c1d6e8
S
2735 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2736 })
f0948348
S
2737
2738 for num, s in enumerate(representation_ms_info['s']):
2739 segment_time = s.get('t') or segment_time
b4c1d6e8 2740 segment_d = s['d']
f0948348 2741 add_segment_url()
b4c1d6e8 2742 segment_number += 1
f0948348 2743 for r in range(s.get('r', 0)):
b4c1d6e8 2744 segment_time += segment_d
f0948348 2745 add_segment_url()
b4c1d6e8
S
2746 segment_number += 1
2747 segment_time += segment_d
2748 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2749 # No media template
2750 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2751 # or any YouTube dashsegments video
2752 fragments = []
d04621da
S
2753 segment_index = 0
2754 timescale = representation_ms_info['timescale']
2755 for s in representation_ms_info['s']:
2756 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2757 for r in range(s.get('r', 0) + 1):
1141e910 2758 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2759 fragments.append({
1141e910 2760 location_key(segment_uri): segment_uri,
d04621da 2761 'duration': duration,
b4c1d6e8 2762 })
d04621da 2763 segment_index += 1
b4c1d6e8 2764 representation_ms_info['fragments'] = fragments
41bf647e
PN
2765 elif 'segment_urls' in representation_ms_info:
2766 # Segment URLs with no SegmentTimeline
2767 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
067aa17e 2768 # https://github.com/ytdl-org/youtube-dl/pull/14844
41bf647e 2769 fragments = []
603fc4e0
S
2770 segment_duration = float_or_none(
2771 representation_ms_info['segment_duration'],
2772 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2773 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2774 fragment = {
41bf647e 2775 location_key(segment_url): segment_url,
603fc4e0
S
2776 }
2777 if segment_duration:
2778 fragment['duration'] = segment_duration
2779 fragments.append(fragment)
41bf647e 2780 representation_ms_info['fragments'] = fragments
79d2077e
S
2781 # If there is a fragments key available then we correctly recognized fragmented media.
2782 # Otherwise we will assume unfragmented media with direct access. Technically, such
2783 # assumption is not necessarily correct since we may simply have no support for
2784 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
86f4d14f 2785 if 'fragments' in representation_ms_info:
1bac3455 2786 f.update({
79d2077e
S
2787 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2788 'url': mpd_url or base_url,
1141e910 2789 'fragment_base_url': base_url,
b4c1d6e8 2790 'fragments': [],
cdb19aa4 2791 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
df374b52 2792 })
1bac3455 2793 if 'initialization_url' in representation_ms_info:
e228616c 2794 initialization_url = representation_ms_info['initialization_url']
1bac3455 2795 if not f.get('url'):
2796 f['url'] = initialization_url
1141e910 2797 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2798 f['fragments'].extend(representation_ms_info['fragments'])
79d2077e
S
2799 else:
2800 # Assuming direct URL to unfragmented media.
2801 f['url'] = base_url
cdb19aa4 2802 if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
fd76a142
F
2803 formats.append(f)
2804 elif content_type == 'text':
2805 subtitles.setdefault(lang or 'und', []).append(f)
17b598d3 2806 else:
1bac3455 2807 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
171e59ed 2808 return formats, subtitles
17b598d3 2809
fd76a142
F
2810 def _extract_ism_formats(self, *args, **kwargs):
2811 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2812 if subs:
2813 self.report_warning(bug_reports_message(
2814 "Ignoring subtitle tracks found in the ISM manifest; "
2815 "if any subtitle tracks are missing,"
2816 ))
2817 return fmts
2818
2819 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2820 res = self._download_xml_handle(
b2758123 2821 ism_url, video_id,
37a3bb66 2822 note='Downloading ISM manifest' if note is None else note,
2823 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2824 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2825 if res is False:
fd76a142 2826 return [], {}
47a5cb77 2827 ism_doc, urlh = res
13b08034 2828 if ism_doc is None:
fd76a142 2829 return [], {}
b2758123 2830
fd76a142 2831 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 2832
fd76a142 2833 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2834 """
2835 Parse formats from ISM manifest.
2836 References:
2837 1. [MS-SSTR]: Smooth Streaming Protocol,
2838 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2839 """
06869367 2840 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 2841 return [], {}
a06916d9 2842 if (not self.get_param('allow_unplayable_formats')
06869367 2843 and ism_doc.find('Protection') is not None):
fd76a142 2844 return [], {}
b2758123 2845
b2758123
RA
2846 duration = int(ism_doc.attrib['Duration'])
2847 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2848
2849 formats = []
fd76a142 2850 subtitles = {}
b2758123
RA
2851 for stream in ism_doc.findall('StreamIndex'):
2852 stream_type = stream.get('Type')
fd76a142 2853 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
2854 continue
2855 url_pattern = stream.attrib['Url']
2856 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2857 stream_name = stream.get('Name')
fd76a142 2858 stream_language = stream.get('Language', 'und')
b2758123 2859 for track in stream.findall('QualityLevel'):
e2efe599 2860 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
b2758123 2861 # TODO: add support for WVC1 and WMAP
66a1b864 2862 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
b2758123
RA
2863 self.report_warning('%s is not a supported codec' % fourcc)
2864 continue
2865 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2866 # [1] does not mention Width and Height attributes. However,
2867 # they're often present while MaxWidth and MaxHeight are
2868 # missing, so should be used as fallbacks
2869 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2870 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2871 sampling_rate = int_or_none(track.get('SamplingRate'))
2872
2873 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2874 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2875
2876 fragments = []
2877 fragment_ctx = {
2878 'time': 0,
2879 }
2880 stream_fragments = stream.findall('c')
2881 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2882 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2883 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2884 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2885 if not fragment_ctx['duration']:
2886 try:
2887 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2888 except IndexError:
2889 next_fragment_time = duration
1616f9b4 2890 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2891 for _ in range(fragment_repeat):
2892 fragments.append({
1616f9b4 2893 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2894 'duration': fragment_ctx['duration'] / stream_timescale,
2895 })
2896 fragment_ctx['time'] += fragment_ctx['duration']
2897
2898 format_id = []
2899 if ism_id:
2900 format_id.append(ism_id)
2901 if stream_name:
2902 format_id.append(stream_name)
2903 format_id.append(compat_str(tbr))
2904
fd76a142
F
2905 if stream_type == 'text':
2906 subtitles.setdefault(stream_language, []).append({
2907 'ext': 'ismt',
2908 'protocol': 'ism',
2909 'url': ism_url,
2910 'manifest_url': ism_url,
2911 'fragments': fragments,
2912 '_download_params': {
2913 'stream_type': stream_type,
2914 'duration': duration,
2915 'timescale': stream_timescale,
2916 'fourcc': fourcc,
2917 'language': stream_language,
2918 'codec_private_data': track.get('CodecPrivateData'),
2919 }
2920 })
2921 elif stream_type in ('video', 'audio'):
2922 formats.append({
2923 'format_id': '-'.join(format_id),
2924 'url': ism_url,
2925 'manifest_url': ism_url,
2926 'ext': 'ismv' if stream_type == 'video' else 'isma',
2927 'width': width,
2928 'height': height,
2929 'tbr': tbr,
2930 'asr': sampling_rate,
2931 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2932 'acodec': 'none' if stream_type == 'video' else fourcc,
2933 'protocol': 'ism',
2934 'fragments': fragments,
2935 '_download_params': {
2936 'stream_type': stream_type,
2937 'duration': duration,
2938 'timescale': stream_timescale,
2939 'width': width or 0,
2940 'height': height or 0,
2941 'fourcc': fourcc,
2942 'language': stream_language,
2943 'codec_private_data': track.get('CodecPrivateData'),
2944 'sampling_rate': sampling_rate,
2945 'channels': int_or_none(track.get('Channels', 2)),
2946 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2947 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2948 },
2949 })
2950 return formats, subtitles
b2758123 2951
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Extract media entries from HTML5 <video>/<audio> tags in a webpage.

        Handles plain src attributes, nested <source> tags (including m3u8/mpd
        manifests) and <track> subtitle tags. Returns a list of dicts, one per
        media tag, each carrying 'formats', 'subtitles' and 'thumbnail'.
        """
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a MIME type like 'video/mp4; codecs="avc1..."' into
            # format fields ('ext', 'vcodec'/'acodec' via parse_codecs).
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into multiple formats, plain URLs yield a single one.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first (no inner content), ...
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # ... then open/close pairs, whose inner content may hold <source>/<track>.
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to parsing "1080p"-style labels.
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                # Some sites check the Referer when serving the media files.
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3074
f6a1d69a
F
3075 def _extract_akamai_formats(self, *args, **kwargs):
3076 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3077 if subs:
3078 self.report_warning(bug_reports_message(
3079 "Ignoring subtitle tracks found in the manifests; "
3080 "if any subtitle tracks are missing,"
3081 ))
3082 return fmts
3083
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        """Extract HDS, HLS and (when possible) progressive HTTP formats from
        an Akamai streaming manifest URL.

        hosts may map 'hds'/'hls'/'http' to alternate host names for each
        delivery protocol. Returns (formats, subtitles).
        """
        # Token-signed URLs must not be rewritten or the signature breaks.
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        # Derive the HDS (f4m) manifest URL from the HLS one: /i/ -> /z/.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # Segment URLs need the same hdcore parameter as the manifest.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        # Derive the HLS manifest URL (inverse rewrite: /z/ -> /i/).
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            # The csmil URL embeds the per-quality suffixes as group 2.
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # +1 allows for one extra audio-only rendition in the master playlist.
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        # Only video renditions consume a quality suffix.
                        i += 1

        return formats, subtitles
c7c43a93 3141
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Extract formats from a Wowza streaming engine URL by probing the
        HLS, HDS, DASH and RTMP/RTSP endpoints it conventionally exposes.

        skip_protocols lists protocol names ('m3u8', 'f4m', 'dash', 'smil',
        'rtmp', 'rtsp') not to probe. (Note: the default list is never
        mutated, so the mutable default is harmless here.)
        """
        query = compat_urlparse.urlparse(url).query
        # Strip any explicit manifest filename; we re-append per protocol below.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Keep the 's' (https) when the input URL was secure.
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            # Re-attach the original query string to each probed manifest URL.
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            # SMIL-backed streams: derive RTMP formats from the SMIL manifest
            # and synthesize a matching RTSP variant for each.
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            # No SMIL: just point RTMP/RTSP at the base URL directly.
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
3194
c73e330e 3195 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3196 mobj = re.search(
ac9c69ac 3197 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3198 webpage)
3199 if mobj:
c73e330e
RU
3200 try:
3201 jwplayer_data = self._parse_json(mobj.group('options'),
3202 video_id=video_id,
3203 transform_source=transform_source)
3204 except ExtractorError:
3205 pass
3206 else:
3207 if isinstance(jwplayer_data, dict):
3208 return jwplayer_data
a4a554a7
YCH
3209
3210 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3211 jwplayer_data = self._find_jwplayer_data(
3212 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3213 return self._parse_jwplayer_data(
3214 jwplayer_data, video_id, *args, **kwargs)
3215
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a parsed JWPlayer config dict into an info dict (single entry)
        or a playlist result (multiple entries).

        Normalizes several legacy JWPlayer config shapes before processing.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Only caption/subtitle tracks; skip chapters, thumbnails etc.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube URL is delegated to the YouTube extractor.
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3283
ed0cf9b3
S
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert JWPlayer 'sources' entries into a list of format dicts.

        Dispatches on the declared source type / URL extension (HLS, DASH,
        SMIL, audio-only, RTMP, progressive) and deduplicates by URL.
        """
        urls = []  # already-seen source URLs, for dedup
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split "rtmp://host/app/mp4:path" into URL + play path.
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3347
f4b1c7ad
PH
3348 def _live_title(self, name):
3349 """ Generate the title for a live video """
3350 now = datetime.datetime.now()
611c1dd9 3351 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3352 return name + ' ' + now_str
3353
b14f3a4c
PH
3354 def _int(self, v, name, fatal=False, **kwargs):
3355 res = int_or_none(v, **kwargs)
3356 if 'get_attr' in kwargs:
3357 print(getattr(v, kwargs['get_attr']))
3358 if res is None:
3359 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3360 if fatal:
3361 raise ExtractorError(msg)
3362 else:
6a39ee13 3363 self.report_warning(msg)
b14f3a4c
PH
3364 return res
3365
3366 def _float(self, v, name, fatal=False, **kwargs):
3367 res = float_or_none(v, **kwargs)
3368 if res is None:
3369 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3370 if fatal:
3371 raise ExtractorError(msg)
3372 else:
6a39ee13 3373 self.report_warning(msg)
b14f3a4c
PH
3374 return res
3375
40e41780
TF
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        """Add a cookie with the given attributes to the downloader's cookiejar."""
        cookie = compat_cookiejar_Cookie(
            # Positional args follow http.cookiejar.Cookie: version=0,
            # port_specified=(port given), domain_specified=True,
            # domain_initial_dot per leading '.', path_specified=True,
            # comment=None, comment_url=None.
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self._downloader.cookiejar.set_cookie(cookie)
3383
    def _get_cookies(self, url):
        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
        # Build a dummy request and let the cookiejar populate its
        # Cookie header, then parse that header back into a SimpleCookie.
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
799207e8 3389
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            if sys.version_info[0] >= 3:
                # Python 3 http headers arrive latin-1-decoded; round-trip
                # through iso-8859-1 to recover the raw bytes first.
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # Non-greedy match picks the FIRST "name=value; ...Domain=..."
            # occurrence in the joined Set-Cookie header values.
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                # Re-set the cookie so the first value wins over the last.
                self._set_cookie(domain, cookie, value)
                break
3416
05900629
PH
3417 def get_testcases(self, include_onlymatching=False):
3418 t = getattr(self, '_TEST', None)
3419 if t:
3420 assert not hasattr(self, '_TESTS'), \
3421 '%s has _TEST and _TESTS' % type(self).__name__
3422 tests = [t]
3423 else:
3424 tests = getattr(self, '_TESTS', [])
3425 for t in tests:
3426 if not include_onlymatching and t.get('only_matching', False):
3427 continue
3428 t['name'] = type(self).__name__[:-len('IE')]
3429 yield t
3430
3431 def is_suitable(self, age_limit):
3432 """ Test whether the extractor is generally suitable for the given
3433 age limit (i.e. pornographic sites are not, all others usually are) """
3434
3435 any_restricted = False
3436 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3437 if tc.get('playlist', []):
05900629
PH
3438 tc = tc['playlist'][0]
3439 is_restricted = age_restricted(
3440 tc.get('info_dict', {}).get('age_limit'), age_limit)
3441 if not is_restricted:
3442 return True
3443 any_restricted = any_restricted or is_restricted
3444 return not any_restricted
3445
a504ced0 3446 def extract_subtitles(self, *args, **kwargs):
a06916d9 3447 if (self.get_param('writesubtitles', False)
3448 or self.get_param('listsubtitles')):
9868ea49
JMF
3449 return self._get_subtitles(*args, **kwargs)
3450 return {}
a504ced0
JMF
3451
    def _get_subtitles(self, *args, **kwargs):
        # Override in subclasses that support subtitle extraction.
        raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3454
912e0b7e
YCH
3455 @staticmethod
3456 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3457 """ Merge subtitle items for one language. Items with duplicated URLs
3458 will be dropped. """
3459 list1_urls = set([item['url'] for item in subtitle_list1])
3460 ret = list(subtitle_list1)
3461 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3462 return ret
3463
3464 @classmethod
46890374 3465 def _merge_subtitles(cls, *dicts, target=None):
19bb3920 3466 """ Merge subtitle dictionaries, language by language. """
19bb3920
F
3467 if target is None:
3468 target = {}
3469 for d in dicts:
3470 for lang, subs in d.items():
3471 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3472 return target
912e0b7e 3473
360e1ca5 3474 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3475 if (self.get_param('writeautomaticsub', False)
3476 or self.get_param('listsubtitles')):
9868ea49
JMF
3477 return self._get_automatic_captions(*args, **kwargs)
3478 return {}
360e1ca5
JMF
3479
    def _get_automatic_captions(self, *args, **kwargs):
        # Override in subclasses that support automatic captions.
        raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3482
d77ab8e2 3483 def mark_watched(self, *args, **kwargs):
a06916d9 3484 if (self.get_param('mark_watched', False)
3089bc74 3485 and (self._get_login_info()[0] is not None
a06916d9 3486 or self.get_param('cookiefile') is not None)):
d77ab8e2
S
3487 self._mark_watched(*args, **kwargs)
3488
    def _mark_watched(self, *args, **kwargs):
        # Override in subclasses that support marking videos as watched.
        raise NotImplementedError('This method must be implemented by subclasses')
3491
38cce791
YCH
3492 def geo_verification_headers(self):
3493 headers = {}
a06916d9 3494 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3495 if geo_verification_proxy:
3496 headers['Ytdl-request-proxy'] = geo_verification_proxy
3497 return headers
3498
98763ee3
YCH
3499 def _generic_id(self, url):
3500 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3501
3502 def _generic_title(self, url):
3503 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3504
c224251a 3505 @staticmethod
b0089e89 3506 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3507 all_known = all(map(
3508 lambda x: x is not None,
3509 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3510 return (
3511 'private' if is_private
3512 else 'premium_only' if needs_premium
3513 else 'subscriber_only' if needs_subscription
3514 else 'needs_auth' if needs_auth
3515 else 'unlisted' if is_unlisted
3516 else 'public' if all_known
3517 else None)
3518
4bb6b02f 3519 def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
3520 '''
3521 @returns A list of values for the extractor argument given by "key"
3522 or "default" if no such key is present
3523 @param default The default value to return when the key is not present (default: [])
3524 @param casesense When false, the values are converted to lower case
3525 '''
3526 val = traverse_obj(
5d3a0e79 3527 self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
4bb6b02f 3528 if val is None:
3529 return [] if default is NO_DEFAULT else default
3530 return list(val) if casesense else [x.lower() for x in val]
5d3a0e79 3531
8dbe9899 3532
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches "<key>:<query>", "<key>N:<query>" (N >= 1) or "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the result-count prefix from the search URL and delegate to
        _get_n_results with the clamped number of results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: return a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum with a warning.
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Override in subclasses.
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY