]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
[Voot] Add VootSeriesIE (#351)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
fd475508 1# coding: utf-8
6a3828fd 2from __future__ import unicode_literals
f1a9d64e 3
d6983cb4 4import base64
f4b1c7ad 5import datetime
3ec05685 6import hashlib
3d3538e4 7import json
4094b6e3 8import netrc
d6983cb4 9import os
773f291d 10import random
d6983cb4 11import re
d6983cb4 12import sys
4094b6e3 13import time
1bac3455 14import math
d6983cb4 15
8c25f81b 16from ..compat import (
6c22cee6 17 compat_cookiejar_Cookie,
f7ad7160 18 compat_cookies_SimpleCookie,
ee0ba927 19 compat_etree_Element,
e9c0cdd3 20 compat_etree_fromstring,
e64b7569 21 compat_getpass,
d391b7e2 22 compat_integer_types,
d6983cb4 23 compat_http_client,
e9c0cdd3
YCH
24 compat_os_name,
25 compat_str,
d6983cb4 26 compat_urllib_error,
98763ee3 27 compat_urllib_parse_unquote,
15707c7e 28 compat_urllib_parse_urlencode,
41d06b04 29 compat_urllib_request,
f0b5d6af 30 compat_urlparse,
e01c3d2e 31 compat_xml_parse_error,
8c25f81b 32)
eb8a4433 33from ..downloader import FileDownloader
48107c19
S
34from ..downloader.f4m import (
35 get_base_url,
36 remove_encrypted_media,
37)
8c25f81b 38from ..utils import (
c342041f 39 NO_DEFAULT,
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
d6983cb4
PH
43 clean_html,
44 compiled_regex_type,
70f0f5a8 45 determine_ext,
46b18f23 46 determine_protocol,
d493f15c 47 dict_get,
9b9c5355 48 error_to_compat_str,
d6983cb4 49 ExtractorError,
46b18f23 50 extract_attributes,
97f4aecf 51 fix_xml_ampersands,
b14f3a4c 52 float_or_none,
773f291d
S
53 GeoRestrictedError,
54 GeoUtils,
31bb8d3f 55 int_or_none,
a4a554a7 56 js_to_json,
0685d972 57 JSON_LD_RE,
46b18f23 58 mimetype2ext,
3158150c 59 network_exceptions,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
46b18f23
JH
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
55b3e45b 67 RegexNotFoundError,
5c2266df 68 sanitized_Request,
46b18f23 69 sanitize_filename,
d493f15c 70 str_or_none,
ce5b9040 71 str_to_int,
f856816b 72 strip_or_none,
f38de77f 73 unescapeHTML,
647eab45 74 unified_strdate,
6b3a3098 75 unified_timestamp,
46b18f23
JH
76 update_Request,
77 update_url_query,
78 urljoin,
a107193e 79 url_basename,
bebef109 80 url_or_none,
a6571f10 81 xpath_element,
8d6765cf
S
82 xpath_text,
83 xpath_with_ns,
d6983cb4 84)
c342041f 85
d6983cb4
PH
86
87class InfoExtractor(object):
88 """Information Extractor class.
89
90 Information extractors are the classes that, given a URL, extract
91 information about the video (or videos) the URL refers to. This
92 information includes the real video URL, the video title, author and
93 others. The information is stored in a dictionary which is then
5d380852 94 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
95 information possibly downloading the video to the file system, among
96 other possible outcomes.
97
cf0649f8 98 The type field determines the type of the result.
fed5d032
PH
99 By far the most common value (and the default if _type is missing) is
100 "video", which indicates a single video.
101
102 For a video, the dictionaries must include the following fields:
d6983cb4
PH
103
104 id: Video identifier.
d6983cb4 105 title: Video title, unescaped.
d67b0b15 106
f49d89ee 107 Additionally, it must contain either a formats entry or a url one:
d67b0b15 108
f49d89ee
PH
109 formats: A list of dictionaries for each format available, ordered
110 from worst to best quality.
111
112 Potential fields:
c790e93a
S
113 * url The mandatory URL representing the media:
114 for plain file media - HTTP URL of this file,
115 for RTMP - RTMP URL,
116 for HLS - URL of the M3U8 media playlist,
117 for HDS - URL of the F4M manifest,
79d2077e
S
118 for DASH
119 - HTTP URL to plain file media (in case of
120 unfragmented media)
121 - URL of the MPD manifest or base URL
122 representing the media if MPD manifest
8ed7a233 123 is parsed from a string (in case of
79d2077e 124 fragmented media)
c790e93a 125 for MSS - URL of the ISM manifest.
86f4d14f
S
126 * manifest_url
127 The URL of the manifest file in case of
c790e93a
S
128 fragmented media:
129 for HLS - URL of the M3U8 master playlist,
130 for HDS - URL of the F4M manifest,
131 for DASH - URL of the MPD manifest,
132 for MSS - URL of the ISM manifest.
10952eb2 133 * ext Will be calculated from URL if missing
d67b0b15
PH
134 * format A human-readable description of the format
135 ("mp4 container with h264/opus").
136 Calculated from the format_id, width, height.
137 and format_note fields if missing.
138 * format_id A short description of the format
5d4f3985
PH
139 ("mp4_h264_opus" or "19").
140 Technically optional, but strongly recommended.
d67b0b15
PH
141 * format_note Additional info about the format
142 ("3D" or "DASH video")
143 * width Width of the video, if known
144 * height Height of the video, if known
f49d89ee 145 * resolution Textual description of width and height
7217e148 146 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
147 * abr Average audio bitrate in KBit/s
148 * acodec Name of the audio codec in use
dd27fd17 149 * asr Audio sampling rate in Hertz
d67b0b15 150 * vbr Average video bitrate in KBit/s
fbb21cf5 151 * fps Frame rate
d67b0b15 152 * vcodec Name of the video codec in use
1394ce65 153 * container Name of the container format
d67b0b15 154 * filesize The number of bytes, if known in advance
9732d77e 155 * filesize_approx An estimate for the number of bytes
d67b0b15 156 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
157 * protocol The protocol that will be used for the actual
158 download, lower-case.
0fa9a1e2 159 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
af7d5a63 160 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
161 * fragment_base_url
162 Base URL for fragments. Each fragment's path
163 value (if present) will be relative to
164 this URL.
165 * fragments A list of fragments of a fragmented media.
166 Each fragment entry must contain either an url
167 or a path. If an url is present it should be
168 considered by a client. Otherwise both path and
169 fragment_base_url must be present. Here is
170 the list of all potential fields:
171 * "url" - fragment's URL
172 * "path" - fragment's path relative to
173 fragment_base_url
a0d5077c
S
174 * "duration" (optional, int or float)
175 * "filesize" (optional, int)
f49d89ee 176 * preference Order number of this format. If this field is
08d13955 177 present and not None, the formats get sorted
38d63d84 178 by this field, regardless of all other values.
f49d89ee
PH
179 -1 for default (order by other properties),
180 -2 or smaller for less than default.
e65566a9
PH
181 < -1000 to hide the format (if there is
182 another one which is strictly better)
32f90364
PH
183 * language Language code, e.g. "de" or "en-US".
184 * language_preference Is this in the language mentioned in
185 the URL?
aff2f4f4
PH
186 10 if it's what the URL is about,
187 -1 for default (don't know),
188 -10 otherwise, other values reserved for now.
5d73273f
PH
189 * quality Order number of the video quality of this
190 format, irrespective of the file format.
191 -1 for default (order by other properties),
192 -2 or smaller for less than default.
c64ed2a3
PH
193 * source_preference Order number for this video source
194 (quality takes higher priority)
195 -1 for default (order by other properties),
196 -2 or smaller for less than default.
d769be6c
PH
197 * http_headers A dictionary of additional HTTP headers
198 to add to the request.
6271f1ca 199 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
200 video's pixels are not square.
201 width : height ratio as float.
202 * no_resume The server does not support resuming the
203 (HTTP or RTMP) download. Boolean.
00c97e3e
S
204 * downloader_options A dictionary of downloader options as
205 described in FileDownloader
3dee7826 206
c0ba0f48 207 url: Final video URL.
d6983cb4 208 ext: Video filename extension.
d67b0b15
PH
209 format: The video format, defaults to ext (used for --get-format)
210 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 211
d6983cb4
PH
212 The following fields are optional:
213
f5e43bc6 214 alt_title: A secondary title of the video.
0afef30b
PH
215 display_id An alternative identifier for the video, not necessarily
216 unique, but available before title. Typically, id is
217 something like "4234987", title "Dancing naked mole rats",
218 and display_id "dancing-naked-mole-rats"
d5519808 219 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 220 * "id" (optional, string) - Thumbnail format ID
d5519808 221 * "url"
cfb56d1a 222 * "preference" (optional, int) - quality of the image
d5519808
PH
223 * "width" (optional, int)
224 * "height" (optional, int)
5e1c39ac 225 * "resolution" (optional, string "{width}x{height}",
d5519808 226 deprecated)
2de624fd 227 * "filesize" (optional, int)
d6983cb4 228 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 229 description: Full video description.
d6983cb4 230 uploader: Full name of the video uploader.
2bc0c46f 231 license: License name the video is licensed under.
8a92e51c 232 creator: The creator of the video.
10db0d2f 233 release_timestamp: UNIX timestamp of the moment the video was released.
8aab976b 234 release_date: The date (YYYYMMDD) when the video was released.
10db0d2f 235 timestamp: UNIX timestamp of the moment the video was uploaded
d6983cb4 236 upload_date: Video upload date (YYYYMMDD).
955c4514 237 If not explicitly set, calculated from timestamp.
d6983cb4 238 uploader_id: Nickname or id of the video uploader.
7bcd2830 239 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 240 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 241 Note that channel fields may or may not repeat uploader
6f1f59f3
S
242 fields. This depends on a particular extractor.
243 channel_id: Id of the channel.
244 channel_url: Full URL to a channel webpage.
da9ec3b9 245 location: Physical location where the video was filmed.
a504ced0 246 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
247 {tag: subformats}. "tag" is usually a language code, and
248 "subformats" is a list sorted from lower to higher
249 preference, each element is a dictionary with the "ext"
250 entry and one of:
a504ced0 251 * "data": The subtitles file contents
10952eb2 252 * "url": A URL pointing to the subtitles file
2412044c 253 It can optionally also have:
254 * "name": Name or description of the subtitles
4bba3716 255 "ext" will be calculated from URL if missing
e167860c 256 automatic_captions: Like 'subtitles'; contains automatically generated
257 captions instead of normal subtitles
62d231c0 258 duration: Length of the video in seconds, as an integer or float.
f3d29461 259 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
260 like_count: Number of positive ratings of the video
261 dislike_count: Number of negative ratings of the video
02835c6b 262 repost_count: Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 264 comment_count: Number of comments on the video
dd622d7c
PH
265 comments: A list of comments, each with one or more of the following
266 properties (all but one of text or html optional):
267 * "author" - human-readable name of the comment author
268 * "author_id" - user ID of the comment author
a1c5d2ca 269 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
270 * "id" - Comment ID
271 * "html" - Comment as HTML
272 * "text" - Plain text of the comment
273 * "timestamp" - UNIX timestamp of comment
274 * "parent" - ID of the comment this one is replying to.
275 Set to "root" to indicate that this is a
276 comment to the original video.
a1c5d2ca
M
277 * "like_count" - Number of positive ratings of the comment
278 * "dislike_count" - Number of negative ratings of the comment
279 * "is_favorited" - Whether the comment is marked as
280 favorite by the video uploader
281 * "author_is_uploader" - Whether the comment is made by
282 the video uploader
8dbe9899 283 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 284 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
285 should allow to get the same result again. (It will be set
286 by YoutubeDL if it's missing)
ad3bc6ac
PH
287 categories: A list of categories that the video falls in, for example
288 ["Sports", "Berlin"]
864f24bd 289 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
290 is_live: True, False, or None (=unknown). Whether this video is a
291 live stream that goes on instead of a fixed-length video.
f76ede8e 292 was_live: True, False, or None (=unknown). Whether this video was
293 originally a live stream.
7c80519c 294 start_time: Time in seconds where the reproduction should start, as
10952eb2 295 specified in the URL.
297a564b 296 end_time: Time in seconds where the reproduction should end, as
10952eb2 297 specified in the URL.
55949fed 298 chapters: A list of dictionaries, with the following entries:
299 * "start_time" - The start time of the chapter in seconds
300 * "end_time" - The end time of the chapter in seconds
301 * "title" (optional, string)
6cfda058 302 playable_in_embed: Whether this video is allowed to play in embedded
303 players on other sites. Can be True (=always allowed),
304 False (=never allowed), None (=unknown), or a string
c224251a
M
305 specifying the criteria for embedability (Eg: 'whitelist')
306 availability: Under what condition the video is available. One of
307 'private', 'premium_only', 'subscriber_only', 'needs_auth',
308 'unlisted' or 'public'. Use 'InfoExtractor._availability'
309 to set it
277d6ff5 310 __post_extractor: A function to be called just before the metadata is
311 written to either disk, logger or console. The function
312 must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
314 time-consuming to extract. Note that the fields thus
315 extracted will not be available to output template and
316 match_filter. So, only "comments" and "comment_count" are
317 currently allowed to be extracted via this method.
d6983cb4 318
7109903e
S
319 The following fields should only be used when the video belongs to some logical
320 chapter or section:
321
322 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
323 chapter_number: Number of the chapter the video belongs to, as an integer.
324 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
325
326 The following fields should only be used when the video is an episode of some
8d76bdf1 327 series, programme or podcast:
7109903e
S
328
329 series: Title of the series or programme the video episode belongs to.
330 season: Title of the season the video episode belongs to.
27bfd4e5
S
331 season_number: Number of the season the video episode belongs to, as an integer.
332 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
333 episode: Title of the video episode. Unlike mandatory video title field,
334 this field should denote the exact title of the video episode
335 without any kind of decoration.
27bfd4e5
S
336 episode_number: Number of the video episode within a season, as an integer.
337 episode_id: Id of the video episode, as a unicode string.
7109903e 338
7a93ab5f
S
339 The following fields should only be used when the media is a track or a part of
340 a music album:
341
342 track: Title of the track.
343 track_number: Number of the track within an album or a disc, as an integer.
344 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
345 as a unicode string.
346 artist: Artist(s) of the track.
347 genre: Genre(s) of the track.
348 album: Title of the album the track belongs to.
349 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
350 album_artist: List of all artists appeared on the album (e.g.
351 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
352 and compilations).
353 disc_number: Number of the disc or other physical medium the track belongs to,
354 as an integer.
355 release_year: Year (YYYY) when the album was released.
356
deefc05b 357 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 358
d838b1bd
PH
359 Unless mentioned otherwise, None is equivalent to absence of information.
360
fed5d032
PH
361
362 _type "playlist" indicates multiple videos.
b82f815f
PH
363 There must be a key "entries", which is a list, an iterable, or a PagedList
364 object, each element of which is a valid dictionary by this specification.
fed5d032 365
    Additionally, playlists can have "id", "title", and any other relevant
367 attributes with the same semantics as videos (see above).
fed5d032
PH
368
369
370 _type "multi_video" indicates that there are multiple videos that
371 form a single show, for examples multiple acts of an opera or TV episode.
372 It must have an entries key like a playlist and contain all the keys
373 required for a video at the same time.
374
375
376 _type "url" indicates that the video must be extracted from another
377 location, possibly by a different extractor. Its only required key is:
378 "url" - the next URL to extract.
f58766ce
PH
379 The key "ie_key" can be set to the class name (minus the trailing "IE",
380 e.g. "Youtube") if the extractor class is known in advance.
381 Additionally, the dictionary may have any properties of the resolved entity
382 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
383 known ahead of time.
384
385
386 _type "url_transparent" entities have the same specification as "url", but
387 indicate that the given additional information is more precise than the one
388 associated with the resolved URL.
389 This is useful when a site employs a video service that hosts the video and
390 its technical metadata, but that video service does not embed a useful
391 title, description etc.
392
393
d6983cb4
PH
394 Subclasses of this one should re-define the _real_initialize() and
395 _real_extract() methods and define a _VALID_URL regexp.
396 Probably, they should also be added to the list of extractors.
397
4248dad9 398 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
399 geo restriction bypass mechanisms for a particular extractor.
400 Though it won't disable explicit geo restriction bypass based on
504f20dd 401 country code provided with geo_bypass_country.
4248dad9
S
402
403 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
404 countries for this extractor. One of these countries will be used by
405 geo restriction bypass mechanism right away in order to bypass
504f20dd 406 geo restriction, of course, if the mechanism is not disabled.
773f291d 407
5f95927a
S
408 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
409 IP blocks in CIDR notation for this extractor. One of these IP blocks
410 will be used by geo restriction bypass mechanism similarly
504f20dd 411 to _GEO_COUNTRIES.
3ccdde8c 412
d6983cb4
PH
413 Finally, the _WORKING attribute should be set to False for broken IEs
414 in order to warn the users and skip the tests.
415 """
416
    # Whether _real_initialize() has already run for this instance
    # (flipped by initialize()).
    _ready = False
    # Downloader object set via set_downloader(); None until then.
    _downloader = None
    # Fake source IP used for geo-restriction bypass (None = not faking);
    # sent as X-Forwarded-For by _request_webpage().
    _x_forwarded_for_ip = None
    # May be set to False in a subclass to disable geo bypass mechanisms
    # for that extractor (see class docstring).
    _GEO_BYPASS = True
    # Optional list of presumably geo-unrestricted country codes.
    _GEO_COUNTRIES = None
    # Optional list of presumably geo-unrestricted IP blocks in CIDR notation.
    _GEO_IP_BLOCKS = None
    # Set to False for broken extractors to warn users and skip tests.
    _WORKING = True

    # Canned user-facing hints on how to supply account credentials.
    _LOGIN_HINTS = {
        'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
        'cookies': (
            'Use --cookies for the authentication. '
            'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
        'password': 'Use --username and --password or --netrc to provide account credentials',
    }
432
d6983cb4
PH
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Reset per-instance state; initialize() sets _ready to True after
        # _real_initialize() has run once.
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)
438
439 @classmethod
440 def suitable(cls, url):
441 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
442
443 # This does not use has/getattr intentionally - we want to know whether
444 # we have cached the regexp for *this* class, whereas getattr would also
445 # match the superclass
446 if '_VALID_URL_RE' not in cls.__dict__:
447 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
448 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 449
ed9266db
PH
450 @classmethod
451 def _match_id(cls, url):
452 if '_VALID_URL_RE' not in cls.__dict__:
453 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
454 m = cls._VALID_URL_RE.match(url)
455 assert m
1afd0b0d 456 return compat_str(m.group('id'))
ed9266db 457
d6983cb4
PH
    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        # _WORKING is set to False for broken extractors so that users can
        # be warned and the tests skipped (see class docstring).
        return cls._WORKING
462
463 def initialize(self):
464 """Initializes an instance (authentication, etc)."""
5f95927a
S
465 self._initialize_geo_bypass({
466 'countries': self._GEO_COUNTRIES,
467 'ip_blocks': self._GEO_IP_BLOCKS,
468 })
4248dad9
S
469 if not self._ready:
470 self._real_initialize()
471 self._ready = True
472
5f95927a 473 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
474 """
475 Initialize geo restriction bypass mechanism.
476
477 This method is used to initialize geo bypass mechanism based on faking
478 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 479 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
480 IP will be passed as X-Forwarded-For HTTP header in all subsequent
481 HTTP requests.
e39b5d4a
S
482
483 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
484 during the instance initialization with _GEO_COUNTRIES and
485 _GEO_IP_BLOCKS.
e39b5d4a 486
5f95927a 487 You may also manually call it from extractor's code if geo bypass
e39b5d4a 488 information is not available beforehand (e.g. obtained during
5f95927a
S
489 extraction) or due to some other reason. In this case you should pass
490 this information in geo bypass context passed as first argument. It may
491 contain following fields:
492
493 countries: List of geo unrestricted countries (similar
494 to _GEO_COUNTRIES)
495 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
496 (similar to _GEO_IP_BLOCKS)
497
e39b5d4a 498 """
773f291d 499 if not self._x_forwarded_for_ip:
5f95927a
S
500
501 # Geo bypass mechanism is explicitly disabled by user
a06916d9 502 if not self.get_param('geo_bypass', True):
5f95927a
S
503 return
504
505 if not geo_bypass_context:
506 geo_bypass_context = {}
507
508 # Backward compatibility: previously _initialize_geo_bypass
509 # expected a list of countries, some 3rd party code may still use
510 # it this way
511 if isinstance(geo_bypass_context, (list, tuple)):
512 geo_bypass_context = {
513 'countries': geo_bypass_context,
514 }
515
516 # The whole point of geo bypass mechanism is to fake IP
517 # as X-Forwarded-For HTTP header based on some IP block or
518 # country code.
519
520 # Path 1: bypassing based on IP block in CIDR notation
521
522 # Explicit IP block specified by user, use it right away
523 # regardless of whether extractor is geo bypassable or not
a06916d9 524 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
525
526 # Otherwise use random IP block from geo bypass context but only
527 # if extractor is known as geo bypassable
528 if not ip_block:
529 ip_blocks = geo_bypass_context.get('ip_blocks')
530 if self._GEO_BYPASS and ip_blocks:
531 ip_block = random.choice(ip_blocks)
532
533 if ip_block:
534 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
0760b0a7 535 self._downloader.write_debug(
536 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
5f95927a
S
537 return
538
539 # Path 2: bypassing based on country code
540
541 # Explicit country code specified by user, use it right away
542 # regardless of whether extractor is geo bypassable or not
a06916d9 543 country = self.get_param('geo_bypass_country', None)
5f95927a
S
544
545 # Otherwise use random country code from geo bypass context but
546 # only if extractor is known as geo bypassable
547 if not country:
548 countries = geo_bypass_context.get('countries')
549 if self._GEO_BYPASS and countries:
550 country = random.choice(countries)
551
552 if country:
553 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 554 self._downloader.write_debug(
555 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
d6983cb4
PH
556
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # Up to two attempts: a GeoRestrictedError may trigger one retry
            # with a faked X-Forwarded-For IP (see __maybe_fake_ip_and_retry).
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        # Record which faked IP produced this metadata so
                        # downstream processing can reuse it.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles')
                    if (subtitles and 'live_chat' in subtitles
                            and 'no-live-chat' in self.get_param('compat_opts', [])):
                        # youtube-dl compatibility: drop the live-chat
                        # "subtitles" track when the compat option asks for it.
                        del subtitles['live_chat']
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        # ExtractorError already carries full context — re-raise untouched;
        # only wrap the lower-level failures below.
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 584
    def __maybe_fake_ip_and_retry(self, countries):
        # Decide whether a geo-restriction failure should be retried with a
        # faked X-Forwarded-For IP. Applies only when: no explicit
        # geo_bypass_country was given, the extractor allows bypass
        # (_GEO_BYPASS), bypassing is enabled, no fake IP is in use yet, and
        # the error reported candidate countries. Returns True if a fake IP
        # was installed and the caller should retry.
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False
599
d6983cb4
PH
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # May be None; network helpers such as _request_webpage use it.
        self._downloader = downloader
603
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass
607
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
        pass
611
56c73665
JMF
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class name minus the trailing "IE" suffix (e.g. "YoutubeIE" -> "Youtube").
        return compat_str(cls.__name__[:-2])
56c73665 616
d6983cb4
PH
    @property
    def IE_NAME(self):
        # Same convention as ie_key(): type name without the trailing "IE".
        return compat_str(type(self).__name__[:-2])
d6983cb4 620
d391b7e2
S
621 @staticmethod
622 def __can_accept_status_code(err, expected_status):
623 assert isinstance(err, compat_urllib_error.HTTPError)
624 if expected_status is None:
625 return False
626 if isinstance(expected_status, compat_integer_types):
627 return err.code == expected_status
628 elif isinstance(expected_status, (list, tuple)):
629 return err.code in expected_status
630 elif callable(expected_status):
631 return expected_status(err.code) is True
632 else:
633 assert False
634
635 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
636 """
637 Return the response handle.
638
639 See _download_webpage docstring for arguments specification.
640 """
1cf376f5 641 if not self._downloader._first_webpage_request:
a06916d9 642 sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
1cf376f5 643 if sleep_interval > 0:
5ef7d9bd 644 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 645 time.sleep(sleep_interval)
646 else:
647 self._downloader._first_webpage_request = False
648
d6983cb4
PH
649 if note is None:
650 self.report_download_webpage(video_id)
651 elif note is not False:
7cc3570e 652 if video_id is None:
f1a9d64e 653 self.to_screen('%s' % (note,))
7cc3570e 654 else:
f1a9d64e 655 self.to_screen('%s: %s' % (video_id, note))
2132edaa
S
656
657 # Some sites check X-Forwarded-For HTTP header in order to figure out
658 # the origin of the client behind proxy. This allows bypassing geo
659 # restriction by faking this header's value to IP that belongs to some
660 # geo unrestricted country. We will do so once we encounter any
661 # geo restriction error.
662 if self._x_forwarded_for_ip:
663 if 'X-Forwarded-For' not in headers:
664 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
665
41d06b04
S
666 if isinstance(url_or_request, compat_urllib_request.Request):
667 url_or_request = update_Request(
668 url_or_request, data=data, headers=headers, query=query)
669 else:
cdfee168 670 if query:
671 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 672 if data is not None or headers:
41d06b04 673 url_or_request = sanitized_Request(url_or_request, data, headers)
d6983cb4 674 try:
dca08720 675 return self._downloader.urlopen(url_or_request)
3158150c 676 except network_exceptions as err:
d391b7e2
S
677 if isinstance(err, compat_urllib_error.HTTPError):
678 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
679 # Retain reference to error to prevent file object from
680 # being closed before it can be read. Works around the
681 # effects of <https://bugs.python.org/issue15002>
682 # introduced in Python 3.4.1.
683 err.fp._error = err
d391b7e2
S
684 return err.fp
685
aa94a6d3
PH
686 if errnote is False:
687 return False
d6983cb4 688 if errnote is None:
f1a9d64e 689 errnote = 'Unable to download webpage'
7f8b2714 690
9b9c5355 691 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
692 if fatal:
693 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
694 else:
6a39ee13 695 self.report_warning(errmsg)
7cc3570e 696 return False
d6983cb4 697
d391b7e2
S
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Returns False when the request failed and fatal is False.

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            # _request_webpage only returns False when fatal is False.
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
714
c9a77969
YCH
715 @staticmethod
716 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
717 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
718 if m:
719 encoding = m.group(1)
720 else:
0d75ae2c 721 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
722 webpage_bytes[:1024])
723 if m:
724 encoding = m.group(1).decode('ascii')
b60016e8
PH
725 elif webpage_bytes.startswith(b'\xff\xfe'):
726 encoding = 'utf-16'
f143d86a
PH
727 else:
728 encoding = 'utf-8'
c9a77969
YCH
729
730 return encoding
731
4457823d
S
    def __check_blocked(self, content):
        """Raise ExtractorError if the downloaded page is a known
        censorship/filtering interstitial rather than real content."""
        first_block = content[:512]
        # Websense corporate filter: title appears anywhere, marker near top
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government block page (TTK ISP / rkn.gov.ru blocklist)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
759
c9a77969
YCH
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of an already-open URL handle.

        Honours the dump_intermediate_pages and write_pages options, guesses
        the encoding when none is given, and runs censorship-block detection
        on the decoded text before returning it.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps binary-ish pages printable on any terminal
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename unique but bounded by hashing the tail
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name in the page/header - fall back to UTF-8
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
d6983cb4 796
d391b7e2
S
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        # Retry loop: only truncated reads (IncompleteRead) are retried,
        # up to `tries` attempts with `timeout` seconds between them.
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content
d6983cb4 854
e0d198c1
S
855 def _download_xml_handle(
856 self, url_or_request, video_id, note='Downloading XML',
857 errnote='Unable to download XML', transform_source=None,
d391b7e2
S
858 fatal=True, encoding=None, data=None, headers={}, query={},
859 expected_status=None):
860 """
ee0ba927 861 Return a tuple (xml as an compat_etree_Element, URL handle).
d391b7e2
S
862
863 See _download_webpage docstring for arguments specification.
864 """
e0d198c1
S
865 res = self._download_webpage_handle(
866 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
867 encoding=encoding, data=data, headers=headers, query=query,
868 expected_status=expected_status)
e0d198c1
S
869 if res is False:
870 return res
871 xml_string, urlh = res
872 return self._parse_xml(
873 xml_string, video_id, transform_source=transform_source,
874 fatal=fatal), urlh
875
d391b7e2
S
876 def _download_xml(
877 self, url_or_request, video_id,
878 note='Downloading XML', errnote='Unable to download XML',
879 transform_source=None, fatal=True, encoding=None,
880 data=None, headers={}, query={}, expected_status=None):
881 """
ee0ba927 882 Return the xml as an compat_etree_Element.
d391b7e2
S
883
884 See _download_webpage docstring for arguments specification.
885 """
e0d198c1
S
886 res = self._download_xml_handle(
887 url_or_request, video_id, note=note, errnote=errnote,
888 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
889 data=data, headers=headers, query=query,
890 expected_status=expected_status)
e0d198c1 891 return res if res is False else res[0]
e01c3d2e
S
892
893 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
e2b38da9
PH
894 if transform_source:
895 xml_string = transform_source(xml_string)
e01c3d2e
S
896 try:
897 return compat_etree_fromstring(xml_string.encode('utf-8'))
898 except compat_xml_parse_error as ve:
899 errmsg = '%s: Failed to parse XML ' % video_id
900 if fatal:
901 raise ExtractorError(errmsg, cause=ve)
902 else:
903 self.report_warning(errmsg + str(ve))
267ed0c5 904
0fe7783e
S
905 def _download_json_handle(
906 self, url_or_request, video_id, note='Downloading JSON metadata',
907 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
908 fatal=True, encoding=None, data=None, headers={}, query={},
909 expected_status=None):
910 """
911 Return a tuple (JSON object, URL handle).
912
913 See _download_webpage docstring for arguments specification.
914 """
0fe7783e 915 res = self._download_webpage_handle(
c9a77969 916 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
917 encoding=encoding, data=data, headers=headers, query=query,
918 expected_status=expected_status)
0fe7783e
S
919 if res is False:
920 return res
921 json_string, urlh = res
ebb64199 922 return self._parse_json(
0fe7783e
S
923 json_string, video_id, transform_source=transform_source,
924 fatal=fatal), urlh
925
926 def _download_json(
927 self, url_or_request, video_id, note='Downloading JSON metadata',
928 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
929 fatal=True, encoding=None, data=None, headers={}, query={},
930 expected_status=None):
931 """
932 Return the JSON object as a dict.
933
934 See _download_webpage docstring for arguments specification.
935 """
0fe7783e
S
936 res = self._download_json_handle(
937 url_or_request, video_id, note=note, errnote=errnote,
938 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
939 data=data, headers=headers, query=query,
940 expected_status=expected_status)
0fe7783e 941 return res if res is False else res[0]
ebb64199
TF
942
943 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
944 if transform_source:
945 json_string = transform_source(json_string)
3d3538e4
PH
946 try:
947 return json.loads(json_string)
948 except ValueError as ve:
e7b6d122
PH
949 errmsg = '%s: Failed to parse JSON ' % video_id
950 if fatal:
951 raise ExtractorError(errmsg, cause=ve)
952 else:
953 self.report_warning(errmsg + str(ve))
3d3538e4 954
adddc50c 955 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
956 return self._parse_json(
957 data[data.find('{'):data.rfind('}') + 1],
958 video_id, transform_source, fatal)
959
960 def _download_socket_json_handle(
961 self, url_or_request, video_id, note='Polling socket',
962 errnote='Unable to poll socket', transform_source=None,
963 fatal=True, encoding=None, data=None, headers={}, query={},
964 expected_status=None):
965 """
966 Return a tuple (JSON object, URL handle).
967
968 See _download_webpage docstring for arguments specification.
969 """
970 res = self._download_webpage_handle(
971 url_or_request, video_id, note, errnote, fatal=fatal,
972 encoding=encoding, data=data, headers=headers, query=query,
973 expected_status=expected_status)
974 if res is False:
975 return res
976 webpage, urlh = res
977 return self._parse_socket_response_as_json(
978 webpage, video_id, transform_source=transform_source,
979 fatal=fatal), urlh
980
981 def _download_socket_json(
982 self, url_or_request, video_id, note='Polling socket',
983 errnote='Unable to poll socket', transform_source=None,
984 fatal=True, encoding=None, data=None, headers={}, query={},
985 expected_status=None):
986 """
987 Return the JSON object as a dict.
988
989 See _download_webpage docstring for arguments specification.
990 """
991 res = self._download_socket_json_handle(
992 url_or_request, video_id, note=note, errnote=errnote,
993 transform_source=transform_source, fatal=fatal, encoding=encoding,
994 data=data, headers=headers, query=query,
995 expected_status=expected_status)
996 return res if res is False else res[0]
997
a06916d9 998 def report_warning(self, msg, video_id=None, *args, **kwargs):
f1a9d64e 999 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 1000 self._downloader.report_warning(
a06916d9 1001 '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)
f45f96f8 1002
a06916d9 1003 def to_screen(self, msg, *args, **kwargs):
d6983cb4 1004 """Print msg to screen, prefixing it with '[ie_name]'"""
a06916d9 1005 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1006
1007 def write_debug(self, msg, *args, **kwargs):
1008 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1009
1010 def get_param(self, name, default=None, *args, **kwargs):
1011 if self._downloader:
1012 return self._downloader.params.get(name, default, *args, **kwargs)
1013 return default
d6983cb4
PH
1014
1015 def report_extraction(self, id_or_name):
1016 """Report information extraction."""
f1a9d64e 1017 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
1018
1019 def report_download_webpage(self, video_id):
1020 """Report webpage download."""
f1a9d64e 1021 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
1022
1023 def report_age_confirmation(self):
1024 """Report attempt to confirm age."""
f1a9d64e 1025 self.to_screen('Confirming age')
d6983cb4 1026
fc79158d
JMF
1027 def report_login(self):
1028 """Report attempt to log in."""
f1a9d64e 1029 self.to_screen('Logging in')
fc79158d 1030
b7da73eb 1031 def raise_login_required(
9d5d4d64 1032 self, msg='This video is only available for registered users',
1033 metadata_available=False, method='any'):
a06916d9 1034 if metadata_available and self.get_param('ignore_no_formats_error'):
b7da73eb 1035 self.report_warning(msg)
9d5d4d64 1036 raise ExtractorError('%s. %s' % (msg, self._LOGIN_HINTS[method]), expected=True)
43e7d3c9 1037
b7da73eb 1038 def raise_geo_restricted(
1039 self, msg='This video is not available from your location due to geo restriction',
1040 countries=None, metadata_available=False):
a06916d9 1041 if metadata_available and self.get_param('ignore_no_formats_error'):
b7da73eb 1042 self.report_warning(msg)
1043 else:
1044 raise GeoRestrictedError(msg, countries=countries)
1045
1046 def raise_no_formats(self, msg, expected=False, video_id=None):
a06916d9 1047 if expected and self.get_param('ignore_no_formats_error'):
b7da73eb 1048 self.report_warning(msg, video_id)
1049 else:
1050 raise ExtractorError(msg, expected=expected, video_id=video_id)
c430802e 1051
5f6a1245 1052 # Methods for following #608
c0d0b01f 1053 @staticmethod
830d53bf 1054 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 1055 """Returns a URL that points to a page that should be processed"""
5f6a1245 1056 # TODO: ie should be the class used for getting the info
d6983cb4
PH
1057 video_info = {'_type': 'url',
1058 'url': url,
1059 'ie_key': ie}
7012b23c
PH
1060 if video_id is not None:
1061 video_info['id'] = video_id
830d53bf
S
1062 if video_title is not None:
1063 video_info['title'] = video_title
d6983cb4 1064 return video_info
5f6a1245 1065
749ca5ec
S
1066 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1067 urls = orderedSet(
46b18f23
JH
1068 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1069 for m in matches)
1070 return self.playlist_result(
749ca5ec 1071 urls, playlist_id=playlist_id, playlist_title=playlist_title)
46b18f23 1072
c0d0b01f 1073 @staticmethod
b60419c5 1074 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
d6983cb4
PH
1075 """Returns a playlist"""
1076 video_info = {'_type': 'playlist',
1077 'entries': entries}
b60419c5 1078 video_info.update(kwargs)
d6983cb4
PH
1079 if playlist_id:
1080 video_info['id'] = playlist_id
1081 if playlist_title:
1082 video_info['title'] = playlist_title
ecc97af3 1083 if playlist_description is not None:
acf5cbfe 1084 video_info['description'] = playlist_description
d6983cb4
PH
1085 return video_info
1086
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list/tuple of patterns: the first one that matches wins
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name in error/warning output when the terminal
        # supports ANSI escapes (not on Windows, not when piped)
        if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
1120
c342041f 1121 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1122 """
1123 Like _search_regex, but strips HTML tags and unescapes entities.
1124 """
711ede6e 1125 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1126 if res:
1127 return clean_html(res).strip()
1128 else:
1129 return res
1130
2118fdd1
RA
1131 def _get_netrc_login_info(self, netrc_machine=None):
1132 username = None
1133 password = None
1134 netrc_machine = netrc_machine or self._NETRC_MACHINE
1135
a06916d9 1136 if self.get_param('usenetrc', False):
2118fdd1
RA
1137 try:
1138 info = netrc.netrc().authenticators(netrc_machine)
1139 if info is not None:
1140 username = info[0]
1141 password = info[2]
1142 else:
dcce092e
S
1143 raise netrc.NetrcParseError(
1144 'No authenticators for %s' % netrc_machine)
2118fdd1 1145 except (IOError, netrc.NetrcParseError) as err:
6a39ee13 1146 self.report_warning(
dcce092e 1147 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1148
dcce092e 1149 return username, password
2118fdd1 1150
1b6712ab 1151 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1152 """
cf0649f8 1153 Get the login info as (username, password)
32443dd3
S
1154 First look for the manually specified credentials using username_option
1155 and password_option as keys in params dictionary. If no such credentials
1156 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1157 value.
fc79158d
JMF
1158 If there's no info available, return (None, None)
1159 """
fc79158d
JMF
1160
1161 # Attempt to use provided username and password or .netrc data
a06916d9 1162 username = self.get_param(username_option)
1163 if username is not None:
1164 password = self.get_param(password_option)
2118fdd1 1165 else:
1b6712ab 1166 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1167
2133565c 1168 return username, password
fc79158d 1169
e64b7569 1170 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1171 """
1172 Get the two-factor authentication info
1173 TODO - asking the user will be required for sms/phone verify
1174 currently just uses the command line option
1175 If there's no info available, return None
1176 """
83317f69 1177
a06916d9 1178 tfa = self.get_param('twofactor')
1179 if tfa is not None:
1180 return tfa
83317f69 1181
e64b7569 1182 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1183
46720279
JMF
1184 # Helper functions for extracting OpenGraph info
1185 @staticmethod
ab2d5247 1186 def _og_regexes(prop):
448ef1f3 1187 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
22f5f5c6 1188 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
7a6d76a6 1189 % {'prop': re.escape(prop)})
78fb87b2 1190 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1191 return [
78fb87b2
JMF
1192 template % (property_re, content_re),
1193 template % (content_re, property_re),
ab2d5247 1194 ]
46720279 1195
864f24bd
S
1196 @staticmethod
1197 def _meta_regex(prop):
1198 return r'''(?isx)<meta
8b9848ac 1199 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1200 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1201
3c4e6d83 1202 def _og_search_property(self, prop, html, name=None, **kargs):
b070564e
S
1203 if not isinstance(prop, (list, tuple)):
1204 prop = [prop]
46720279 1205 if name is None:
b070564e
S
1206 name = 'OpenGraph %s' % prop[0]
1207 og_regexes = []
1208 for p in prop:
1209 og_regexes.extend(self._og_regexes(p))
1210 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1211 if escaped is None:
1212 return None
1213 return unescapeHTML(escaped)
46720279
JMF
1214
1215 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1216 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1217
1218 def _og_search_description(self, html, **kargs):
1219 return self._og_search_property('description', html, fatal=False, **kargs)
1220
1221 def _og_search_title(self, html, **kargs):
1222 return self._og_search_property('title', html, **kargs)
1223
8ffa13e0 1224 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1225 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1226 if secure:
1227 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1228 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1229
78338f71
JMF
1230 def _og_search_url(self, html, **kargs):
1231 return self._og_search_property('url', html, **kargs)
1232
40c696e5 1233 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
88d9f6c0
S
1234 if not isinstance(name, (list, tuple)):
1235 name = [name]
59040888 1236 if display_name is None:
88d9f6c0 1237 display_name = name[0]
59040888 1238 return self._html_search_regex(
88d9f6c0 1239 [self._meta_regex(n) for n in name],
711ede6e 1240 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1241
1242 def _dc_search_uploader(self, html):
1243 return self._html_search_meta('dc.creator', html, 'uploader')
1244
8dbe9899
PH
1245 def _rta_search(self, html):
1246 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1247 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1248 r' content="RTA-5042-1996-1400-1577-RTA"',
1249 html):
1250 return 18
1251 return 0
1252
59040888
PH
1253 def _media_rating_search(self, html):
1254 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1255 rating = self._html_search_meta('rating', html)
1256
1257 if not rating:
1258 return None
1259
1260 RATING_TABLE = {
1261 'safe for kids': 0,
1262 'general': 8,
1263 '14 years': 14,
1264 'mature': 17,
1265 'restricted': 19,
1266 }
d800609c 1267 return RATING_TABLE.get(rating.lower())
59040888 1268
69319969 1269 def _family_friendly_search(self, html):
6ca7732d 1270 # See http://schema.org/VideoObject
ac8491fc
S
1271 family_friendly = self._html_search_meta(
1272 'isFamilyFriendly', html, default=None)
69319969
NJ
1273
1274 if not family_friendly:
1275 return None
1276
1277 RATING_TABLE = {
1278 '1': 0,
1279 'true': 0,
1280 '0': 18,
1281 'false': 18,
1282 }
d800609c 1283 return RATING_TABLE.get(family_friendly.lower())
69319969 1284
0c708f11
JMF
1285 def _twitter_search_player(self, html):
1286 return self._html_search_meta('twitter:player', html,
9e1a5b84 1287 'twitter card player')
0c708f11 1288
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        """Extract an info dict from all JSON-LD <script> blocks in *html*.

        Accepts `default` and `fatal` keyword arguments with the same
        semantics as _search_regex.
        """
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            # A block may hold either one object or an array of objects
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
4ca2a3cf 1317
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert parsed JSON-LD data (string, dict or list of dicts) into
        an info dict; keys whose value is None are dropped from the result."""
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # Maps schema.org interaction @type (last path component) to the
        # corresponding *_count info-dict key prefix
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested typed object
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fills view/like/dislike/comment counts into `info` (first wins)
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Populate `info` from a schema.org VideoObject
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            # Only top-level objects that declare @context are considered
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                # Non-video items may still embed a VideoObject under 'video'
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
1447
27713812 1448 @staticmethod
f8da79f8 1449 def _hidden_inputs(html):
586f1cc5 1450 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1451 hidden_inputs = {}
c8498368
S
1452 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1453 attrs = extract_attributes(input)
1454 if not input:
201ea3ee 1455 continue
c8498368 1456 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1457 continue
c8498368
S
1458 name = attrs.get('name') or attrs.get('id')
1459 value = attrs.get('value')
1460 if name and value is not None:
1461 hidden_inputs[name] = value
201ea3ee 1462 return hidden_inputs
27713812 1463
cf61d96d
S
1464 def _form_hidden_inputs(self, form_id, html):
1465 form = self._search_regex(
73eb13df 1466 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1467 html, '%s form' % form_id, group='form')
1468 return self._hidden_inputs(form)
1469
    class FormatSort:
        """Resolves the format sorting order from user, extractor and default
        preferences, and computes a sortable preference tuple per format.

        A sort field string looks like ``[+]name[~:limit]`` (see ``regex``):
        ``+`` reverses the field, ``:limit`` caps it and ``~limit`` prefers
        values closest to the limit.

        NOTE(review): ``_order`` and ``settings`` are class attributes that are
        mutated through ``self`` (``self._order.append(...)``,
        ``self.settings[field] = ...``), so state is shared across all
        instances — verify this cross-instance sharing is intentional.
        """
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
        # Sort order emulating youtube-dl's behaviour.
        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'format_id')

        # Per-field configuration: 'type' selects the comparison strategy
        # (ordered/boolean/extractor/combined/multiple/alias/field),
        # 'convert' how raw values are normalized, 'order' the ranking of
        # known values (regex patterns when 'regex' is True).
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        # Final resolved sort-field order, filled in by evaluate_params().
        _order = []

        def _get_field_setting(self, field, key):
            """Look up `key` for `field` in `settings`, lazily filling in
            type-dependent defaults (and caching them in the shared dict)."""
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Normalize a raw field value according to the field's 'convert'
            setting; returns None for ignored fields or missing values."""
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                # Rank within the field's order list; unknown values fall back
                # to the position of '' (or past the end if '' is absent).
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # 'float_string': numeric strings become floats, anything else
                # flips the field permanently to string comparison.
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Build the final sort-field order from forced defaults, priority
            defaults (unless format_sort_force), user, extractor and defaults."""
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # First occurrence of a field wins; later duplicates are ignored.
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    field = self._get_field_setting(field, 'field')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # A 'combined' field expands into its sub-fields; limits are
                # split per sub-field unless the field uses 'same_limit'.
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Emit the resolved sort order via the given debug writer."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Turn one field's value into a small sortable tuple (bigger is
            preferred, since the caller sorts ascending on the raw tuple and
            yt-dlp lists best last)."""
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Fetch the value(s) backing `field` from `format` and rank them."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                def wrapped_function(values):
                    # Combine the non-None sub-values with the field's function
                    # (e.g. min for 'res'); pass a single value through as-is.
                    values = tuple(filter(lambda x: x is not None, values))
                    return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
                            else values[0] if values
                            else None)

                value = wrapped_function((get_value(f) for f in actual_fields))
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the full sort key for `format`.

            NOTE(review): this also mutates `format` in-place to backfill
            'protocol', 'ext', 'video_ext'/'audio_ext' and the bitrates.
            """
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext']
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != "none" and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != "none" and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
1738
1739 def _sort_formats(self, formats, field_preference=[]):
1740 if not formats:
a06916d9 1741 if self.get_param('ignore_no_formats_error'):
b7da73eb 1742 return
eb8a4433 1743 raise ExtractorError('No video formats found')
1744 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1745 format_sort.evaluate_params(self._downloader.params, field_preference)
a06916d9 1746 if self.get_param('verbose', False):
0760b0a7 1747 format_sort.print_verbose_info(self._downloader.write_debug)
eb8a4433 1748 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1749
96a53167
S
1750 def _check_formats(self, formats, video_id):
1751 if formats:
1752 formats[:] = filter(
1753 lambda f: self._is_valid_url(
1754 f['url'], video_id,
1755 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1756 formats)
1757
f5bdb444
S
1758 @staticmethod
1759 def _remove_duplicate_formats(formats):
1760 format_urls = set()
1761 unique_formats = []
1762 for f in formats:
1763 if f['url'] not in format_urls:
1764 format_urls.add(f['url'])
1765 unique_formats.append(f)
1766 formats[:] = unique_formats
1767
45024183 1768 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1769 url = self._proto_relative_url(url, scheme='http:')
1770 # For now assume non HTTP(S) URLs always valid
1771 if not (url.startswith('http://') or url.startswith('https://')):
1772 return True
96a53167 1773 try:
45024183 1774 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1775 return True
8bdd16b4 1776 except ExtractorError as e:
25e911a9 1777 self.to_screen(
8bdd16b4 1778 '%s: %s URL is invalid, skipping: %s'
1779 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1780 return False
96a53167 1781
20991253 1782 def http_scheme(self):
1ede5b24 1783 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1784 return (
1785 'http:'
a06916d9 1786 if self.get_param('prefer_insecure', False)
20991253
PH
1787 else 'https:')
1788
57c7411f
PH
1789 def _proto_relative_url(self, url, scheme=None):
1790 if url is None:
1791 return url
1792 if url.startswith('//'):
1793 if scheme is None:
1794 scheme = self.http_scheme()
1795 return scheme + url
1796 else:
1797 return url
1798
4094b6e3
PH
1799 def _sleep(self, timeout, video_id, msg_template=None):
1800 if msg_template is None:
f1a9d64e 1801 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1802 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1803 self.to_screen(msg)
1804 time.sleep(timeout)
1805
f983b875 1806 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1807 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1808 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
f036a632
JMF
1809 manifest = self._download_xml(
1810 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1811 'Unable to download f4m manifest',
1812 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1813 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1814 transform_source=transform_source,
7360c06f 1815 fatal=fatal, data=data, headers=headers, query=query)
4de61310
S
1816
1817 if manifest is False:
8d29e47f 1818 return []
31bb8d3f 1819
0fdbb332 1820 return self._parse_f4m_formats(
f983b875 1821 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1822 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1823
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m manifest XML element into a list of format dicts.

        Handles both 1.0 (stream-level) and 2.0 (set-level) manifests,
        recursing into referenced f4m/m3u8 sub-manifests when the manifest
        is set-level (no <bootstrapInfo>). Returns [] for DRM-protected or
        Akamai player-verification-protected content.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio-only mimeType marks every rendition as video-less.
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        # NOTE(review): the display name 'base URL' above looks like a
        # copy-paste leftover for what is actually the mimeType lookup.
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # NOTE: rebinds `manifest_url` (the parameter) to the
                # rendition's URL; the fallback join below therefore uses the
                # *previous* value of manifest_url on each iteration.
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                # flv only when this is a stream-level manifest
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
1925
f983b875 1926 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1927 return {
f207019c 1928 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1929 'url': m3u8_url,
1930 'ext': ext,
1931 'protocol': 'm3u8',
37768f92 1932 'preference': preference - 100 if preference else -100,
f983b875 1933 'quality': quality,
704df56d
PH
1934 'resolution': 'multiple',
1935 'format_note': 'Quality selection URL',
16da9bbc
YCH
1936 }
1937
a0c3b2d5
F
1938 def _extract_m3u8_formats(self, *args, **kwargs):
1939 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1940 if subs:
1941 self.report_warning(bug_reports_message(
1942 "Ignoring subtitle tracks found in the HLS manifest; "
1943 "if any subtitle tracks are missing,"
1944 ))
1945 return fmts
1946
1947 def _extract_m3u8_formats_and_subtitles(
177877c5 1948 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1949 preference=None, quality=None, m3u8_id=None, note=None,
1950 errnote=None, fatal=True, live=False, data=None, headers={},
1951 query={}):
1952
dbd82a1d 1953 res = self._download_webpage_handle(
81515ad9 1954 m3u8_url, video_id,
37a3bb66 1955 note='Downloading m3u8 information' if note is None else note,
1956 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1957 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1958
dbd82a1d 1959 if res is False:
a0c3b2d5 1960 return [], {}
cb252080 1961
dbd82a1d 1962 m3u8_doc, urlh = res
37113045 1963 m3u8_url = urlh.geturl()
9cdffeeb 1964
a0c3b2d5 1965 return self._parse_m3u8_formats_and_subtitles(
cb252080 1966 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1967 preference=preference, quality=quality, m3u8_id=m3u8_id,
1968 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1969 headers=headers, query=query, video_id=video_id)
cb252080 1970
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse an HLS playlist document into (formats, subtitles).

        Media playlists are returned as a single format; master playlists are
        expanded into one format per EXT-X-STREAM-INF / EXT-X-MEDIA rendition.
        Returns ([], {}) for DRM-protected (Adobe Flash Access, Apple
        FairPlay) manifests unless unplayable formats are allowed.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return [], {}

        if (not self.get_param('allow_unplayable_formats')
                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
            return [], {}

        formats = []

        subtitles = {}

        # Resolve possibly-relative URIs against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        split_discontinuity = self.get_param('hls_split_discontinuity', False)

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
                                           fatal=True, data=None, headers={}):
            # Split a media playlist into per-discontinuity chunks (when
            # --hls-split-discontinuity is set), or return it as one chunk.
            # Downloads the playlist first when only its URL is given.
            if not m3u8_doc:
                if not format_url:
                    return []
                res = self._download_webpage_handle(
                    format_url, video_id,
                    note=False,
                    errnote='Failed to download m3u8 playlist information',
                    fatal=fatal, data=data, headers=headers)

                if res is False:
                    return []

                m3u8_doc, urlh = res
                format_url = urlh.geturl()

            playlist_formats = []
            i = (
                0
                if split_discontinuity
                else None)
            format_info = {
                'index': i,
                'key_data': None,
                'files': [],
            }
            for line in m3u8_doc.splitlines():
                if not line.startswith('#'):
                    format_info['files'].append(line)
                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
                    # Close off the current chunk and start the next one.
                    i += 1
                    playlist_formats.append(format_info)
                    format_info = {
                        'index': i,
                        'url': format_url,
                        'files': [],
                    }
            playlist_formats.append(format_info)
            return playlist_formats

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is

            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)

            for format in playlist_formats:
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                format_index = format.get('index')
                if format_index:
                    format_id.append(str(format_index))
                f = {
                    'format_id': '-'.join(format_id),
                    'format_index': format_index,
                    'url': m3u8_url,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                }
                formats.append(f)

            return formats, subtitles

        # Master playlist from here on.
        groups = {}          # GROUP-ID -> list of EXT-X-MEDIA attribute dicts
        last_stream_inf = {} # attributes of the most recent EXT-X-STREAM-INF

        def extract_media(x_media_line):
            # Handle one EXT-X-MEDIA line: collect rendition groups, subtitle
            # tracks and standalone audio/video rendition formats.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                format_id = []
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
                                                                  fatal=fatal, data=data, headers=headers)

                for format in playlist_formats:
                    format_index = format.get('index')
                    for v in (m3u8_id, group_id, name):
                        if v:
                            format_id.append(v)
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'language': media.get('LANGUAGE'),
                        'ext': ext,
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'
                    formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-tag line right after EXT-X-STREAM-INF is that
                # variant's playlist URI.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
                                                                  fatal=fatal, data=data, headers=headers)

                for frmt in playlist_formats:
                    format_id = []
                    if m3u8_id:
                        format_id.append(m3u8_id)
                    format_index = frmt.get('index')
                    stream_name = build_stream_name()
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
704df56d 2255
a107193e
S
2256 @staticmethod
2257 def _xpath_ns(path, namespace=None):
2258 if not namespace:
2259 return path
2260 out = []
2261 for c in path.split('/'):
2262 if not c or c == '.':
2263 out.append(c)
2264 else:
2265 out.append('{%s}%s' % (namespace, c))
2266 return '/'.join(out)
2267
09f572fb 2268 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2269 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2270
995029a1
PH
2271 if smil is False:
2272 assert not fatal
2273 return []
e89a2aab 2274
17712eeb 2275 namespace = self._parse_smil_namespace(smil)
a107193e
S
2276
2277 return self._parse_smil_formats(
2278 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2279
2280 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2281 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2282 if smil is False:
2283 return {}
2284 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2285
09f572fb 2286 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2287 return self._download_xml(
2288 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2289 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2290
2291 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2292 namespace = self._parse_smil_namespace(smil)
a107193e
S
2293
2294 formats = self._parse_smil_formats(
2295 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2296 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2297
2298 video_id = os.path.splitext(url_basename(smil_url))[0]
2299 title = None
2300 description = None
647eab45 2301 upload_date = None
a107193e
S
2302 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2303 name = meta.attrib.get('name')
2304 content = meta.attrib.get('content')
2305 if not name or not content:
2306 continue
2307 if not title and name == 'title':
2308 title = content
2309 elif not description and name in ('description', 'abstract'):
2310 description = content
647eab45
S
2311 elif not upload_date and name == 'date':
2312 upload_date = unified_strdate(content)
a107193e 2313
1e5bcdec
S
2314 thumbnails = [{
2315 'id': image.get('type'),
2316 'url': image.get('src'),
2317 'width': int_or_none(image.get('width')),
2318 'height': int_or_none(image.get('height')),
2319 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2320
a107193e
S
2321 return {
2322 'id': video_id,
2323 'title': title or video_id,
2324 'description': description,
647eab45 2325 'upload_date': upload_date,
1e5bcdec 2326 'thumbnails': thumbnails,
a107193e
S
2327 'formats': formats,
2328 'subtitles': subtitles,
2329 }
2330
17712eeb
S
2331 def _parse_smil_namespace(self, smil):
2332 return self._search_regex(
2333 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2334
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from a parsed SMIL document.

        Walks every <video>/<audio> element and dispatches on protocol /
        source extension: RTMP, HLS (m3u8), HDS (f4m), DASH (mpd),
        MSS (.ism/Manifest) or plain HTTP progressive.
        """
        # A <meta base=...> (or httpBase) in <head> overrides the manifest
        # URL as the base for relative sources; only the first one is used.
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        # Deduplicate media entries by their raw src attribute.
        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # Both hyphenated and camelCase attribute spellings occur in the wild.
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    # NOTE(review): %d truncates a float bitrate — presumably
                    # intended since bitrate comes from float_or_none; confirm.
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    # Patch the just-appended format in place with the
                    # transformed streamer URL and play path.
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS result is treated as one rendition and
                # enriched with the metadata of the SMIL element itself.
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default Adobe HDS query parameters; note this assigns the
                    # parameter itself, so the default is reused for later entries.
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            # NOTE(review): validity is checked against the raw src, not the
            # resolved src_url — verify this is intentional.
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
2429
ce00af87 2430 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2431 urls = []
a107193e
S
2432 subtitles = {}
2433 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2434 src = textstream.get('src')
d413095f 2435 if not src or src in urls:
a107193e 2436 continue
d413095f 2437 urls.append(src)
df634be2 2438 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2439 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2440 subtitles.setdefault(lang, []).append({
2441 'url': src,
2442 'ext': ext,
2443 })
2444 return subtitles
63757032 2445
47a5cb77 2446 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2447 xspf = self._download_xml(
47a5cb77 2448 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2449 'Unable to download xspf manifest', fatal=fatal)
2450 if xspf is False:
2451 return []
47a5cb77
S
2452 return self._parse_xspf(
2453 xspf, playlist_id, xspf_url=xspf_url,
2454 xspf_base_url=base_url(xspf_url))
8d6765cf 2455
47a5cb77 2456 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2457 NS_MAP = {
2458 'xspf': 'http://xspf.org/ns/0/',
2459 's1': 'http://static.streamone.nl/player/ns/0',
2460 }
2461
2462 entries = []
47a5cb77 2463 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2464 title = xpath_text(
98044462 2465 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2466 description = xpath_text(
2467 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2468 thumbnail = xpath_text(
2469 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2470 duration = float_or_none(
2471 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2472
47a5cb77
S
2473 formats = []
2474 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2475 format_url = urljoin(xspf_base_url, location.text)
2476 if not format_url:
2477 continue
2478 formats.append({
2479 'url': format_url,
2480 'manifest_url': xspf_url,
2481 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2482 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2483 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2484 })
8d6765cf
S
2485 self._sort_formats(formats)
2486
2487 entries.append({
2488 'id': playlist_id,
2489 'title': title,
2490 'description': description,
2491 'thumbnail': thumbnail,
2492 'duration': duration,
2493 'formats': formats,
2494 })
2495 return entries
2496
171e59ed
F
2497 def _extract_mpd_formats(self, *args, **kwargs):
2498 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2499 if subs:
2500 self.report_warning(bug_reports_message(
2501 "Ignoring subtitle tracks found in the DASH manifest; "
2502 "if any subtitle tracks are missing,"
2503 ))
2504 return fmts
2505
2506 def _extract_mpd_formats_and_subtitles(
2507 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2508 fatal=True, data=None, headers={}, query={}):
47a5cb77 2509 res = self._download_xml_handle(
1bac3455 2510 mpd_url, video_id,
37a3bb66 2511 note='Downloading MPD manifest' if note is None else note,
2512 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2513 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2514 if res is False:
171e59ed 2515 return [], {}
47a5cb77 2516 mpd_doc, urlh = res
c25720ef 2517 if mpd_doc is None:
171e59ed 2518 return [], {}
02dc0a36 2519 mpd_base_url = base_url(urlh.geturl())
1bac3455 2520
171e59ed 2521 return self._parse_mpd_formats_and_subtitles(
545cc85d 2522 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2523
171e59ed
F
2524 def _parse_mpd_formats(self, *args, **kwargs):
2525 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2526 if subs:
2527 self.report_warning(bug_reports_message(
2528 "Ignoring subtitle tracks found in the DASH manifest; "
2529 "if any subtitle tracks are missing,"
2530 ))
2531 return fmts
2532
2533 def _parse_mpd_formats_and_subtitles(
2534 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2535 """
2536 Parse formats from MPD manifest.
2537 References:
2538 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2539 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2540 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2541 """
a06916d9 2542 if not self.get_param('dynamic_mpd', True):
78895bd3 2543 if mpd_doc.get('type') == 'dynamic':
171e59ed 2544 return [], {}
2d2fa82d 2545
91cb6b50 2546 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2547
        def _add_ns(path):
            # Qualify an XPath with the MPD document's XML namespace (if any).
            return self._xpath_ns(path, namespace)
2550
        def is_drm_protected(element):
            # A ContentProtection child marks the element as DRM-protected.
            return element.find(_add_ns('ContentProtection')) is not None
2553
        def extract_multisegment_info(element, ms_parent_info):
            # Merge the segment description of `element` (Period /
            # AdaptationSet / Representation) on top of the multisegment
            # info inherited from its parent element.
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            # @r repeats the segment r additional times.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        # The @initialization attribute takes precedence over
                        # the Initialization child element.
                        extract_Initialization(segment_template)
            return ms_info
b323e170 2611
a06916d9 2612 skip_unplayable = not self.get_param('allow_unplayable_formats')
63ad4d43 2613
1bac3455 2614 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 2615 formats = []
171e59ed 2616 subtitles = {}
f14be228 2617 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2618 period_duration = parse_duration(period.get('duration')) or mpd_duration
2619 period_ms_info = extract_multisegment_info(period, {
2620 'start_number': 1,
2621 'timescale': 1,
2622 })
f14be228 2623 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
06869367 2624 if skip_unplayable and is_drm_protected(adaptation_set):
675d0016 2625 continue
1bac3455 2626 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2627 for representation in adaptation_set.findall(_add_ns('Representation')):
06869367 2628 if skip_unplayable and is_drm_protected(representation):
675d0016 2629 continue
1bac3455 2630 representation_attrib = adaptation_set.attrib.copy()
2631 representation_attrib.update(representation.attrib)
f0948348 2632 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2633 mime_type = representation_attrib['mimeType']
171e59ed
F
2634 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2635
2636 if content_type in ('video', 'audio', 'text'):
1bac3455 2637 base_url = ''
2638 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 2639 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 2640 if base_url_e is not None:
2641 base_url = base_url_e.text + base_url
2642 if re.match(r'^https?://', base_url):
2643 break
bb20526b
S
2644 if mpd_base_url and not re.match(r'^https?://', base_url):
2645 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2646 mpd_base_url += '/'
1bac3455 2647 base_url = mpd_base_url + base_url
2648 representation_id = representation_attrib.get('id')
d577c796 2649 lang = representation_attrib.get('lang')
51e9094f 2650 url_el = representation.find(_add_ns('BaseURL'))
2651 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2652 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
171e59ed
F
2653 if content_type in ('video', 'audio'):
2654 f = {
2655 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2656 'manifest_url': mpd_url,
2657 'ext': mimetype2ext(mime_type),
2658 'width': int_or_none(representation_attrib.get('width')),
2659 'height': int_or_none(representation_attrib.get('height')),
2660 'tbr': float_or_none(bandwidth, 1000),
2661 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2662 'fps': int_or_none(representation_attrib.get('frameRate')),
2663 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2664 'format_note': 'DASH %s' % content_type,
2665 'filesize': filesize,
2666 'container': mimetype2ext(mime_type) + '_dash',
2667 }
2668 f.update(parse_codecs(representation_attrib.get('codecs')))
2669 elif content_type == 'text':
2670 f = {
2671 'ext': mimetype2ext(mime_type),
2672 'manifest_url': mpd_url,
2673 'filesize': filesize,
2674 }
1bac3455 2675 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2676
e228616c 2677 def prepare_template(template_name, identifiers):
eca1f0d1
S
2678 tmpl = representation_ms_info[template_name]
2679 # First of, % characters outside $...$ templates
2680 # must be escaped by doubling for proper processing
2681 # by % operator string formatting used further (see
067aa17e 2682 # https://github.com/ytdl-org/youtube-dl/issues/16867).
eca1f0d1
S
2683 t = ''
2684 in_template = False
2685 for c in tmpl:
2686 t += c
2687 if c == '$':
2688 in_template = not in_template
2689 elif c == '%' and not in_template:
2690 t += c
2691 # Next, $...$ templates are translated to their
2692 # %(...) counterparts to be used with % operator
e228616c
S
2693 t = t.replace('$RepresentationID$', representation_id)
2694 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2695 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2696 t.replace('$$', '$')
2697 return t
2698
2699 # @initialization is a regular template like @media one
2700 # so it should be handled just the same way (see
067aa17e 2701 # https://github.com/ytdl-org/youtube-dl/issues/11605)
e228616c
S
2702 if 'initialization' in representation_ms_info:
2703 initialization_template = prepare_template(
2704 'initialization',
2705 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2706 # $Time$ shall not be included for @initialization thus
2707 # only $Bandwidth$ remains
2708 ('Bandwidth', ))
2709 representation_ms_info['initialization_url'] = initialization_template % {
2710 'Bandwidth': bandwidth,
2711 }
2712
1141e910
S
2713 def location_key(location):
2714 return 'url' if re.match(r'^https?://', location) else 'path'
2715
e228616c
S
2716 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2717
2718 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2719 media_location_key = location_key(media_template)
f0948348
S
2720
2721 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2722 # can't be used at the same time
b4c1d6e8
S
2723 if '%(Number' in media_template and 's' not in representation_ms_info:
2724 segment_duration = None
c110944f 2725 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2726 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2727 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2728 representation_ms_info['fragments'] = [{
1141e910 2729 media_location_key: media_template % {
b4c1d6e8 2730 'Number': segment_number,
e228616c 2731 'Bandwidth': bandwidth,
b4c1d6e8
S
2732 },
2733 'duration': segment_duration,
2734 } for segment_number in range(
2735 representation_ms_info['start_number'],
2736 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2737 else:
b4c1d6e8
S
2738 # $Number*$ or $Time$ in media template with S list available
2739 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2740 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
b4c1d6e8 2741 representation_ms_info['fragments'] = []
f0948348 2742 segment_time = 0
b4c1d6e8
S
2743 segment_d = None
2744 segment_number = representation_ms_info['start_number']
f0948348
S
2745
                                def add_segment_url():
                                    # Expand the media template at the current timeline
                                    # position and append the fragment; reads segment_time,
                                    # segment_d and segment_number from the enclosing loop.
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })
f0948348
S
2756
2757 for num, s in enumerate(representation_ms_info['s']):
2758 segment_time = s.get('t') or segment_time
b4c1d6e8 2759 segment_d = s['d']
f0948348 2760 add_segment_url()
b4c1d6e8 2761 segment_number += 1
f0948348 2762 for r in range(s.get('r', 0)):
b4c1d6e8 2763 segment_time += segment_d
f0948348 2764 add_segment_url()
b4c1d6e8
S
2765 segment_number += 1
2766 segment_time += segment_d
2767 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2768 # No media template
2769 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2770 # or any YouTube dashsegments video
2771 fragments = []
d04621da
S
2772 segment_index = 0
2773 timescale = representation_ms_info['timescale']
2774 for s in representation_ms_info['s']:
2775 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2776 for r in range(s.get('r', 0) + 1):
1141e910 2777 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2778 fragments.append({
1141e910 2779 location_key(segment_uri): segment_uri,
d04621da 2780 'duration': duration,
b4c1d6e8 2781 })
d04621da 2782 segment_index += 1
b4c1d6e8 2783 representation_ms_info['fragments'] = fragments
41bf647e
PN
2784 elif 'segment_urls' in representation_ms_info:
2785 # Segment URLs with no SegmentTimeline
2786 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
067aa17e 2787 # https://github.com/ytdl-org/youtube-dl/pull/14844
41bf647e 2788 fragments = []
603fc4e0
S
2789 segment_duration = float_or_none(
2790 representation_ms_info['segment_duration'],
2791 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2792 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2793 fragment = {
41bf647e 2794 location_key(segment_url): segment_url,
603fc4e0
S
2795 }
2796 if segment_duration:
2797 fragment['duration'] = segment_duration
2798 fragments.append(fragment)
41bf647e 2799 representation_ms_info['fragments'] = fragments
79d2077e
S
2800 # If there is a fragments key available then we correctly recognized fragmented media.
2801 # Otherwise we will assume unfragmented media with direct access. Technically, such
2802 # assumption is not necessarily correct since we may simply have no support for
2803 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
86f4d14f 2804 if 'fragments' in representation_ms_info:
1bac3455 2805 f.update({
79d2077e
S
2806 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2807 'url': mpd_url or base_url,
1141e910 2808 'fragment_base_url': base_url,
b4c1d6e8 2809 'fragments': [],
1bac3455 2810 'protocol': 'http_dash_segments',
df374b52 2811 })
1bac3455 2812 if 'initialization_url' in representation_ms_info:
e228616c 2813 initialization_url = representation_ms_info['initialization_url']
1bac3455 2814 if not f.get('url'):
2815 f['url'] = initialization_url
1141e910 2816 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2817 f['fragments'].extend(representation_ms_info['fragments'])
79d2077e
S
2818 else:
2819 # Assuming direct URL to unfragmented media.
2820 f['url'] = base_url
fd76a142
F
2821 if content_type in ('video', 'audio'):
2822 formats.append(f)
2823 elif content_type == 'text':
2824 subtitles.setdefault(lang or 'und', []).append(f)
17b598d3 2825 else:
1bac3455 2826 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
171e59ed 2827 return formats, subtitles
17b598d3 2828
fd76a142
F
2829 def _extract_ism_formats(self, *args, **kwargs):
2830 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2831 if subs:
2832 self.report_warning(bug_reports_message(
2833 "Ignoring subtitle tracks found in the ISM manifest; "
2834 "if any subtitle tracks are missing,"
2835 ))
2836 return fmts
2837
2838 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2839 res = self._download_xml_handle(
b2758123 2840 ism_url, video_id,
37a3bb66 2841 note='Downloading ISM manifest' if note is None else note,
2842 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2843 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2844 if res is False:
fd76a142 2845 return [], {}
47a5cb77 2846 ism_doc, urlh = res
13b08034 2847 if ism_doc is None:
fd76a142 2848 return [], {}
b2758123 2849
fd76a142 2850 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 2851
fd76a142 2852 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2853 """
2854 Parse formats from ISM manifest.
2855 References:
2856 1. [MS-SSTR]: Smooth Streaming Protocol,
2857 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2858 """
06869367 2859 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 2860 return [], {}
a06916d9 2861 if (not self.get_param('allow_unplayable_formats')
06869367 2862 and ism_doc.find('Protection') is not None):
fd76a142 2863 return [], {}
b2758123 2864
b2758123
RA
2865 duration = int(ism_doc.attrib['Duration'])
2866 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2867
2868 formats = []
fd76a142 2869 subtitles = {}
b2758123
RA
2870 for stream in ism_doc.findall('StreamIndex'):
2871 stream_type = stream.get('Type')
fd76a142 2872 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
2873 continue
2874 url_pattern = stream.attrib['Url']
2875 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2876 stream_name = stream.get('Name')
fd76a142 2877 stream_language = stream.get('Language', 'und')
b2758123 2878 for track in stream.findall('QualityLevel'):
2501d41e 2879 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
b2758123 2880 # TODO: add support for WVC1 and WMAP
66a1b864 2881 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
b2758123
RA
2882 self.report_warning('%s is not a supported codec' % fourcc)
2883 continue
2884 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2885 # [1] does not mention Width and Height attributes. However,
2886 # they're often present while MaxWidth and MaxHeight are
2887 # missing, so should be used as fallbacks
2888 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2889 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2890 sampling_rate = int_or_none(track.get('SamplingRate'))
2891
2892 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2893 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2894
2895 fragments = []
2896 fragment_ctx = {
2897 'time': 0,
2898 }
2899 stream_fragments = stream.findall('c')
2900 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2901 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2902 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2903 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2904 if not fragment_ctx['duration']:
2905 try:
2906 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2907 except IndexError:
2908 next_fragment_time = duration
1616f9b4 2909 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2910 for _ in range(fragment_repeat):
2911 fragments.append({
1616f9b4 2912 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2913 'duration': fragment_ctx['duration'] / stream_timescale,
2914 })
2915 fragment_ctx['time'] += fragment_ctx['duration']
2916
2917 format_id = []
2918 if ism_id:
2919 format_id.append(ism_id)
2920 if stream_name:
2921 format_id.append(stream_name)
2922 format_id.append(compat_str(tbr))
2923
fd76a142
F
2924 if stream_type == 'text':
2925 subtitles.setdefault(stream_language, []).append({
2926 'ext': 'ismt',
2927 'protocol': 'ism',
2928 'url': ism_url,
2929 'manifest_url': ism_url,
2930 'fragments': fragments,
2931 '_download_params': {
2932 'stream_type': stream_type,
2933 'duration': duration,
2934 'timescale': stream_timescale,
2935 'fourcc': fourcc,
2936 'language': stream_language,
2937 'codec_private_data': track.get('CodecPrivateData'),
2938 }
2939 })
2940 elif stream_type in ('video', 'audio'):
2941 formats.append({
2942 'format_id': '-'.join(format_id),
2943 'url': ism_url,
2944 'manifest_url': ism_url,
2945 'ext': 'ismv' if stream_type == 'video' else 'isma',
2946 'width': width,
2947 'height': height,
2948 'tbr': tbr,
2949 'asr': sampling_rate,
2950 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2951 'acodec': 'none' if stream_type == 'video' else fourcc,
2952 'protocol': 'ism',
2953 'fragments': fragments,
2954 '_download_params': {
2955 'stream_type': stream_type,
2956 'duration': duration,
2957 'timescale': stream_timescale,
2958 'width': width or 0,
2959 'height': height or 0,
2960 'fourcc': fourcc,
2961 'language': stream_language,
2962 'codec_private_data': track.get('CodecPrivateData'),
2963 'sampling_rate': sampling_rate,
2964 'channels': int_or_none(track.get('Channels', 2)),
2965 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2966 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2967 },
2968 })
2969 return formats, subtitles
b2758123 2970
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """
        Extract media entries from HTML5 <video>/<audio> tags in webpage.

        Also handles the AMP (amp-video/amp-audio) and Delight VR
        (dl8-video/dl8-live-video) variants of those tags.  Returns a list of
        dicts, each with 'formats', 'subtitles' and 'thumbnail' keys; m3u8 and
        mpd sources are expanded through the corresponding manifest
        extractors, everything else becomes a plain progressive format.
        """
        def absolute_url(item_url):
            # Resolve a possibly protocol/host-relative URL against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a MIME type attribute such as 'video/mp4; codecs="avc1..."'
            # into {'ext': ..., 'vcodec': ..., 'acodec': ...}.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Turn one src attribute into (is_plain_url, formats).  Manifest
            # URLs (m3u8/mpd) are expanded and report is_plain_url=False so
            # the caller knows not to merge per-<source> attributes into them.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first (no inner content) ...
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # ... then open/close pairs, whose inner content may hold <source>/<track>.
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to parsing "1920x1080"-style labels.
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # First label that parses as a bitrate wins; for-else
                        # leaves tbr=None when none does.
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3093
f6a1d69a
F
3094 def _extract_akamai_formats(self, *args, **kwargs):
3095 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3096 if subs:
3097 self.report_warning(bug_reports_message(
3098 "Ignoring subtitle tracks found in the manifests; "
3099 "if any subtitle tracks are missing,"
3100 ))
3101 return fmts
3102
3103 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
29f7c58a 3104 signed = 'hdnea=' in manifest_url
3105 if not signed:
3106 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3107 manifest_url = re.sub(
3108 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3109 '', manifest_url).strip('?')
3110
c7c43a93 3111 formats = []
f6a1d69a 3112 subtitles = {}
70c5802b 3113
e71a4509 3114 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 3115 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
3116 hds_host = hosts.get('hds')
3117 if hds_host:
3118 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
3119 if 'hdcore=' not in f4m_url:
3120 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3121 f4m_formats = self._extract_f4m_formats(
3122 f4m_url, video_id, f4m_id='hds', fatal=False)
3123 for entry in f4m_formats:
3124 entry.update({'extra_param_to_segment_url': hdcore_sign})
3125 formats.extend(f4m_formats)
70c5802b 3126
c4251b9a
RA
3127 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3128 hls_host = hosts.get('hls')
3129 if hls_host:
3130 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
f6a1d69a 3131 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
c7c43a93 3132 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 3133 m3u8_id='hls', fatal=False)
3134 formats.extend(m3u8_formats)
f6a1d69a 3135 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
70c5802b 3136
3137 http_host = hosts.get('http')
29f7c58a 3138 if http_host and m3u8_formats and not signed:
3139 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 3140 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3141 qualities_length = len(qualities)
29f7c58a 3142 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 3143 i = 0
29f7c58a 3144 for f in m3u8_formats:
3145 if f['vcodec'] != 'none':
70c5802b 3146 for protocol in ('http', 'https'):
3147 http_f = f.copy()
3148 del http_f['manifest_url']
3149 http_url = re.sub(
29f7c58a 3150 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
70c5802b 3151 http_f.update({
3152 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3153 'url': http_url,
3154 'protocol': protocol,
3155 })
29f7c58a 3156 formats.append(http_f)
70c5802b 3157 i += 1
70c5802b 3158
f6a1d69a 3159 return formats, subtitles
c7c43a93 3160
6ad02195 3161 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 3162 query = compat_urlparse.urlparse(url).query
6ad02195 3163 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
3164 mobj = re.search(
3165 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3166 url_base = mobj.group('url')
3167 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 3168 formats = []
044eeb14
S
3169
3170 def manifest_url(manifest):
3171 m_url = '%s/%s' % (http_base_url, manifest)
3172 if query:
3173 m_url += '?%s' % query
3174 return m_url
3175
6ad02195
RA
3176 if 'm3u8' not in skip_protocols:
3177 formats.extend(self._extract_m3u8_formats(
044eeb14 3178 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
3179 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3180 if 'f4m' not in skip_protocols:
3181 formats.extend(self._extract_f4m_formats(
044eeb14 3182 manifest_url('manifest.f4m'),
6ad02195 3183 video_id, f4m_id='hds', fatal=False))
0384932e
RA
3184 if 'dash' not in skip_protocols:
3185 formats.extend(self._extract_mpd_formats(
044eeb14 3186 manifest_url('manifest.mpd'),
0384932e 3187 video_id, mpd_id='dash', fatal=False))
6ad02195 3188 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
3189 if 'smil' not in skip_protocols:
3190 rtmp_formats = self._extract_smil_formats(
044eeb14 3191 manifest_url('jwplayer.smil'),
6ad02195
RA
3192 video_id, fatal=False)
3193 for rtmp_format in rtmp_formats:
3194 rtsp_format = rtmp_format.copy()
3195 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3196 del rtsp_format['play_path']
3197 del rtsp_format['ext']
3198 rtsp_format.update({
3199 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3200 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3201 'protocol': 'rtsp',
3202 })
3203 formats.extend([rtmp_format, rtsp_format])
3204 else:
3205 for protocol in ('rtmp', 'rtsp'):
3206 if protocol not in skip_protocols:
3207 formats.append({
f2e2f0c7 3208 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
3209 'format_id': protocol,
3210 'protocol': protocol,
3211 })
3212 return formats
3213
c73e330e 3214 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3215 mobj = re.search(
ac9c69ac 3216 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3217 webpage)
3218 if mobj:
c73e330e
RU
3219 try:
3220 jwplayer_data = self._parse_json(mobj.group('options'),
3221 video_id=video_id,
3222 transform_source=transform_source)
3223 except ExtractorError:
3224 pass
3225 else:
3226 if isinstance(jwplayer_data, dict):
3227 return jwplayer_data
a4a554a7
YCH
3228
3229 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3230 jwplayer_data = self._find_jwplayer_data(
3231 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3232 return self._parse_jwplayer_data(
3233 jwplayer_data, video_id, *args, **kwargs)
3234
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Turn a JWPlayer setup-options dict into an info dict (single video)
        or a playlist result (multiple playlist items).

        Note: jwplayer_data and its playlist items are normalized in place
        (flattened playlists/sources are wrapped into lists).
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Only caption/subtitle tracks; chapters etc. are skipped.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                # title is mandatory unless require_title=False
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A single YouTube source is delegated to the YouTube extractor.
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3302
ed0cf9b3
S
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Convert a JWPlayer 'sources' list into a formats list, expanding
        HLS/DASH/SMIL manifests and classifying the rest as audio-only,
        RTMP or plain progressive formats.  Duplicate source URLs are
        dropped.  Formats are NOT sorted here.
        """
        urls = []  # source URLs already seen, for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        # Split 'rtmp://host/app/mp4:path' into URL + play_path.
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3366
f4b1c7ad
PH
3367 def _live_title(self, name):
3368 """ Generate the title for a live video """
3369 now = datetime.datetime.now()
611c1dd9 3370 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3371 return name + ' ' + now_str
3372
b14f3a4c
PH
3373 def _int(self, v, name, fatal=False, **kwargs):
3374 res = int_or_none(v, **kwargs)
3375 if 'get_attr' in kwargs:
3376 print(getattr(v, kwargs['get_attr']))
3377 if res is None:
3378 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3379 if fatal:
3380 raise ExtractorError(msg)
3381 else:
6a39ee13 3382 self.report_warning(msg)
b14f3a4c
PH
3383 return res
3384
3385 def _float(self, v, name, fatal=False, **kwargs):
3386 res = float_or_none(v, **kwargs)
3387 if res is None:
3388 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3389 if fatal:
3390 raise ExtractorError(msg)
3391 else:
6a39ee13 3392 self.report_warning(msg)
b14f3a4c
PH
3393 return res
3394
40e41780
TF
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        # Store a cookie into the downloader's cookiejar.
        # The Cookie constructor takes its fields positionally:
        # (version, name, value, port, port_specified, domain,
        #  domain_specified, domain_initial_dot, path, path_specified,
        #  secure, expires, discard, comment, comment_url, rest) —
        # port_specified is derived from port, domain_initial_dot from the
        # leading-dot convention of the domain.
        # NOTE(review): rest={} is a mutable default, but it is never
        # mutated here, only passed through.
        cookie = compat_cookiejar_Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self._downloader.cookiejar.set_cookie(cookie)
3402
799207e8 3403 def _get_cookies(self, url):
f7ad7160 3404 """ Return a compat_cookies_SimpleCookie with the cookies for the url """
5c2266df 3405 req = sanitized_Request(url)
799207e8 3406 self._downloader.cookiejar.add_cookie_header(req)
f7ad7160 3407 return compat_cookies_SimpleCookie(req.get_header('Cookie'))
799207e8 3408
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            if sys.version_info[0] >= 3:
                # On py3 header values are str decoded as latin-1; round-trip
                # through bytes so UTF-8 encoded cookie values survive.
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # Grab the FIRST occurrence of the named cookie plus its domain
            # from the (possibly folded) Set-Cookie header value.
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break
3435
05900629
PH
3436 def get_testcases(self, include_onlymatching=False):
3437 t = getattr(self, '_TEST', None)
3438 if t:
3439 assert not hasattr(self, '_TESTS'), \
3440 '%s has _TEST and _TESTS' % type(self).__name__
3441 tests = [t]
3442 else:
3443 tests = getattr(self, '_TESTS', [])
3444 for t in tests:
3445 if not include_onlymatching and t.get('only_matching', False):
3446 continue
3447 t['name'] = type(self).__name__[:-len('IE')]
3448 yield t
3449
3450 def is_suitable(self, age_limit):
3451 """ Test whether the extractor is generally suitable for the given
3452 age limit (i.e. pornographic sites are not, all others usually are) """
3453
3454 any_restricted = False
3455 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3456 if tc.get('playlist', []):
05900629
PH
3457 tc = tc['playlist'][0]
3458 is_restricted = age_restricted(
3459 tc.get('info_dict', {}).get('age_limit'), age_limit)
3460 if not is_restricted:
3461 return True
3462 any_restricted = any_restricted or is_restricted
3463 return not any_restricted
3464
a504ced0 3465 def extract_subtitles(self, *args, **kwargs):
a06916d9 3466 if (self.get_param('writesubtitles', False)
3467 or self.get_param('listsubtitles')):
9868ea49
JMF
3468 return self._get_subtitles(*args, **kwargs)
3469 return {}
a504ced0
JMF
3470
3471 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3472 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3473
912e0b7e
YCH
3474 @staticmethod
3475 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3476 """ Merge subtitle items for one language. Items with duplicated URLs
3477 will be dropped. """
3478 list1_urls = set([item['url'] for item in subtitle_list1])
3479 ret = list(subtitle_list1)
3480 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3481 return ret
3482
    @classmethod
    def _merge_subtitles(cls, *dicts, **kwargs):
        """ Merge subtitle dictionaries, language by language.

        Accepts an optional keyword-only argument 'target': the dict to merge
        into (mutated in place and returned); defaults to a new dict.
        """

        target = (lambda target=None: target)(**kwargs)
        # The above lambda extracts the keyword argument 'target' from kwargs
        # while ensuring there are no stray ones (any unknown keyword raises
        # TypeError). When Python 2 support is dropped, remove it and change
        # the function signature to:
        #
        # def _merge_subtitles(cls, *dicts, target=None):

        if target is None:
            target = {}
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target
912e0b7e 3500
360e1ca5 3501 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3502 if (self.get_param('writeautomaticsub', False)
3503 or self.get_param('listsubtitles')):
9868ea49
JMF
3504 return self._get_automatic_captions(*args, **kwargs)
3505 return {}
360e1ca5
JMF
3506
3507 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3508 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3509
d77ab8e2 3510 def mark_watched(self, *args, **kwargs):
a06916d9 3511 if (self.get_param('mark_watched', False)
3089bc74 3512 and (self._get_login_info()[0] is not None
a06916d9 3513 or self.get_param('cookiefile') is not None)):
d77ab8e2
S
3514 self._mark_watched(*args, **kwargs)
3515
3516 def _mark_watched(self, *args, **kwargs):
3517 raise NotImplementedError('This method must be implemented by subclasses')
3518
38cce791
YCH
3519 def geo_verification_headers(self):
3520 headers = {}
a06916d9 3521 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3522 if geo_verification_proxy:
3523 headers['Ytdl-request-proxy'] = geo_verification_proxy
3524 return headers
3525
98763ee3
YCH
3526 def _generic_id(self, url):
3527 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3528
3529 def _generic_title(self, url):
3530 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3531
c224251a
M
3532 @staticmethod
3533 def _availability(is_private, needs_premium, needs_subscription, needs_auth, is_unlisted):
3534 all_known = all(map(
3535 lambda x: x is not None,
3536 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3537 return (
3538 'private' if is_private
3539 else 'premium_only' if needs_premium
3540 else 'subscriber_only' if needs_subscription
3541 else 'needs_auth' if needs_auth
3542 else 'unlisted' if is_unlisted
3543 else 'public' if all_known
3544 else None)
3545
8dbe9899 3546
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (one result), a positive number, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare key: return a single result.
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp the request to the extractor's maximum, with a warning.
            self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY