]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
Expand `--check-formats` to thumbnails
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
fd475508 1# coding: utf-8
6a3828fd 2from __future__ import unicode_literals
f1a9d64e 3
d6983cb4 4import base64
f4b1c7ad 5import datetime
3ec05685 6import hashlib
3d3538e4 7import json
4094b6e3 8import netrc
d6983cb4 9import os
773f291d 10import random
d6983cb4 11import re
d6983cb4 12import sys
4094b6e3 13import time
1bac3455 14import math
d6983cb4 15
8c25f81b 16from ..compat import (
6c22cee6 17 compat_cookiejar_Cookie,
f7ad7160 18 compat_cookies_SimpleCookie,
ee0ba927 19 compat_etree_Element,
e9c0cdd3 20 compat_etree_fromstring,
e64b7569 21 compat_getpass,
d391b7e2 22 compat_integer_types,
d6983cb4 23 compat_http_client,
e9c0cdd3
YCH
24 compat_os_name,
25 compat_str,
d6983cb4 26 compat_urllib_error,
98763ee3 27 compat_urllib_parse_unquote,
15707c7e 28 compat_urllib_parse_urlencode,
41d06b04 29 compat_urllib_request,
f0b5d6af 30 compat_urlparse,
e01c3d2e 31 compat_xml_parse_error,
8c25f81b 32)
eb8a4433 33from ..downloader import FileDownloader
48107c19
S
34from ..downloader.f4m import (
35 get_base_url,
36 remove_encrypted_media,
37)
8c25f81b 38from ..utils import (
c342041f 39 NO_DEFAULT,
05900629 40 age_restricted,
02dc0a36 41 base_url,
08f2a92c 42 bug_reports_message,
d6983cb4
PH
43 clean_html,
44 compiled_regex_type,
70f0f5a8 45 determine_ext,
46b18f23 46 determine_protocol,
d493f15c 47 dict_get,
9b9c5355 48 error_to_compat_str,
d6983cb4 49 ExtractorError,
46b18f23 50 extract_attributes,
97f4aecf 51 fix_xml_ampersands,
b14f3a4c 52 float_or_none,
773f291d
S
53 GeoRestrictedError,
54 GeoUtils,
31bb8d3f 55 int_or_none,
a4a554a7 56 js_to_json,
0685d972 57 JSON_LD_RE,
46b18f23 58 mimetype2ext,
3158150c 59 network_exceptions,
46b18f23 60 orderedSet,
d493f15c 61 parse_bitrate,
46b18f23
JH
62 parse_codecs,
63 parse_duration,
4ca2a3cf 64 parse_iso8601,
46b18f23 65 parse_m3u8_attributes,
d493f15c 66 parse_resolution,
55b3e45b 67 RegexNotFoundError,
5c2266df 68 sanitized_Request,
46b18f23 69 sanitize_filename,
d493f15c 70 str_or_none,
ce5b9040 71 str_to_int,
f856816b 72 strip_or_none,
f38de77f 73 unescapeHTML,
647eab45 74 unified_strdate,
6b3a3098 75 unified_timestamp,
46b18f23
JH
76 update_Request,
77 update_url_query,
78 urljoin,
a107193e 79 url_basename,
bebef109 80 url_or_none,
a6571f10 81 xpath_element,
8d6765cf
S
82 xpath_text,
83 xpath_with_ns,
d6983cb4 84)
c342041f 85
d6983cb4
PH
86
87class InfoExtractor(object):
88 """Information Extractor class.
89
90 Information extractors are the classes that, given a URL, extract
91 information about the video (or videos) the URL refers to. This
92 information includes the real video URL, the video title, author and
93 others. The information is stored in a dictionary which is then
5d380852 94 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
95 information possibly downloading the video to the file system, among
96 other possible outcomes.
97
cf0649f8 98 The type field determines the type of the result.
fed5d032
PH
99 By far the most common value (and the default if _type is missing) is
100 "video", which indicates a single video.
101
102 For a video, the dictionaries must include the following fields:
d6983cb4
PH
103
104 id: Video identifier.
d6983cb4 105 title: Video title, unescaped.
d67b0b15 106
f49d89ee 107 Additionally, it must contain either a formats entry or a url one:
d67b0b15 108
f49d89ee
PH
109 formats: A list of dictionaries for each format available, ordered
110 from worst to best quality.
111
112 Potential fields:
c790e93a
S
113 * url The mandatory URL representing the media:
114 for plain file media - HTTP URL of this file,
115 for RTMP - RTMP URL,
116 for HLS - URL of the M3U8 media playlist,
117 for HDS - URL of the F4M manifest,
79d2077e
S
118 for DASH
119 - HTTP URL to plain file media (in case of
120 unfragmented media)
121 - URL of the MPD manifest or base URL
122 representing the media if MPD manifest
8ed7a233 123 is parsed from a string (in case of
79d2077e 124 fragmented media)
c790e93a 125 for MSS - URL of the ISM manifest.
86f4d14f
S
126 * manifest_url
127 The URL of the manifest file in case of
c790e93a
S
128 fragmented media:
129 for HLS - URL of the M3U8 master playlist,
130 for HDS - URL of the F4M manifest,
131 for DASH - URL of the MPD manifest,
132 for MSS - URL of the ISM manifest.
10952eb2 133 * ext Will be calculated from URL if missing
d67b0b15
PH
134 * format A human-readable description of the format
135 ("mp4 container with h264/opus").
136 Calculated from the format_id, width, height.
137 and format_note fields if missing.
138 * format_id A short description of the format
5d4f3985
PH
139 ("mp4_h264_opus" or "19").
140 Technically optional, but strongly recommended.
d67b0b15
PH
141 * format_note Additional info about the format
142 ("3D" or "DASH video")
143 * width Width of the video, if known
144 * height Height of the video, if known
f49d89ee 145 * resolution Textual description of width and height
7217e148 146 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
147 * abr Average audio bitrate in KBit/s
148 * acodec Name of the audio codec in use
dd27fd17 149 * asr Audio sampling rate in Hertz
d67b0b15 150 * vbr Average video bitrate in KBit/s
fbb21cf5 151 * fps Frame rate
d67b0b15 152 * vcodec Name of the video codec in use
1394ce65 153 * container Name of the container format
d67b0b15 154 * filesize The number of bytes, if known in advance
9732d77e 155 * filesize_approx An estimate for the number of bytes
d67b0b15 156 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
157 * protocol The protocol that will be used for the actual
158 download, lower-case.
0fa9a1e2 159 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
af7d5a63 160 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
161 * fragment_base_url
162 Base URL for fragments. Each fragment's path
163 value (if present) will be relative to
164 this URL.
165 * fragments A list of fragments of a fragmented media.
166 Each fragment entry must contain either an url
167 or a path. If an url is present it should be
168 considered by a client. Otherwise both path and
169 fragment_base_url must be present. Here is
170 the list of all potential fields:
171 * "url" - fragment's URL
172 * "path" - fragment's path relative to
173 fragment_base_url
a0d5077c
S
174 * "duration" (optional, int or float)
175 * "filesize" (optional, int)
f49d89ee 176 * preference Order number of this format. If this field is
08d13955 177 present and not None, the formats get sorted
38d63d84 178 by this field, regardless of all other values.
f49d89ee
PH
179 -1 for default (order by other properties),
180 -2 or smaller for less than default.
e65566a9
PH
181 < -1000 to hide the format (if there is
182 another one which is strictly better)
32f90364
PH
183 * language Language code, e.g. "de" or "en-US".
184 * language_preference Is this in the language mentioned in
185 the URL?
aff2f4f4
PH
186 10 if it's what the URL is about,
187 -1 for default (don't know),
188 -10 otherwise, other values reserved for now.
5d73273f
PH
189 * quality Order number of the video quality of this
190 format, irrespective of the file format.
191 -1 for default (order by other properties),
192 -2 or smaller for less than default.
c64ed2a3
PH
193 * source_preference Order number for this video source
194 (quality takes higher priority)
195 -1 for default (order by other properties),
196 -2 or smaller for less than default.
d769be6c
PH
197 * http_headers A dictionary of additional HTTP headers
198 to add to the request.
6271f1ca 199 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
200 video's pixels are not square.
201 width : height ratio as float.
202 * no_resume The server does not support resuming the
203 (HTTP or RTMP) download. Boolean.
00c97e3e
S
204 * downloader_options A dictionary of downloader options as
205 described in FileDownloader
3b1fe47d 206 RTMP formats can also have the additional fields: page_url,
207 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
208 rtmp_protocol, rtmp_real_time
3dee7826 209
c0ba0f48 210 url: Final video URL.
d6983cb4 211 ext: Video filename extension.
d67b0b15
PH
212 format: The video format, defaults to ext (used for --get-format)
213 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 214
d6983cb4
PH
215 The following fields are optional:
216
f5e43bc6 217 alt_title: A secondary title of the video.
0afef30b
PH
218 display_id An alternative identifier for the video, not necessarily
219 unique, but available before title. Typically, id is
220 something like "4234987", title "Dancing naked mole rats",
221 and display_id "dancing-naked-mole-rats"
d5519808 222 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 223 * "id" (optional, string) - Thumbnail format ID
d5519808 224 * "url"
cfb56d1a 225 * "preference" (optional, int) - quality of the image
d5519808
PH
226 * "width" (optional, int)
227 * "height" (optional, int)
5e1c39ac 228 * "resolution" (optional, string "{width}x{height}",
d5519808 229 deprecated)
2de624fd 230 * "filesize" (optional, int)
d6983cb4 231 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 232 description: Full video description.
d6983cb4 233 uploader: Full name of the video uploader.
2bc0c46f 234 license: License name the video is licensed under.
8a92e51c 235 creator: The creator of the video.
10db0d2f 236 release_timestamp: UNIX timestamp of the moment the video was released.
8aab976b 237 release_date: The date (YYYYMMDD) when the video was released.
10db0d2f 238 timestamp: UNIX timestamp of the moment the video was uploaded
d6983cb4 239 upload_date: Video upload date (YYYYMMDD).
955c4514 240 If not explicitly set, calculated from timestamp.
d6983cb4 241 uploader_id: Nickname or id of the video uploader.
7bcd2830 242 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 243 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 244 Note that channel fields may or may not repeat uploader
6f1f59f3
S
245 fields. This depends on a particular extractor.
246 channel_id: Id of the channel.
247 channel_url: Full URL to a channel webpage.
da9ec3b9 248 location: Physical location where the video was filmed.
a504ced0 249 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
250 {tag: subformats}. "tag" is usually a language code, and
251 "subformats" is a list sorted from lower to higher
252 preference, each element is a dictionary with the "ext"
253 entry and one of:
a504ced0 254 * "data": The subtitles file contents
10952eb2 255 * "url": A URL pointing to the subtitles file
2412044c 256 It can optionally also have:
257 * "name": Name or description of the subtitles
4bba3716 258 "ext" will be calculated from URL if missing
e167860c 259 automatic_captions: Like 'subtitles'; contains automatically generated
260 captions instead of normal subtitles
62d231c0 261 duration: Length of the video in seconds, as an integer or float.
f3d29461 262 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
263 like_count: Number of positive ratings of the video
264 dislike_count: Number of negative ratings of the video
02835c6b 265 repost_count: Number of reposts of the video
2d30521a 266 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 267 comment_count: Number of comments on the video
dd622d7c
PH
268 comments: A list of comments, each with one or more of the following
269 properties (all but one of text or html optional):
270 * "author" - human-readable name of the comment author
271 * "author_id" - user ID of the comment author
a1c5d2ca 272 * "author_thumbnail" - The thumbnail of the comment author
dd622d7c
PH
273 * "id" - Comment ID
274 * "html" - Comment as HTML
275 * "text" - Plain text of the comment
276 * "timestamp" - UNIX timestamp of comment
277 * "parent" - ID of the comment this one is replying to.
278 Set to "root" to indicate that this is a
279 comment to the original video.
a1c5d2ca
M
280 * "like_count" - Number of positive ratings of the comment
281 * "dislike_count" - Number of negative ratings of the comment
282 * "is_favorited" - Whether the comment is marked as
283 favorite by the video uploader
284 * "author_is_uploader" - Whether the comment is made by
285 the video uploader
8dbe9899 286 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 287 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
288 should allow to get the same result again. (It will be set
289 by YoutubeDL if it's missing)
ad3bc6ac
PH
290 categories: A list of categories that the video falls in, for example
291 ["Sports", "Berlin"]
864f24bd 292 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
293 is_live: True, False, or None (=unknown). Whether this video is a
294 live stream that goes on instead of a fixed-length video.
f76ede8e 295 was_live: True, False, or None (=unknown). Whether this video was
296 originally a live stream.
7c80519c 297 start_time: Time in seconds where the reproduction should start, as
10952eb2 298 specified in the URL.
297a564b 299 end_time: Time in seconds where the reproduction should end, as
10952eb2 300 specified in the URL.
55949fed 301 chapters: A list of dictionaries, with the following entries:
302 * "start_time" - The start time of the chapter in seconds
303 * "end_time" - The end time of the chapter in seconds
304 * "title" (optional, string)
6cfda058 305 playable_in_embed: Whether this video is allowed to play in embedded
306 players on other sites. Can be True (=always allowed),
307 False (=never allowed), None (=unknown), or a string
c224251a
M
308 specifying the criteria for embedability (Eg: 'whitelist')
309 availability: Under what condition the video is available. One of
310 'private', 'premium_only', 'subscriber_only', 'needs_auth',
311 'unlisted' or 'public'. Use 'InfoExtractor._availability'
312 to set it
277d6ff5 313 __post_extractor: A function to be called just before the metadata is
314 written to either disk, logger or console. The function
315 must return a dict which will be added to the info_dict.
316 This is useful for additional information that is
317 time-consuming to extract. Note that the fields thus
318 extracted will not be available to output template and
319 match_filter. So, only "comments" and "comment_count" are
320 currently allowed to be extracted via this method.
d6983cb4 321
7109903e
S
322 The following fields should only be used when the video belongs to some logical
323 chapter or section:
324
325 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
326 chapter_number: Number of the chapter the video belongs to, as an integer.
327 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
328
329 The following fields should only be used when the video is an episode of some
8d76bdf1 330 series, programme or podcast:
7109903e
S
331
332 series: Title of the series or programme the video episode belongs to.
333 season: Title of the season the video episode belongs to.
27bfd4e5
S
334 season_number: Number of the season the video episode belongs to, as an integer.
335 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
336 episode: Title of the video episode. Unlike mandatory video title field,
337 this field should denote the exact title of the video episode
338 without any kind of decoration.
27bfd4e5
S
339 episode_number: Number of the video episode within a season, as an integer.
340 episode_id: Id of the video episode, as a unicode string.
7109903e 341
7a93ab5f
S
342 The following fields should only be used when the media is a track or a part of
343 a music album:
344
345 track: Title of the track.
346 track_number: Number of the track within an album or a disc, as an integer.
347 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
348 as a unicode string.
349 artist: Artist(s) of the track.
350 genre: Genre(s) of the track.
351 album: Title of the album the track belongs to.
352 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
353 album_artist: List of all artists appeared on the album (e.g.
354 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
355 and compilations).
356 disc_number: Number of the disc or other physical medium the track belongs to,
357 as an integer.
358 release_year: Year (YYYY) when the album was released.
359
deefc05b 360 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 361
d838b1bd
PH
362 Unless mentioned otherwise, None is equivalent to absence of information.
363
fed5d032
PH
364
365 _type "playlist" indicates multiple videos.
b82f815f
PH
366 There must be a key "entries", which is a list, an iterable, or a PagedList
367 object, each element of which is a valid dictionary by this specification.
fed5d032 368
b60419c5 369 Additionally, playlists can have "id", "title", and any other relevant
370 attributes with the same semantics as videos (see above).
fed5d032
PH
371
372
373 _type "multi_video" indicates that there are multiple videos that
374 form a single show, for example multiple acts of an opera or TV episode.
375 It must have an entries key like a playlist and contain all the keys
376 required for a video at the same time.
377
378
379 _type "url" indicates that the video must be extracted from another
380 location, possibly by a different extractor. Its only required key is:
381 "url" - the next URL to extract.
f58766ce
PH
382 The key "ie_key" can be set to the class name (minus the trailing "IE",
383 e.g. "Youtube") if the extractor class is known in advance.
384 Additionally, the dictionary may have any properties of the resolved entity
385 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
386 known ahead of time.
387
388
389 _type "url_transparent" entities have the same specification as "url", but
390 indicate that the given additional information is more precise than the one
391 associated with the resolved URL.
392 This is useful when a site employs a video service that hosts the video and
393 its technical metadata, but that video service does not embed a useful
394 title, description etc.
395
396
d6983cb4
PH
397 Subclasses of this one should re-define the _real_initialize() and
398 _real_extract() methods and define a _VALID_URL regexp.
399 Probably, they should also be added to the list of extractors.
400
4248dad9 401 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
402 geo restriction bypass mechanisms for a particular extractor.
403 Though it won't disable explicit geo restriction bypass based on
504f20dd 404 country code provided with geo_bypass_country.
4248dad9
S
405
406 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
407 countries for this extractor. One of these countries will be used by
408 geo restriction bypass mechanism right away in order to bypass
504f20dd 409 geo restriction, of course, if the mechanism is not disabled.
773f291d 410
5f95927a
S
411 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
412 IP blocks in CIDR notation for this extractor. One of these IP blocks
413 will be used by geo restriction bypass mechanism similarly
504f20dd 414 to _GEO_COUNTRIES.
3ccdde8c 415
d6983cb4
PH
416 Finally, the _WORKING attribute should be set to False for broken IEs
417 in order to warn the users and skip the tests.
418 """
419
420 _ready = False
421 _downloader = None
773f291d 422 _x_forwarded_for_ip = None
4248dad9
S
423 _GEO_BYPASS = True
424 _GEO_COUNTRIES = None
5f95927a 425 _GEO_IP_BLOCKS = None
d6983cb4
PH
426 _WORKING = True
427
9d5d4d64 428 _LOGIN_HINTS = {
429 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
430 'cookies': (
431 'Use --cookies for the authentication. '
432 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
433 'password': 'Use --username and --password or --netrc to provide account credentials',
434 }
435
d6983cb4
PH
436 def __init__(self, downloader=None):
437 """Constructor. Receives an optional downloader."""
438 self._ready = False
773f291d 439 self._x_forwarded_for_ip = None
d6983cb4
PH
440 self.set_downloader(downloader)
441
442 @classmethod
443 def suitable(cls, url):
444 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
445
446 # This does not use has/getattr intentionally - we want to know whether
447 # we have cached the regexp for *this* class, whereas getattr would also
448 # match the superclass
449 if '_VALID_URL_RE' not in cls.__dict__:
450 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
451 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 452
ed9266db
PH
453 @classmethod
454 def _match_id(cls, url):
455 if '_VALID_URL_RE' not in cls.__dict__:
456 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
457 m = cls._VALID_URL_RE.match(url)
458 assert m
1afd0b0d 459 return compat_str(m.group('id'))
ed9266db 460
d6983cb4
PH
461 @classmethod
462 def working(cls):
463 """Getter method for _WORKING."""
464 return cls._WORKING
465
466 def initialize(self):
467 """Initializes an instance (authentication, etc)."""
5f95927a
S
468 self._initialize_geo_bypass({
469 'countries': self._GEO_COUNTRIES,
470 'ip_blocks': self._GEO_IP_BLOCKS,
471 })
4248dad9
S
472 if not self._ready:
473 self._real_initialize()
474 self._ready = True
475
5f95927a 476 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
477 """
478 Initialize geo restriction bypass mechanism.
479
480 This method is used to initialize geo bypass mechanism based on faking
481 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 482 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
483 IP will be passed as X-Forwarded-For HTTP header in all subsequent
484 HTTP requests.
e39b5d4a
S
485
486 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
487 during the instance initialization with _GEO_COUNTRIES and
488 _GEO_IP_BLOCKS.
e39b5d4a 489
5f95927a 490 You may also manually call it from extractor's code if geo bypass
e39b5d4a 491 information is not available beforehand (e.g. obtained during
5f95927a
S
492 extraction) or due to some other reason. In this case you should pass
493 this information in geo bypass context passed as first argument. It may
494 contain following fields:
495
496 countries: List of geo unrestricted countries (similar
497 to _GEO_COUNTRIES)
498 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
499 (similar to _GEO_IP_BLOCKS)
500
e39b5d4a 501 """
773f291d 502 if not self._x_forwarded_for_ip:
5f95927a
S
503
504 # Geo bypass mechanism is explicitly disabled by user
a06916d9 505 if not self.get_param('geo_bypass', True):
5f95927a
S
506 return
507
508 if not geo_bypass_context:
509 geo_bypass_context = {}
510
511 # Backward compatibility: previously _initialize_geo_bypass
512 # expected a list of countries, some 3rd party code may still use
513 # it this way
514 if isinstance(geo_bypass_context, (list, tuple)):
515 geo_bypass_context = {
516 'countries': geo_bypass_context,
517 }
518
519 # The whole point of geo bypass mechanism is to fake IP
520 # as X-Forwarded-For HTTP header based on some IP block or
521 # country code.
522
523 # Path 1: bypassing based on IP block in CIDR notation
524
525 # Explicit IP block specified by user, use it right away
526 # regardless of whether extractor is geo bypassable or not
a06916d9 527 ip_block = self.get_param('geo_bypass_ip_block', None)
5f95927a
S
528
529 # Otherwise use random IP block from geo bypass context but only
530 # if extractor is known as geo bypassable
531 if not ip_block:
532 ip_blocks = geo_bypass_context.get('ip_blocks')
533 if self._GEO_BYPASS and ip_blocks:
534 ip_block = random.choice(ip_blocks)
535
536 if ip_block:
537 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
0760b0a7 538 self._downloader.write_debug(
539 '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
5f95927a
S
540 return
541
542 # Path 2: bypassing based on country code
543
544 # Explicit country code specified by user, use it right away
545 # regardless of whether extractor is geo bypassable or not
a06916d9 546 country = self.get_param('geo_bypass_country', None)
5f95927a
S
547
548 # Otherwise use random country code from geo bypass context but
549 # only if extractor is known as geo bypassable
550 if not country:
551 countries = geo_bypass_context.get('countries')
552 if self._GEO_BYPASS and countries:
553 country = random.choice(countries)
554
555 if country:
556 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
0760b0a7 557 self._downloader.write_debug(
558 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
d6983cb4
PH
559
560 def extract(self, url):
561 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 562 try:
773f291d
S
563 for _ in range(2):
564 try:
565 self.initialize()
a06916d9 566 self.write_debug('Extracting URL: %s' % url)
0016b84e 567 ie_result = self._real_extract(url)
07cce701 568 if ie_result is None:
569 return None
0016b84e
S
570 if self._x_forwarded_for_ip:
571 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
53ed7066 572 subtitles = ie_result.get('subtitles')
573 if (subtitles and 'live_chat' in subtitles
a06916d9 574 and 'no-live-chat' in self.get_param('compat_opts', [])):
53ed7066 575 del subtitles['live_chat']
0016b84e 576 return ie_result
773f291d 577 except GeoRestrictedError as e:
4248dad9
S
578 if self.__maybe_fake_ip_and_retry(e.countries):
579 continue
773f291d 580 raise
3a5bcd03
PH
581 except ExtractorError:
582 raise
583 except compat_http_client.IncompleteRead as e:
dfb1b146 584 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 585 except (KeyError, StopIteration) as e:
dfb1b146 586 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 587
4248dad9 588 def __maybe_fake_ip_and_retry(self, countries):
a06916d9 589 if (not self.get_param('geo_bypass_country', None)
3089bc74 590 and self._GEO_BYPASS
a06916d9 591 and self.get_param('geo_bypass', True)
3089bc74
S
592 and not self._x_forwarded_for_ip
593 and countries):
eea0716c
S
594 country_code = random.choice(countries)
595 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
596 if self._x_forwarded_for_ip:
597 self.report_warning(
eea0716c
S
598 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
599 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
600 return True
601 return False
602
d6983cb4
PH
603 def set_downloader(self, downloader):
604 """Sets the downloader for this IE."""
605 self._downloader = downloader
606
607 def _real_initialize(self):
608 """Real initialization process. Redefine in subclasses."""
609 pass
610
611 def _real_extract(self, url):
612 """Real extraction process. Redefine in subclasses."""
613 pass
614
56c73665
JMF
615 @classmethod
616 def ie_key(cls):
617 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 618 return compat_str(cls.__name__[:-2])
56c73665 619
d6983cb4
PH
620 @property
621 def IE_NAME(self):
dc519b54 622 return compat_str(type(self).__name__[:-2])
d6983cb4 623
d391b7e2
S
624 @staticmethod
625 def __can_accept_status_code(err, expected_status):
626 assert isinstance(err, compat_urllib_error.HTTPError)
627 if expected_status is None:
628 return False
629 if isinstance(expected_status, compat_integer_types):
630 return err.code == expected_status
631 elif isinstance(expected_status, (list, tuple)):
632 return err.code in expected_status
633 elif callable(expected_status):
634 return expected_status(err.code) is True
635 else:
636 assert False
637
638 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
639 """
640 Return the response handle.
641
642 See _download_webpage docstring for arguments specification.
643 """
1cf376f5 644 if not self._downloader._first_webpage_request:
a06916d9 645 sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
1cf376f5 646 if sleep_interval > 0:
5ef7d9bd 647 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 648 time.sleep(sleep_interval)
649 else:
650 self._downloader._first_webpage_request = False
651
d6983cb4
PH
652 if note is None:
653 self.report_download_webpage(video_id)
654 elif note is not False:
7cc3570e 655 if video_id is None:
f1a9d64e 656 self.to_screen('%s' % (note,))
7cc3570e 657 else:
f1a9d64e 658 self.to_screen('%s: %s' % (video_id, note))
2132edaa
S
659
660 # Some sites check X-Forwarded-For HTTP header in order to figure out
661 # the origin of the client behind proxy. This allows bypassing geo
662 # restriction by faking this header's value to IP that belongs to some
663 # geo unrestricted country. We will do so once we encounter any
664 # geo restriction error.
665 if self._x_forwarded_for_ip:
666 if 'X-Forwarded-For' not in headers:
667 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
668
41d06b04
S
669 if isinstance(url_or_request, compat_urllib_request.Request):
670 url_or_request = update_Request(
671 url_or_request, data=data, headers=headers, query=query)
672 else:
cdfee168 673 if query:
674 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 675 if data is not None or headers:
41d06b04 676 url_or_request = sanitized_Request(url_or_request, data, headers)
d6983cb4 677 try:
dca08720 678 return self._downloader.urlopen(url_or_request)
3158150c 679 except network_exceptions as err:
d391b7e2
S
680 if isinstance(err, compat_urllib_error.HTTPError):
681 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
682 # Retain reference to error to prevent file object from
683 # being closed before it can be read. Works around the
684 # effects of <https://bugs.python.org/issue15002>
685 # introduced in Python 3.4.1.
686 err.fp._error = err
d391b7e2
S
687 return err.fp
688
aa94a6d3
PH
689 if errnote is False:
690 return False
d6983cb4 691 if errnote is None:
f1a9d64e 692 errnote = 'Unable to download webpage'
7f8b2714 693
9b9c5355 694 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
695 if fatal:
696 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
697 else:
6a39ee13 698 self.report_warning(errmsg)
7cc3570e 699 return False
d6983cb4 700
d391b7e2
S
701 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
702 """
703 Return a tuple (page content as string, URL handle).
704
705 See _download_webpage docstring for arguments specification.
706 """
b9d3e163
PH
707 # Strip hashes from the URL (#1038)
708 if isinstance(url_or_request, (compat_str, str)):
709 url_or_request = url_or_request.partition('#')[0]
710
d391b7e2 711 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
712 if urlh is False:
713 assert not fatal
714 return False
c9a77969 715 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
716 return (content, urlh)
717
@staticmethod
def _guess_encoding_from_content(content_type, webpage_bytes):
    """Guess the text encoding of a response.

    Checks, in order: the charset parameter of the Content-Type header,
    a <meta charset=...> tag in the first KiB, a UTF-16 LE BOM, and
    finally falls back to UTF-8.
    """
    header_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if header_match:
        return header_match.group(1)
    meta_match = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                           webpage_bytes[:1024])
    if meta_match:
        return meta_match.group(1).decode('ascii')
    if webpage_bytes.startswith(b'\xff\xfe'):
        return 'utf-16'
    return 'utf-8'
def __check_blocked(self, content):
    """Raise ExtractorError if content is a known censorship/filtering block page."""
    first_block = content[:512]
    # Websense corporate filtering: title anywhere, vendor marker in the first 512 bytes
    if 'Websense' in first_block and '<title>Access to this site is blocked</title>' in content:
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    # Indian government censorship block page
    if '<title>The URL you requested has been blocked</title>' in first_block:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)
    # Russian government blocklist (rkn.gov.ru)
    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
            and 'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read and decode the body of an already-opened URL handle.

    Honors the dump_intermediate_pages and write_pages options and raises
    on known censorship block pages (via __check_blocked).
    """
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # `prefix` lets callers re-attach bytes already consumed from the stream
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if not encoding:
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        # base64 so binary/odd-encoding bodies survive the console
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self.get_param('write_pages', False):
        basen = '%s_%s' % (video_id, urlh.geturl())
        # Cap the basename at 240 chars, replacing the tail with an md5 hash
        if len(basen) > 240:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    # An unknown/bogus declared encoding falls back to utf-8
    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        content = webpage_bytes.decode('utf-8', 'replace')

    self.__check_blocked(content)

    return content
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=5, encoding=None, data=None,
        headers={}, query={}, expected_status=None):
    """
    Return the data of the page as a string.

    Arguments:
    url_or_request -- plain text URL as a string or
        a compat_urllib_request.Requestobject
    video_id -- Video/playlist/item identifier (string)

    Keyword arguments:
    note -- note printed before downloading (string)
    errnote -- note printed in case of an error (string)
    fatal -- flag denoting whether error should be considered fatal,
        i.e. whether it should cause ExtractionError to be raised,
        otherwise a warning will be reported and extraction continued
    tries -- number of tries
    timeout -- sleep interval between tries
    encoding -- encoding for a page content decoding, guessed automatically
        when not explicitly specified
    data -- POST data (bytes)
    headers -- HTTP headers (dict)
    query -- URL query (dict)
    expected_status -- allows to accept failed HTTP requests (non 2xx
        status code) by explicitly specifying a set of accepted status
        codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
        Note that this argument does not affect success status codes (2xx)
        which are always accepted.
    """

    # Retry on truncated responses up to `tries` times
    attempt = 0
    while True:
        try:
            res = self._download_webpage_handle(
                url_or_request, video_id, note, errnote, fatal,
                encoding=encoding, data=data, headers=headers, query=query,
                expected_status=expected_status)
            break
        except compat_http_client.IncompleteRead as e:
            attempt += 1
            if attempt >= tries:
                raise e
            self._sleep(timeout, video_id)
    if res is False:
        return False
    content, _ = res
    return content
def _download_xml_handle(
        self, url_or_request, video_id, note='Downloading XML',
        errnote='Unable to download XML', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return a tuple (xml as an compat_etree_Element, URL handle).

    See _download_webpage docstring for arguments specification.
    """
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return False
    xml_string, urlh = res
    xml_doc = self._parse_xml(
        xml_string, video_id, transform_source=transform_source,
        fatal=fatal)
    return xml_doc, urlh
def _download_xml(
        self, url_or_request, video_id,
        note='Downloading XML', errnote='Unable to download XML',
        transform_source=None, fatal=True, encoding=None,
        data=None, headers={}, query={}, expected_status=None):
    """
    Return the xml as an compat_etree_Element.

    See _download_webpage docstring for arguments specification.
    """
    res = self._download_xml_handle(
        url_or_request, video_id, note=note, errnote=errnote,
        transform_source=transform_source, fatal=fatal, encoding=encoding,
        data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return res
    return res[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
    """Parse xml_string into an etree element; raise (fatal) or warn and return None on failure."""
    if transform_source:
        xml_string = transform_source(xml_string)
    try:
        return compat_etree_fromstring(xml_string.encode('utf-8'))
    except compat_xml_parse_error as ve:
        errmsg = '%s: Failed to parse XML ' % video_id
        if not fatal:
            self.report_warning(errmsg + str(ve))
            return None
        raise ExtractorError(errmsg, cause=ve)
def _download_json_handle(
        self, url_or_request, video_id, note='Downloading JSON metadata',
        errnote='Unable to download JSON metadata', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return a tuple (JSON object, URL handle).

    See _download_webpage docstring for arguments specification.
    """
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return False
    json_string, urlh = res
    parsed = self._parse_json(
        json_string, video_id, transform_source=transform_source,
        fatal=fatal)
    return parsed, urlh
def _download_json(
        self, url_or_request, video_id, note='Downloading JSON metadata',
        errnote='Unable to download JSON metadata', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return the JSON object as a dict.

    See _download_webpage docstring for arguments specification.
    """
    res = self._download_json_handle(
        url_or_request, video_id, note=note, errnote=errnote,
        transform_source=transform_source, fatal=fatal, encoding=encoding,
        data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return res
    return res[0]
946 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
947 if transform_source:
948 json_string = transform_source(json_string)
3d3538e4
PH
949 try:
950 return json.loads(json_string)
951 except ValueError as ve:
e7b6d122
PH
952 errmsg = '%s: Failed to parse JSON ' % video_id
953 if fatal:
954 raise ExtractorError(errmsg, cause=ve)
955 else:
956 self.report_warning(errmsg + str(ve))
3d3538e4 957
def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
    """Parse the JSON object embedded in a socket.io style response payload."""
    start = data.find('{')
    end = data.rfind('}') + 1
    return self._parse_json(data[start:end], video_id, transform_source, fatal)
def _download_socket_json_handle(
        self, url_or_request, video_id, note='Polling socket',
        errnote='Unable to poll socket', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return a tuple (JSON object, URL handle).

    See _download_webpage docstring for arguments specification.
    """
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return False
    webpage, urlh = res
    parsed = self._parse_socket_response_as_json(
        webpage, video_id, transform_source=transform_source,
        fatal=fatal)
    return parsed, urlh
def _download_socket_json(
        self, url_or_request, video_id, note='Polling socket',
        errnote='Unable to poll socket', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return the JSON object as a dict.

    See _download_webpage docstring for arguments specification.
    """
    res = self._download_socket_json_handle(
        url_or_request, video_id, note=note, errnote=errnote,
        transform_source=transform_source, fatal=fatal, encoding=encoding,
        data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return res
    return res[0]
def report_warning(self, msg, video_id=None, *args, **kwargs):
    """Forward a warning to the downloader, prefixed with the IE name and optional video id."""
    prefix = '%s: ' % video_id if video_id is not None else ''
    message = '[%s] %s%s' % (self.IE_NAME, prefix, msg)
    self._downloader.report_warning(message, *args, **kwargs)
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    tagged = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged, *args, **kwargs)
def write_debug(self, msg, *args, **kwargs):
    """Forward a debug message to the downloader, prefixed with the IE name."""
    tagged = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.write_debug(tagged, *args, **kwargs)
def get_param(self, name, default=None, *args, **kwargs):
    """Look up a downloader option, falling back to default when no downloader is attached."""
    downloader = self._downloader
    if not downloader:
        return default
    return downloader.params.get(name, default, *args, **kwargs)
def report_extraction(self, id_or_name):
    """Report information extraction."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
def report_login(self):
    """Report attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method='any'):
    """Raise a login-required ExtractorError, or only warn when metadata is
    still usable and the user passed --ignore-no-formats-error.

    BUG FIX: previously the metadata_available branch issued the warning but
    then fell through and raised anyway; now it returns after warning, matching
    the behavior of raise_geo_restricted/raise_no_formats.
    """
    if metadata_available and self.get_param('ignore_no_formats_error'):
        self.report_warning(msg)
        return
    raise ExtractorError('%s. %s' % (msg, self._LOGIN_HINTS[method]), expected=True)
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """Raise GeoRestrictedError, or only warn when metadata is still usable
    and the user passed --ignore-no-formats-error."""
    if not (metadata_available and self.get_param('ignore_no_formats_error')):
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Raise a no-formats ExtractorError, or only warn when the failure is
    expected and the user passed --ignore-no-formats-error."""
    if not (expected and self.get_param('ignore_no_formats_error')):
        raise ExtractorError(msg, expected=expected, video_id=video_id)
    self.report_warning(msg, video_id)
5f6a1245 1055 # Methods for following #608
c0d0b01f 1056 @staticmethod
830d53bf 1057 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 1058 """Returns a URL that points to a page that should be processed"""
5f6a1245 1059 # TODO: ie should be the class used for getting the info
d6983cb4
PH
1060 video_info = {'_type': 'url',
1061 'url': url,
1062 'ie_key': ie}
7012b23c
PH
1063 if video_id is not None:
1064 video_info['id'] = video_id
830d53bf
S
1065 if video_title is not None:
1066 video_info['title'] = video_title
d6983cb4 1067 return video_info
5f6a1245 1068
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
    """Build a playlist result from regex matches, de-duplicating URL entries in order."""
    def to_entry(match):
        target = getter(match) if getter else match
        return self.url_result(self._proto_relative_url(target), ie)

    entries = orderedSet(to_entry(m) for m in matches)
    return self.playlist_result(
        entries, playlist_id=playlist_id, playlist_title=playlist_title)
c0d0b01f 1076 @staticmethod
b60419c5 1077 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
d6983cb4
PH
1078 """Returns a playlist"""
1079 video_info = {'_type': 'playlist',
1080 'entries': entries}
b60419c5 1081 video_info.update(kwargs)
d6983cb4
PH
1082 if playlist_id:
1083 video_info['id'] = playlist_id
1084 if playlist_title:
1085 video_info['title'] = playlist_title
ecc97af3 1086 if playlist_description is not None:
acf5cbfe 1087 video_info['description'] = playlist_description
d6983cb4
PH
1088 return video_info
1089
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for candidate in pattern:
            mobj = re.search(candidate, string, flags)
            if mobj:
                break

    # Colorize the field name when stderr is an interactive terminal
    if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        return mobj.group(group)
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return clean_html(res).strip() if res else res
def _get_netrc_login_info(self, netrc_machine=None):
    """Look up (username, password) for netrc_machine (default: _NETRC_MACHINE)
    in the user's .netrc. Only consulted when --netrc is enabled; returns
    (None, None) otherwise or on lookup failure (which only warns)."""
    username, password = None, None
    netrc_machine = netrc_machine or self._NETRC_MACHINE

    if self.get_param('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(netrc_machine)
            if info is None:
                raise netrc.NetrcParseError(
                    'No authenticators for %s' % netrc_machine)
            username, password = info[0], info[2]
        except (IOError, netrc.NetrcParseError) as err:
            self.report_warning(
                'parsing .netrc: %s' % error_to_compat_str(err))

    return username, password
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
    """
    Get the login info as (username, password)
    First look for the manually specified credentials using username_option
    and password_option as keys in params dictionary. If no such credentials
    available look in the netrc file using the netrc_machine or _NETRC_MACHINE
    value.
    If there's no info available, return (None, None)
    """
    # Attempt to use provided username and password or .netrc data
    username = self.get_param(username_option)
    if username is None:
        return self._get_netrc_login_info(netrc_machine)
    return username, self.get_param(password_option)
def _get_tfa_info(self, note='two-factor verification code'):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    tfa = self.get_param('twofactor')
    if tfa is None:
        tfa = compat_getpass('Type %s and press [Return]: ' % note)
    return tfa
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
    """Return regexes matching <meta ... og:prop ... content=...> in either attribute order."""
    content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
    property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                   % {'prop': re.escape(prop)})
    template = r'<meta[^>]+?%s[^>]+?%s'
    return [template % pair for pair in (
        (property_re, content_re),
        (content_re, property_re),
    )]
@staticmethod
def _meta_regex(prop):
    """Regex matching a <meta> tag whose identifying attribute equals prop,
    capturing its content attribute as group 'content'.
    The pattern is verbose (?x), so internal whitespace is insignificant."""
    escaped = re.escape(prop)
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % escaped
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search html for one or more OpenGraph properties and return the unescaped content."""
    props = prop if isinstance(prop, (list, tuple)) else [prop]
    if name is None:
        name = 'OpenGraph %s' % props[0]
    og_regexes = []
    for p in props:
        og_regexes.extend(self._og_regexes(p))
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    if escaped is None:
        return None
    return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract the og:image thumbnail URL (non-fatal)."""
    thumbnail = self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
    return thumbnail
def _og_search_description(self, html, **kargs):
    """Extract the og:description text (non-fatal)."""
    description = self._og_search_property('description', html, fatal=False, **kargs)
    return description
def _og_search_title(self, html, **kargs):
    """Extract the og:title value (fatal unless overridden via kargs)."""
    title = self._og_search_property('title', html, **kargs)
    return title
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Extract og:video / og:video:url, preferring og:video:secure_url when secure."""
    candidates = []
    if secure:
        candidates += self._og_regexes('video:secure_url')
    candidates += self._og_regexes('video')
    candidates += self._og_regexes('video:url')
    return self._html_search_regex(candidates, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Extract the og:url canonical URL."""
    url = self._og_search_property('url', html, **kargs)
    return url
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search html for <meta> tags named by name (a string or a list of
    fallback names) and return the first content value found."""
    names = name if isinstance(name, (list, tuple)) else [name]
    if display_name is None:
        display_name = names[0]
    patterns = [self._meta_regex(n) for n in names]
    return self._html_search_regex(
        patterns, html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Extract the uploader from the Dublin Core dc.creator meta tag."""
    uploader = self._html_search_meta('dc.creator', html, 'uploader')
    return uploader
def _rta_search(self, html):
    """Return an age limit of 18 when the RTA adult label is present, else 0.

    See http://www.rtalabel.org/index.php?content=howtofaq#single
    """
    rta_pattern = (r'(?ix)<meta\s+name="rating"\s+'
                   r' content="RTA-5042-1996-1400-1577-RTA"')
    return 18 if re.search(rta_pattern, html) else 0
def _media_rating_search(self, html):
    """Map a <meta name="rating"> value to an age limit.

    See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    """
    rating = self._html_search_meta('rating', html)
    if not rating:
        return None
    return {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }.get(rating.lower())
def _family_friendly_search(self, html):
    """Map the schema.org isFamilyFriendly meta value to an age limit (0 or 18).

    See http://schema.org/VideoObject
    """
    family_friendly = self._html_search_meta(
        'isFamilyFriendly', html, default=None)
    if not family_friendly:
        return None
    return {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Extract the Twitter card player URL from the twitter:player meta tag."""
    player_url = self._html_search_meta('twitter:player', html, 'twitter card player')
    return player_url
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """Find all JSON-LD blocks in html, parse them and merge into an info dict.

    `default`/`fatal` keyword arguments behave like in _search_regex.
    """
    json_ld_list = list(re.finditer(JSON_LD_RE, html))
    default = kwargs.get('default', NO_DEFAULT)
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    # BUG FIX: compare the NO_DEFAULT sentinel with `is`, not `==` -- a
    # caller-supplied default with a custom __eq__ could otherwise be
    # mistaken for "no default given" (the check below already uses `is`).
    fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
    json_ld = []
    for mobj in json_ld_list:
        json_ld_item = self._parse_json(
            mobj.group('json_ld'), video_id, fatal=fatal)
        if not json_ld_item:
            continue
        if isinstance(json_ld_item, dict):
            json_ld.append(json_ld_item)
        elif isinstance(json_ld_item, (list, tuple)):
            json_ld.extend(json_ld_item)
    if json_ld:
        json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
    if json_ld:
        return json_ld
    if default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    else:
        self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
        return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """Convert parsed JSON-LD data (string, dict, or list of dicts) into an
    info dict, merging metadata from every top-level entity that matches
    expected_type (or from all entities when expected_type is None).
    None-valued fields are stripped from the result.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}
    if not isinstance(json_ld, (list, tuple, dict)):
        return info
    # Normalize to a list of entities
    if isinstance(json_ld, dict):
        json_ld = [json_ld]

    # Maps schema.org interaction @type suffixes to info-dict count kinds
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def extract_interaction_type(e):
        # interactionType may be a plain string or a nested typed object
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Accumulate *_count fields from InteractionCounter entries into `info`
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not isinstance(is_e, dict):
                continue
            if is_e.get('@type') != 'InteractionCounter':
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # First occurrence wins; do not overwrite an existing count
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_video_object(e):
        # Merge a schema.org VideoObject entity into `info`
        assert e['@type'] == 'VideoObject'
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
            'filesize': float_or_none(e.get('contentSize')),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
        })
        extract_interaction_statistic(e)

    for e in json_ld:
        # Only process top-level JSON-LD entities (those declaring @context)
        if '@context' in e:
            item_type = e.get('@type')
            if expected_type is not None and expected_type != item_type:
                continue
            if item_type in ('TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                # Fall back to the episode name as title
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Movie':
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif item_type in ('Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody')),
                })
            elif item_type == 'VideoObject':
                extract_video_object(e)
                # With a specific expected_type the first match ends the scan;
                # otherwise keep merging data from the remaining entities
                if expected_type is None:
                    continue
                else:
                    break
            # Entities of other types may still embed a VideoObject
            video = e.get('video')
            if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break
    return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod
def _hidden_inputs(html):
    """Collect name (or id) -> value pairs from <input type=hidden|submit>
    fields in html, ignoring commented-out markup."""
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_el)
        # BUG FIX: the old guard tested `if not input` *after* already using
        # it, which could never trigger (findall only yields non-empty
        # strings); skip attribute-less tags instead.
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Locate the <form> with the given id attribute and return its hidden inputs."""
    pattern = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_body = self._search_regex(
        pattern, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_body)
    class FormatSort:
        """Computes per-format sort keys for `_sort_formats`.

        Known fields, their aliases and value-conversion rules live in
        `settings`.  The active sort order is assembled from the user's and
        the extractor's preferences by `evaluate_params`; `calculate_preference`
        then returns, for one format dict, a tuple of per-field sub-keys
        suitable for use as a `list.sort` key.
        """
        # One sort term: optional '+' (reverse), a field name, then an optional
        # ':limit' (prefer values up to limit) or '~limit' (prefer closest to limit).
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'format_id')  # These must not be aliases
        # Sort order mimicking youtube-dl's behaviour (not used by default here).
        ytdl_default = ('hasaud', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'format_id')

        # Per-field configuration.  'type' selects the evaluation strategy
        # ('ordered', 'boolean', 'extractor', 'combined', 'multiple', 'alias');
        # 'convert' names the value normalization; 'field' maps a sort name to
        # the key(s) actually read from the format dict.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
            'quality': {'convert': 'float_none', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'ignore', 'field': 'source_preference'},

            # 'combined' fields compare against several format keys at once;
            # 'multiple' reduces several keys through 'function' (min here).
            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},

            # Most of these exist only for compatibility reasons
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'language_preference': {'type': 'alias', 'field': 'lang'},  # not named as 'language' because such a field exists
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
            'format_id': {'type': 'alias', 'field': 'id'},
        }

        # NOTE(review): class-level mutable list that `add_item` appends to via
        # `self._order` without ever rebinding it, so the accumulated order (and
        # the `settings` entries updated alongside it) persists across
        # FormatSort instances — TODO confirm this sharing is intended.
        _order = []

        def _get_field_setting(self, field, key):
            """Return setting *key* for *field*, lazily filling in defaults."""
            if field not in self.settings:
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            """Normalize *value* for *field* according to its 'convert' setting.

            With convertNone=True a None value is still pushed through the
            'order' conversion (it maps to the position of '' in the list).
            """
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                # Higher return value = earlier position in the order list.
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                # 'float_string': numeric strings become floats; anything else
                # permanently switches the field to string comparison.
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return value

        def evaluate_params(self, params, sort_extractor):
            """Build the effective sort order from user params and *sort_extractor*."""
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Register one concrete field; first occurrence wins.
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # forced fields first, then (unless overridden) priority fields,
            # then user prefs, then extractor prefs, then the full default.
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    field = self._get_field_setting(field, 'field')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                # 'combined' terms expand into one add_item per underlying field;
                # a ':'-separated limit is split across them unless same_limit.
                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            """Log the resolved sort order through *write_debug*."""
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))

        def _calculate_field_preference_from_value(self, format, field, type, value):
            """Turn one raw field *value* into a comparable sub-key tuple."""
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            # The leading element ranks value classes (missing < limited < numeric < string).
            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            """Fetch the value(s) for *field* from *format* and rank them."""
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                def wrapped_function(values):
                    # Reduce the non-None values through the configured function.
                    values = tuple(filter(lambda x: x is not None, values))
                    return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
                            else values[0] if values
                            else None)

                value = wrapped_function((get_value(f) for f in actual_fields))
            else:
                value = get_value(field)
            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            """Return the full sort-key tuple for *format* (mutates it to fill gaps)."""
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext']
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #     format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != "none" and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != "none" and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
1741
1742 def _sort_formats(self, formats, field_preference=[]):
1743 if not formats:
a06916d9 1744 if self.get_param('ignore_no_formats_error'):
b7da73eb 1745 return
eb8a4433 1746 raise ExtractorError('No video formats found')
1747 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1748 format_sort.evaluate_params(self._downloader.params, field_preference)
a06916d9 1749 if self.get_param('verbose', False):
0760b0a7 1750 format_sort.print_verbose_info(self._downloader.write_debug)
eb8a4433 1751 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1752
96a53167
S
1753 def _check_formats(self, formats, video_id):
1754 if formats:
1755 formats[:] = filter(
1756 lambda f: self._is_valid_url(
1757 f['url'], video_id,
1758 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1759 formats)
1760
f5bdb444
S
1761 @staticmethod
1762 def _remove_duplicate_formats(formats):
1763 format_urls = set()
1764 unique_formats = []
1765 for f in formats:
1766 if f['url'] not in format_urls:
1767 format_urls.add(f['url'])
1768 unique_formats.append(f)
1769 formats[:] = unique_formats
1770
45024183 1771 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1772 url = self._proto_relative_url(url, scheme='http:')
1773 # For now assume non HTTP(S) URLs always valid
1774 if not (url.startswith('http://') or url.startswith('https://')):
1775 return True
96a53167 1776 try:
45024183 1777 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1778 return True
8bdd16b4 1779 except ExtractorError as e:
25e911a9 1780 self.to_screen(
8bdd16b4 1781 '%s: %s URL is invalid, skipping: %s'
1782 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1783 return False
96a53167 1784
20991253 1785 def http_scheme(self):
1ede5b24 1786 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1787 return (
1788 'http:'
a06916d9 1789 if self.get_param('prefer_insecure', False)
20991253
PH
1790 else 'https:')
1791
57c7411f
PH
1792 def _proto_relative_url(self, url, scheme=None):
1793 if url is None:
1794 return url
1795 if url.startswith('//'):
1796 if scheme is None:
1797 scheme = self.http_scheme()
1798 return scheme + url
1799 else:
1800 return url
1801
4094b6e3
PH
1802 def _sleep(self, timeout, video_id, msg_template=None):
1803 if msg_template is None:
f1a9d64e 1804 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1805 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1806 self.to_screen(msg)
1807 time.sleep(timeout)
1808
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        """Download an f4m manifest and parse it into a list of format dicts.

        Returns [] when the download fails non-fatally; otherwise delegates
        parsing to _parse_f4m_formats with the same preference/quality/id args.
        """
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1826
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m manifest (ElementTree element) into format dicts.

        Handles both F4M 1.0 and 2.0 namespaces, skips DRM-protected and
        Akamai-player-verification manifests, and recurses into nested f4m /
        m3u8 manifests referenced by set-level manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            # Fall back to the 2.0 namespace when no 1.0 media nodes exist.
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* mimeType marks the whole manifest as audio-only.
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            # format_id uses the bitrate when known, else the node index.
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
1928
f983b875 1929 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1930 return {
f207019c 1931 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1932 'url': m3u8_url,
1933 'ext': ext,
1934 'protocol': 'm3u8',
37768f92 1935 'preference': preference - 100 if preference else -100,
f983b875 1936 'quality': quality,
704df56d
PH
1937 'resolution': 'multiple',
1938 'format_note': 'Quality selection URL',
16da9bbc
YCH
1939 }
1940
a0c3b2d5
F
1941 def _extract_m3u8_formats(self, *args, **kwargs):
1942 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1943 if subs:
1944 self.report_warning(bug_reports_message(
1945 "Ignoring subtitle tracks found in the HLS manifest; "
1946 "if any subtitle tracks are missing,"
1947 ))
1948 return fmts
1949
1950 def _extract_m3u8_formats_and_subtitles(
177877c5 1951 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
a0c3b2d5
F
1952 preference=None, quality=None, m3u8_id=None, note=None,
1953 errnote=None, fatal=True, live=False, data=None, headers={},
1954 query={}):
1955
dbd82a1d 1956 res = self._download_webpage_handle(
81515ad9 1957 m3u8_url, video_id,
37a3bb66 1958 note='Downloading m3u8 information' if note is None else note,
1959 errnote='Failed to download m3u8 information' if errnote is None else errnote,
7360c06f 1960 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1961
dbd82a1d 1962 if res is False:
a0c3b2d5 1963 return [], {}
cb252080 1964
dbd82a1d 1965 m3u8_doc, urlh = res
37113045 1966 m3u8_url = urlh.geturl()
9cdffeeb 1967
a0c3b2d5 1968 return self._parse_m3u8_formats_and_subtitles(
cb252080 1969 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1970 preference=preference, quality=quality, m3u8_id=m3u8_id,
1971 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1972 headers=headers, query=query, video_id=video_id)
cb252080 1973
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        """Parse an m3u8 document into (formats, subtitles).

        Media playlists are returned as a single format; master playlists are
        expanded into one format per variant stream / rendition.  Subtitles is
        a dict mapping language -> list of subtitle-info dicts.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return [], {}

        if (not self.get_param('allow_unplayable_formats')
                and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)):  # Apple FairPlay
            return [], {}

        formats = []

        subtitles = {}

        # Resolve playlist-relative URIs against the playlist URL.
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        split_discontinuity = self.get_param('hls_split_discontinuity', False)

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
                                           fatal=True, data=None, headers={}):
            # Split one media playlist into sub-format descriptors; with
            # --hls-split-discontinuity enabled, each #EXT-X-DISCONTINUITY run
            # becomes its own indexed entry.  Downloads the playlist when only
            # its URL is given.
            if not m3u8_doc:
                if not format_url:
                    return []
                res = self._download_webpage_handle(
                    format_url, video_id,
                    note=False,
                    errnote='Failed to download m3u8 playlist information',
                    fatal=fatal, data=data, headers=headers)

                if res is False:
                    return []

                m3u8_doc, urlh = res
                format_url = urlh.geturl()

            playlist_formats = []
            i = (
                0
                if split_discontinuity
                else None)
            format_info = {
                'index': i,
                'key_data': None,
                'files': [],
            }
            for line in m3u8_doc.splitlines():
                if not line.startswith('#'):
                    format_info['files'].append(line)
                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
                    i += 1
                    playlist_formats.append(format_info)
                    format_info = {
                        'index': i,
                        'url': format_url,
                        'files': [],
                    }
            playlist_formats.append(format_info)
            return playlist_formats

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is

            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)

            for format in playlist_formats:
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                format_index = format.get('index')
                if format_index:
                    format_id.append(str(format_index))
                f = {
                    'format_id': '-'.join(format_id),
                    'format_index': format_index,
                    'url': m3u8_url,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                }
                formats.append(f)

            return formats, subtitles

        # Master playlist handling below.
        groups = {}          # GROUP-ID -> list of EXT-X-MEDIA attribute dicts
        last_stream_inf = {} # attributes of the most recent EXT-X-STREAM-INF

        def extract_media(x_media_line):
            # Process one EXT-X-MEDIA line: record renditions and collect
            # subtitle tracks / audio-video media formats.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                format_id = []
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
                                                                  fatal=fatal, data=data, headers=headers)

                for format in playlist_formats:
                    format_index = format.get('index')
                    for v in (m3u8_id, group_id, name):
                        if v:
                            format_id.append(v)
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'language': media.get('LANGUAGE'),
                        'ext': ext,
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'
                    formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Non-tag line: the variant stream URI for the preceding
                # EXT-X-STREAM-INF tag.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
                                                                  fatal=fatal, data=data, headers=headers)

                for frmt in playlist_formats:
                    format_id = []
                    if m3u8_id:
                        format_id.append(m3u8_id)
                    format_index = frmt.get('index')
                    stream_name = build_stream_name()
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                # for DailyMotion
                # NOTE(review): `f` here is the last dict from the loop above;
                # if playlist_formats were ever empty this would raise
                # NameError — presumably it always has at least one entry
                # (the trailing format_info appended unconditionally).
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
704df56d 2258
a107193e
S
2259 @staticmethod
2260 def _xpath_ns(path, namespace=None):
2261 if not namespace:
2262 return path
2263 out = []
2264 for c in path.split('/'):
2265 if not c or c == '.':
2266 out.append(c)
2267 else:
2268 out.append('{%s}%s' % (namespace, c))
2269 return '/'.join(out)
2270
09f572fb 2271 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2272 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2273
995029a1
PH
2274 if smil is False:
2275 assert not fatal
2276 return []
e89a2aab 2277
17712eeb 2278 namespace = self._parse_smil_namespace(smil)
a107193e
S
2279
2280 return self._parse_smil_formats(
2281 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2282
2283 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2284 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2285 if smil is False:
2286 return {}
2287 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2288
09f572fb 2289 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2290 return self._download_xml(
2291 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2292 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2293
2294 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2295 namespace = self._parse_smil_namespace(smil)
a107193e
S
2296
2297 formats = self._parse_smil_formats(
2298 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2299 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2300
2301 video_id = os.path.splitext(url_basename(smil_url))[0]
2302 title = None
2303 description = None
647eab45 2304 upload_date = None
a107193e
S
2305 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2306 name = meta.attrib.get('name')
2307 content = meta.attrib.get('content')
2308 if not name or not content:
2309 continue
2310 if not title and name == 'title':
2311 title = content
2312 elif not description and name in ('description', 'abstract'):
2313 description = content
647eab45
S
2314 elif not upload_date and name == 'date':
2315 upload_date = unified_strdate(content)
a107193e 2316
1e5bcdec
S
2317 thumbnails = [{
2318 'id': image.get('type'),
2319 'url': image.get('src'),
2320 'width': int_or_none(image.get('width')),
2321 'height': int_or_none(image.get('height')),
2322 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2323
a107193e
S
2324 return {
2325 'id': video_id,
2326 'title': title or video_id,
2327 'description': description,
647eab45 2328 'upload_date': upload_date,
1e5bcdec 2329 'thumbnails': thumbnails,
a107193e
S
2330 'formats': formats,
2331 'subtitles': subtitles,
2332 }
2333
17712eeb
S
2334 def _parse_smil_namespace(self, smil):
2335 return self._search_regex(
2336 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2337
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from the <video>/<audio> elements of a SMIL document.

        Each medium is dispatched on its protocol/extension: RTMP, HLS (m3u8),
        HDS (f4m), DASH (mpd), MSS (.ism/Manifest) or a plain HTTP download.
        transform_rtmp_url, if given, is called as (streamer, play_path) ->
        (streamer, play_path) to post-process RTMP URLs.
        """
        # Relative media URLs are resolved against the first head/meta
        # base/httpBase, falling back to the manifest URL itself
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        # Per-protocol counters used to build unique format_ids when no
        # bitrate is available
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []  # src values already handled, to drop duplicates
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            # Both hyphenated and camelCase attribute spellings occur in the wild
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS result gets re-labelled with the SMIL
                # bitrate/dimensions, which the m3u8 itself may lack
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    # Default Adobe HDS query parameters expected by many servers
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
2432
ce00af87 2433 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2434 urls = []
a107193e
S
2435 subtitles = {}
2436 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2437 src = textstream.get('src')
d413095f 2438 if not src or src in urls:
a107193e 2439 continue
d413095f 2440 urls.append(src)
df634be2 2441 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2442 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2443 subtitles.setdefault(lang, []).append({
2444 'url': src,
2445 'ext': ext,
2446 })
2447 return subtitles
63757032 2448
47a5cb77 2449 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2450 xspf = self._download_xml(
47a5cb77 2451 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2452 'Unable to download xspf manifest', fatal=fatal)
2453 if xspf is False:
2454 return []
47a5cb77
S
2455 return self._parse_xspf(
2456 xspf, playlist_id, xspf_url=xspf_url,
2457 xspf_base_url=base_url(xspf_url))
8d6765cf 2458
47a5cb77 2459 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2460 NS_MAP = {
2461 'xspf': 'http://xspf.org/ns/0/',
2462 's1': 'http://static.streamone.nl/player/ns/0',
2463 }
2464
2465 entries = []
47a5cb77 2466 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2467 title = xpath_text(
98044462 2468 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2469 description = xpath_text(
2470 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2471 thumbnail = xpath_text(
2472 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2473 duration = float_or_none(
2474 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2475
47a5cb77
S
2476 formats = []
2477 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2478 format_url = urljoin(xspf_base_url, location.text)
2479 if not format_url:
2480 continue
2481 formats.append({
2482 'url': format_url,
2483 'manifest_url': xspf_url,
2484 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2485 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2486 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2487 })
8d6765cf
S
2488 self._sort_formats(formats)
2489
2490 entries.append({
2491 'id': playlist_id,
2492 'title': title,
2493 'description': description,
2494 'thumbnail': thumbnail,
2495 'duration': duration,
2496 'formats': formats,
2497 })
2498 return entries
2499
171e59ed
F
2500 def _extract_mpd_formats(self, *args, **kwargs):
2501 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2502 if subs:
2503 self.report_warning(bug_reports_message(
2504 "Ignoring subtitle tracks found in the DASH manifest; "
2505 "if any subtitle tracks are missing,"
2506 ))
2507 return fmts
2508
2509 def _extract_mpd_formats_and_subtitles(
2510 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2511 fatal=True, data=None, headers={}, query={}):
47a5cb77 2512 res = self._download_xml_handle(
1bac3455 2513 mpd_url, video_id,
37a3bb66 2514 note='Downloading MPD manifest' if note is None else note,
2515 errnote='Failed to download MPD manifest' if errnote is None else errnote,
7360c06f 2516 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2517 if res is False:
171e59ed 2518 return [], {}
47a5cb77 2519 mpd_doc, urlh = res
c25720ef 2520 if mpd_doc is None:
171e59ed 2521 return [], {}
02dc0a36 2522 mpd_base_url = base_url(urlh.geturl())
1bac3455 2523
171e59ed 2524 return self._parse_mpd_formats_and_subtitles(
545cc85d 2525 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2526
171e59ed
F
2527 def _parse_mpd_formats(self, *args, **kwargs):
2528 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2529 if subs:
2530 self.report_warning(bug_reports_message(
2531 "Ignoring subtitle tracks found in the DASH manifest; "
2532 "if any subtitle tracks are missing,"
2533 ))
2534 return fmts
2535
2536 def _parse_mpd_formats_and_subtitles(
2537 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2538 """
2539 Parse formats from MPD manifest.
2540 References:
2541 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2542 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2543 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2544 """
a06916d9 2545 if not self.get_param('dynamic_mpd', True):
78895bd3 2546 if mpd_doc.get('type') == 'dynamic':
171e59ed 2547 return [], {}
2d2fa82d 2548
91cb6b50 2549 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2550
2551 def _add_ns(path):
2552 return self._xpath_ns(path, namespace)
2553
675d0016 2554 def is_drm_protected(element):
2555 return element.find(_add_ns('ContentProtection')) is not None
2556
1bac3455 2557 def extract_multisegment_info(element, ms_parent_info):
2558 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2559
2560 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2561 # common attributes and elements. We will only extract relevant
2562 # for us.
2563 def extract_common(source):
2564 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2565 if segment_timeline is not None:
2566 s_e = segment_timeline.findall(_add_ns('S'))
2567 if s_e:
2568 ms_info['total_number'] = 0
2569 ms_info['s'] = []
2570 for s in s_e:
2571 r = int(s.get('r', 0))
2572 ms_info['total_number'] += 1 + r
2573 ms_info['s'].append({
2574 't': int(s.get('t', 0)),
2575 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2576 'd': int(s.attrib['d']),
2577 'r': r,
2578 })
2579 start_number = source.get('startNumber')
2580 if start_number:
2581 ms_info['start_number'] = int(start_number)
2582 timescale = source.get('timescale')
2583 if timescale:
2584 ms_info['timescale'] = int(timescale)
2585 segment_duration = source.get('duration')
2586 if segment_duration:
48504785 2587 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2588
2589 def extract_Initialization(source):
2590 initialization = source.find(_add_ns('Initialization'))
2591 if initialization is not None:
2592 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2593
f14be228 2594 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2595 if segment_list is not None:
b4c1d6e8
S
2596 extract_common(segment_list)
2597 extract_Initialization(segment_list)
f14be228 2598 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2599 if segment_urls_e:
2600 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2601 else:
f14be228 2602 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2603 if segment_template is not None:
b4c1d6e8 2604 extract_common(segment_template)
e228616c
S
2605 media = segment_template.get('media')
2606 if media:
2607 ms_info['media'] = media
1bac3455 2608 initialization = segment_template.get('initialization')
2609 if initialization:
e228616c 2610 ms_info['initialization'] = initialization
1bac3455 2611 else:
b4c1d6e8 2612 extract_Initialization(segment_template)
1bac3455 2613 return ms_info
b323e170 2614
a06916d9 2615 skip_unplayable = not self.get_param('allow_unplayable_formats')
63ad4d43 2616
1bac3455 2617 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 2618 formats = []
171e59ed 2619 subtitles = {}
f14be228 2620 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2621 period_duration = parse_duration(period.get('duration')) or mpd_duration
2622 period_ms_info = extract_multisegment_info(period, {
2623 'start_number': 1,
2624 'timescale': 1,
2625 })
f14be228 2626 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
06869367 2627 if skip_unplayable and is_drm_protected(adaptation_set):
675d0016 2628 continue
1bac3455 2629 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2630 for representation in adaptation_set.findall(_add_ns('Representation')):
06869367 2631 if skip_unplayable and is_drm_protected(representation):
675d0016 2632 continue
1bac3455 2633 representation_attrib = adaptation_set.attrib.copy()
2634 representation_attrib.update(representation.attrib)
f0948348 2635 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759 2636 mime_type = representation_attrib['mimeType']
171e59ed
F
2637 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2638
2639 if content_type in ('video', 'audio', 'text'):
1bac3455 2640 base_url = ''
2641 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 2642 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 2643 if base_url_e is not None:
2644 base_url = base_url_e.text + base_url
2645 if re.match(r'^https?://', base_url):
2646 break
bb20526b
S
2647 if mpd_base_url and not re.match(r'^https?://', base_url):
2648 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2649 mpd_base_url += '/'
1bac3455 2650 base_url = mpd_base_url + base_url
2651 representation_id = representation_attrib.get('id')
d577c796 2652 lang = representation_attrib.get('lang')
51e9094f 2653 url_el = representation.find(_add_ns('BaseURL'))
2654 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2655 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
171e59ed
F
2656 if content_type in ('video', 'audio'):
2657 f = {
2658 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2659 'manifest_url': mpd_url,
2660 'ext': mimetype2ext(mime_type),
2661 'width': int_or_none(representation_attrib.get('width')),
2662 'height': int_or_none(representation_attrib.get('height')),
2663 'tbr': float_or_none(bandwidth, 1000),
2664 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2665 'fps': int_or_none(representation_attrib.get('frameRate')),
2666 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2667 'format_note': 'DASH %s' % content_type,
2668 'filesize': filesize,
2669 'container': mimetype2ext(mime_type) + '_dash',
2670 }
2671 f.update(parse_codecs(representation_attrib.get('codecs')))
2672 elif content_type == 'text':
2673 f = {
2674 'ext': mimetype2ext(mime_type),
2675 'manifest_url': mpd_url,
2676 'filesize': filesize,
2677 }
1bac3455 2678 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2679
e228616c 2680 def prepare_template(template_name, identifiers):
eca1f0d1
S
2681 tmpl = representation_ms_info[template_name]
2682 # First of, % characters outside $...$ templates
2683 # must be escaped by doubling for proper processing
2684 # by % operator string formatting used further (see
067aa17e 2685 # https://github.com/ytdl-org/youtube-dl/issues/16867).
eca1f0d1
S
2686 t = ''
2687 in_template = False
2688 for c in tmpl:
2689 t += c
2690 if c == '$':
2691 in_template = not in_template
2692 elif c == '%' and not in_template:
2693 t += c
2694 # Next, $...$ templates are translated to their
2695 # %(...) counterparts to be used with % operator
e228616c
S
2696 t = t.replace('$RepresentationID$', representation_id)
2697 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2698 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2699 t.replace('$$', '$')
2700 return t
2701
2702 # @initialization is a regular template like @media one
2703 # so it should be handled just the same way (see
067aa17e 2704 # https://github.com/ytdl-org/youtube-dl/issues/11605)
e228616c
S
2705 if 'initialization' in representation_ms_info:
2706 initialization_template = prepare_template(
2707 'initialization',
2708 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2709 # $Time$ shall not be included for @initialization thus
2710 # only $Bandwidth$ remains
2711 ('Bandwidth', ))
2712 representation_ms_info['initialization_url'] = initialization_template % {
2713 'Bandwidth': bandwidth,
2714 }
2715
1141e910
S
2716 def location_key(location):
2717 return 'url' if re.match(r'^https?://', location) else 'path'
2718
e228616c
S
2719 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2720
2721 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2722 media_location_key = location_key(media_template)
f0948348
S
2723
2724 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2725 # can't be used at the same time
b4c1d6e8
S
2726 if '%(Number' in media_template and 's' not in representation_ms_info:
2727 segment_duration = None
c110944f 2728 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2729 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2730 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2731 representation_ms_info['fragments'] = [{
1141e910 2732 media_location_key: media_template % {
b4c1d6e8 2733 'Number': segment_number,
e228616c 2734 'Bandwidth': bandwidth,
b4c1d6e8
S
2735 },
2736 'duration': segment_duration,
2737 } for segment_number in range(
2738 representation_ms_info['start_number'],
2739 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2740 else:
b4c1d6e8
S
2741 # $Number*$ or $Time$ in media template with S list available
2742 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2743 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
b4c1d6e8 2744 representation_ms_info['fragments'] = []
f0948348 2745 segment_time = 0
b4c1d6e8
S
2746 segment_d = None
2747 segment_number = representation_ms_info['start_number']
f0948348
S
2748
2749 def add_segment_url():
b4c1d6e8
S
2750 segment_url = media_template % {
2751 'Time': segment_time,
e228616c 2752 'Bandwidth': bandwidth,
b4c1d6e8
S
2753 'Number': segment_number,
2754 }
b4c1d6e8 2755 representation_ms_info['fragments'].append({
1141e910 2756 media_location_key: segment_url,
b4c1d6e8
S
2757 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2758 })
f0948348
S
2759
2760 for num, s in enumerate(representation_ms_info['s']):
2761 segment_time = s.get('t') or segment_time
b4c1d6e8 2762 segment_d = s['d']
f0948348 2763 add_segment_url()
b4c1d6e8 2764 segment_number += 1
f0948348 2765 for r in range(s.get('r', 0)):
b4c1d6e8 2766 segment_time += segment_d
f0948348 2767 add_segment_url()
b4c1d6e8
S
2768 segment_number += 1
2769 segment_time += segment_d
2770 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2771 # No media template
2772 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2773 # or any YouTube dashsegments video
2774 fragments = []
d04621da
S
2775 segment_index = 0
2776 timescale = representation_ms_info['timescale']
2777 for s in representation_ms_info['s']:
2778 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2779 for r in range(s.get('r', 0) + 1):
1141e910 2780 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2781 fragments.append({
1141e910 2782 location_key(segment_uri): segment_uri,
d04621da 2783 'duration': duration,
b4c1d6e8 2784 })
d04621da 2785 segment_index += 1
b4c1d6e8 2786 representation_ms_info['fragments'] = fragments
41bf647e
PN
2787 elif 'segment_urls' in representation_ms_info:
2788 # Segment URLs with no SegmentTimeline
2789 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
067aa17e 2790 # https://github.com/ytdl-org/youtube-dl/pull/14844
41bf647e 2791 fragments = []
603fc4e0
S
2792 segment_duration = float_or_none(
2793 representation_ms_info['segment_duration'],
2794 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2795 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2796 fragment = {
41bf647e 2797 location_key(segment_url): segment_url,
603fc4e0
S
2798 }
2799 if segment_duration:
2800 fragment['duration'] = segment_duration
2801 fragments.append(fragment)
41bf647e 2802 representation_ms_info['fragments'] = fragments
79d2077e
S
2803 # If there is a fragments key available then we correctly recognized fragmented media.
2804 # Otherwise we will assume unfragmented media with direct access. Technically, such
2805 # assumption is not necessarily correct since we may simply have no support for
2806 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
86f4d14f 2807 if 'fragments' in representation_ms_info:
1bac3455 2808 f.update({
79d2077e
S
2809 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2810 'url': mpd_url or base_url,
1141e910 2811 'fragment_base_url': base_url,
b4c1d6e8 2812 'fragments': [],
1bac3455 2813 'protocol': 'http_dash_segments',
df374b52 2814 })
1bac3455 2815 if 'initialization_url' in representation_ms_info:
e228616c 2816 initialization_url = representation_ms_info['initialization_url']
1bac3455 2817 if not f.get('url'):
2818 f['url'] = initialization_url
1141e910 2819 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2820 f['fragments'].extend(representation_ms_info['fragments'])
79d2077e
S
2821 else:
2822 # Assuming direct URL to unfragmented media.
2823 f['url'] = base_url
fd76a142
F
2824 if content_type in ('video', 'audio'):
2825 formats.append(f)
2826 elif content_type == 'text':
2827 subtitles.setdefault(lang or 'und', []).append(f)
17b598d3 2828 else:
1bac3455 2829 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
171e59ed 2830 return formats, subtitles
17b598d3 2831
fd76a142
F
2832 def _extract_ism_formats(self, *args, **kwargs):
2833 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2834 if subs:
2835 self.report_warning(bug_reports_message(
2836 "Ignoring subtitle tracks found in the ISM manifest; "
2837 "if any subtitle tracks are missing,"
2838 ))
2839 return fmts
2840
2841 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2842 res = self._download_xml_handle(
b2758123 2843 ism_url, video_id,
37a3bb66 2844 note='Downloading ISM manifest' if note is None else note,
2845 errnote='Failed to download ISM manifest' if errnote is None else errnote,
7360c06f 2846 fatal=fatal, data=data, headers=headers, query=query)
b2758123 2847 if res is False:
fd76a142 2848 return [], {}
47a5cb77 2849 ism_doc, urlh = res
13b08034 2850 if ism_doc is None:
fd76a142 2851 return [], {}
b2758123 2852
fd76a142 2853 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
b2758123 2854
fd76a142 2855 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2856 """
2857 Parse formats from ISM manifest.
2858 References:
2859 1. [MS-SSTR]: Smooth Streaming Protocol,
2860 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2861 """
06869367 2862 if ism_doc.get('IsLive') == 'TRUE':
fd76a142 2863 return [], {}
a06916d9 2864 if (not self.get_param('allow_unplayable_formats')
06869367 2865 and ism_doc.find('Protection') is not None):
fd76a142 2866 return [], {}
b2758123 2867
b2758123
RA
2868 duration = int(ism_doc.attrib['Duration'])
2869 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2870
2871 formats = []
fd76a142 2872 subtitles = {}
b2758123
RA
2873 for stream in ism_doc.findall('StreamIndex'):
2874 stream_type = stream.get('Type')
fd76a142 2875 if stream_type not in ('video', 'audio', 'text'):
b2758123
RA
2876 continue
2877 url_pattern = stream.attrib['Url']
2878 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2879 stream_name = stream.get('Name')
fd76a142 2880 stream_language = stream.get('Language', 'und')
b2758123 2881 for track in stream.findall('QualityLevel'):
e2efe599 2882 fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
b2758123 2883 # TODO: add support for WVC1 and WMAP
66a1b864 2884 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
b2758123
RA
2885 self.report_warning('%s is not a supported codec' % fourcc)
2886 continue
2887 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2888 # [1] does not mention Width and Height attributes. However,
2889 # they're often present while MaxWidth and MaxHeight are
2890 # missing, so should be used as fallbacks
2891 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2892 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2893 sampling_rate = int_or_none(track.get('SamplingRate'))
2894
2895 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2896 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2897
2898 fragments = []
2899 fragment_ctx = {
2900 'time': 0,
2901 }
2902 stream_fragments = stream.findall('c')
2903 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2904 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2905 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2906 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2907 if not fragment_ctx['duration']:
2908 try:
2909 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2910 except IndexError:
2911 next_fragment_time = duration
1616f9b4 2912 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2913 for _ in range(fragment_repeat):
2914 fragments.append({
1616f9b4 2915 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2916 'duration': fragment_ctx['duration'] / stream_timescale,
2917 })
2918 fragment_ctx['time'] += fragment_ctx['duration']
2919
2920 format_id = []
2921 if ism_id:
2922 format_id.append(ism_id)
2923 if stream_name:
2924 format_id.append(stream_name)
2925 format_id.append(compat_str(tbr))
2926
fd76a142
F
2927 if stream_type == 'text':
2928 subtitles.setdefault(stream_language, []).append({
2929 'ext': 'ismt',
2930 'protocol': 'ism',
2931 'url': ism_url,
2932 'manifest_url': ism_url,
2933 'fragments': fragments,
2934 '_download_params': {
2935 'stream_type': stream_type,
2936 'duration': duration,
2937 'timescale': stream_timescale,
2938 'fourcc': fourcc,
2939 'language': stream_language,
2940 'codec_private_data': track.get('CodecPrivateData'),
2941 }
2942 })
2943 elif stream_type in ('video', 'audio'):
2944 formats.append({
2945 'format_id': '-'.join(format_id),
2946 'url': ism_url,
2947 'manifest_url': ism_url,
2948 'ext': 'ismv' if stream_type == 'video' else 'isma',
2949 'width': width,
2950 'height': height,
2951 'tbr': tbr,
2952 'asr': sampling_rate,
2953 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2954 'acodec': 'none' if stream_type == 'video' else fourcc,
2955 'protocol': 'ism',
2956 'fragments': fragments,
2957 '_download_params': {
2958 'stream_type': stream_type,
2959 'duration': duration,
2960 'timescale': stream_timescale,
2961 'width': width or 0,
2962 'height': height or 0,
2963 'fourcc': fourcc,
2964 'language': stream_language,
2965 'codec_private_data': track.get('CodecPrivateData'),
2966 'sampling_rate': sampling_rate,
2967 'channels': int_or_none(track.get('Channels', 2)),
2968 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2969 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2970 },
2971 })
2972 return formats, subtitles
b2758123 2973
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Extract media entries from HTML5 <video>/<audio> tags in *webpage*.

        Also handles the amp-video/amp-audio and dl8-video/dl8-audio variants.
        Relative URLs are resolved against *base_url*; m3u8/mpd sources are
        expanded through the corresponding manifest extractors (non-fatal).
        Returns a list of media-info dicts, each with 'formats', 'subtitles'
        and 'thumbnail' keys.
        """
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Split a MIME type like 'video/mp4; codecs="avc1..."' into
            # {'ext': ..., 'vcodec': ..., 'acodec': ...} via parse_codecs
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifest URLs (m3u8/mpd) are
            # expanded into multiple formats, plain URLs yield a single one
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first (they have no inner content, hence '')
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # Nested <source> tags carry alternative renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to parsing '1920x1080'-style labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        # First label that parses as a bitrate wins
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # <track> tags carry subtitle/caption files
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
3096
f6a1d69a
F
3097 def _extract_akamai_formats(self, *args, **kwargs):
3098 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3099 if subs:
3100 self.report_warning(bug_reports_message(
3101 "Ignoring subtitle tracks found in the manifests; "
3102 "if any subtitle tracks are missing,"
3103 ))
3104 return fmts
3105
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        """Extract HDS/HLS (and derived progressive HTTP) formats plus
        subtitles from an Akamai streaming manifest URL.

        *hosts* may remap the 'hds', 'hls' and 'http' hostnames.
        Returns (formats, subtitles).
        """
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            # Strip bitrate-filter / attribute switches from unsigned URLs
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        # Derive the HDS (f4m) manifest URL from the HLS one (/i/ -> /z/)
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # hdcore param must also be appended to each fragment URL
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        # Derive the HLS master playlist URL (/z/ -> /i/)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        http_host = hosts.get('http')
        # Progressive HTTP variants can only be derived for unsigned URLs
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            # The .csmil URL's middle group lists one quality token per variant
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # +1 accounts for a possible audio-only rendition in the playlist
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        # i indexes qualities and only advances for video formats
                        i += 1

        return formats, subtitles
c7c43a93 3163
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe a Wowza streaming URL for HLS/HDS/DASH/SMIL/RTMP/RTSP formats.

        *skip_protocols* lists protocol ids ('m3u8', 'f4m', 'dash', 'smil')
        to leave out.  All manifest fetches are non-fatal.
        """
        query = compat_urlparse.urlparse(url).query
        # Strip any trailing manifest filename to get the stream base URL
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Keep the secure suffix ('s') if the original scheme had one
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            # Append a manifest filename, preserving the original query string
            m_url = '%s/%s' % (http_base_url, manifest)
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            # SMIL-style URLs: derive RTSP variants from the RTMP formats
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            # No SMIL marker: offer plain rtmp/rtsp URLs directly
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
3216
c73e330e 3217 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3218 mobj = re.search(
ac9c69ac 3219 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3220 webpage)
3221 if mobj:
c73e330e
RU
3222 try:
3223 jwplayer_data = self._parse_json(mobj.group('options'),
3224 video_id=video_id,
3225 transform_source=transform_source)
3226 except ExtractorError:
3227 pass
3228 else:
3229 if isinstance(jwplayer_data, dict):
3230 return jwplayer_data
a4a554a7
YCH
3231
3232 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3233 jwplayer_data = self._find_jwplayer_data(
3234 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3235 return self._parse_jwplayer_data(
3236 jwplayer_data, video_id, *args, **kwargs)
3237
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a jwplayer setup dict into one info dict or a playlist result.

        Normalizes the several legacy JWPlayer config shapes (flattened
        playlist / single item / flattened sources), extracts formats via
        _parse_jwplayer_formats and caption/subtitle tracks.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Only caption/subtitle tracks; skip chapters, thumbnails etc.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            # A lone YouTube URL is delegated to the YouTube extractor
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3305
ed0cf9b3
S
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into yt-dlp format dicts.

        Handles HLS/DASH/SMIL manifests, audio-only sources, progressive
        HTTP and RTMP URLs; duplicate source URLs are processed only once.
        """
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    # Split 'rtmp://host/app/mp4:path' into URL and play path
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3369
f4b1c7ad
PH
3370 def _live_title(self, name):
3371 """ Generate the title for a live video """
3372 now = datetime.datetime.now()
611c1dd9 3373 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3374 return name + ' ' + now_str
3375
b14f3a4c
PH
3376 def _int(self, v, name, fatal=False, **kwargs):
3377 res = int_or_none(v, **kwargs)
3378 if 'get_attr' in kwargs:
3379 print(getattr(v, kwargs['get_attr']))
3380 if res is None:
3381 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3382 if fatal:
3383 raise ExtractorError(msg)
3384 else:
6a39ee13 3385 self.report_warning(msg)
b14f3a4c
PH
3386 return res
3387
3388 def _float(self, v, name, fatal=False, **kwargs):
3389 res = float_or_none(v, **kwargs)
3390 if res is None:
3391 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3392 if fatal:
3393 raise ExtractorError(msg)
3394 else:
6a39ee13 3395 self.report_warning(msg)
b14f3a4c
PH
3396 return res
3397
40e41780
TF
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        # Build a version-0 Cookie and store it in the downloader's cookiejar.
        # Positional ctor args: version, name, value, port, port_specified,
        # domain, domain_specified (always True here), domain_initial_dot
        # (derived from a leading '.'), path, path_specified, secure,
        # expires, discard, comment, comment_url, rest.
        cookie = compat_cookiejar_Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self._downloader.cookiejar.set_cookie(cookie)
3405
    def _get_cookies(self, url):
        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
        # Let the cookiejar fill in the Cookie header for a dummy request,
        # then parse that header back into a SimpleCookie
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
799207e8 3411
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            if sys.version_info[0] >= 3:
                # On Python 3 header values are decoded as latin-1; round-trip
                # through iso-8859-1 bytes to recover the original UTF-8 text
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            # Grab the FIRST '<cookie>=<value>; ... Domain=<domain>' occurrence
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                # Re-set the first occurrence so it overrides later duplicates
                self._set_cookie(domain, cookie, value)
                break
3438
05900629
PH
3439 def get_testcases(self, include_onlymatching=False):
3440 t = getattr(self, '_TEST', None)
3441 if t:
3442 assert not hasattr(self, '_TESTS'), \
3443 '%s has _TEST and _TESTS' % type(self).__name__
3444 tests = [t]
3445 else:
3446 tests = getattr(self, '_TESTS', [])
3447 for t in tests:
3448 if not include_onlymatching and t.get('only_matching', False):
3449 continue
3450 t['name'] = type(self).__name__[:-len('IE')]
3451 yield t
3452
3453 def is_suitable(self, age_limit):
3454 """ Test whether the extractor is generally suitable for the given
3455 age limit (i.e. pornographic sites are not, all others usually are) """
3456
3457 any_restricted = False
3458 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3459 if tc.get('playlist', []):
05900629
PH
3460 tc = tc['playlist'][0]
3461 is_restricted = age_restricted(
3462 tc.get('info_dict', {}).get('age_limit'), age_limit)
3463 if not is_restricted:
3464 return True
3465 any_restricted = any_restricted or is_restricted
3466 return not any_restricted
3467
a504ced0 3468 def extract_subtitles(self, *args, **kwargs):
a06916d9 3469 if (self.get_param('writesubtitles', False)
3470 or self.get_param('listsubtitles')):
9868ea49
JMF
3471 return self._get_subtitles(*args, **kwargs)
3472 return {}
a504ced0
JMF
3473
    def _get_subtitles(self, *args, **kwargs):
        # Abstract hook: subclasses that support subtitles must override this
        raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3476
912e0b7e
YCH
3477 @staticmethod
3478 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3479 """ Merge subtitle items for one language. Items with duplicated URLs
3480 will be dropped. """
3481 list1_urls = set([item['url'] for item in subtitle_list1])
3482 ret = list(subtitle_list1)
3483 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3484 return ret
3485
    @classmethod
    def _merge_subtitles(cls, *dicts, **kwargs):
        """ Merge subtitle dictionaries, language by language. """

        target = (lambda target=None: target)(**kwargs)
        # The above lambda extracts the keyword argument 'target' from kwargs
        # while ensuring there are no stray ones. When Python 2 support
        # is dropped, remove it and change the function signature to:
        #
        #     def _merge_subtitles(cls, *dicts, target=None):

        if target is None:
            target = {}
        # Merge per language so duplicate URLs within a language are dropped
        # (see _merge_subtitle_items); *target* is mutated and returned.
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target
912e0b7e 3503
360e1ca5 3504 def extract_automatic_captions(self, *args, **kwargs):
a06916d9 3505 if (self.get_param('writeautomaticsub', False)
3506 or self.get_param('listsubtitles')):
9868ea49
JMF
3507 return self._get_automatic_captions(*args, **kwargs)
3508 return {}
360e1ca5
JMF
3509
    def _get_automatic_captions(self, *args, **kwargs):
        # Abstract hook: subclasses supporting automatic captions override this
        raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3512
d77ab8e2 3513 def mark_watched(self, *args, **kwargs):
a06916d9 3514 if (self.get_param('mark_watched', False)
3089bc74 3515 and (self._get_login_info()[0] is not None
a06916d9 3516 or self.get_param('cookiefile') is not None)):
d77ab8e2
S
3517 self._mark_watched(*args, **kwargs)
3518
    def _mark_watched(self, *args, **kwargs):
        # Abstract hook: subclasses supporting watch-marking override this
        raise NotImplementedError('This method must be implemented by subclasses')
3521
38cce791
YCH
3522 def geo_verification_headers(self):
3523 headers = {}
a06916d9 3524 geo_verification_proxy = self.get_param('geo_verification_proxy')
38cce791
YCH
3525 if geo_verification_proxy:
3526 headers['Ytdl-request-proxy'] = geo_verification_proxy
3527 return headers
3528
98763ee3
YCH
3529 def _generic_id(self, url):
3530 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3531
3532 def _generic_title(self, url):
3533 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3534
c224251a 3535 @staticmethod
b0089e89 3536 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
c224251a
M
3537 all_known = all(map(
3538 lambda x: x is not None,
3539 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3540 return (
3541 'private' if is_private
3542 else 'premium_only' if needs_premium
3543 else 'subscriber_only' if needs_subscription
3544 else 'needs_auth' if needs_auth
3545 else 'unlisted' if is_unlisted
3546 else 'public' if all_known
3547 else None)
3548
8dbe9899 3549
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (-> 1 result), a positive number, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        # A search extractor matches exactly its own '<key>N:query' scheme
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        # Split the pseudo-URL into the result-count prefix and the query text
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare '<key>:query' returns a single result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum rather than failing
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Abstract hook: concrete search extractors must override this
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY