]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
Release 2021.03.01
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
fd475508 1# coding: utf-8
6a3828fd 2from __future__ import unicode_literals
f1a9d64e 3
d6983cb4 4import base64
f4b1c7ad 5import datetime
3ec05685 6import hashlib
3d3538e4 7import json
4094b6e3 8import netrc
d6983cb4 9import os
773f291d 10import random
d6983cb4
PH
11import re
12import socket
f8c7bed1 13import ssl
d6983cb4 14import sys
4094b6e3 15import time
1bac3455 16import math
d6983cb4 17
8c25f81b 18from ..compat import (
6c22cee6 19 compat_cookiejar_Cookie,
799207e8 20 compat_cookies,
ee0ba927 21 compat_etree_Element,
e9c0cdd3 22 compat_etree_fromstring,
e64b7569 23 compat_getpass,
d391b7e2 24 compat_integer_types,
d6983cb4 25 compat_http_client,
e9c0cdd3
YCH
26 compat_os_name,
27 compat_str,
d6983cb4 28 compat_urllib_error,
98763ee3 29 compat_urllib_parse_unquote,
15707c7e 30 compat_urllib_parse_urlencode,
41d06b04 31 compat_urllib_request,
f0b5d6af 32 compat_urlparse,
e01c3d2e 33 compat_xml_parse_error,
8c25f81b 34)
eb8a4433 35from ..downloader import FileDownloader
48107c19
S
36from ..downloader.f4m import (
37 get_base_url,
38 remove_encrypted_media,
39)
8c25f81b 40from ..utils import (
c342041f 41 NO_DEFAULT,
05900629 42 age_restricted,
02dc0a36 43 base_url,
08f2a92c 44 bug_reports_message,
d6983cb4
PH
45 clean_html,
46 compiled_regex_type,
70f0f5a8 47 determine_ext,
46b18f23 48 determine_protocol,
d493f15c 49 dict_get,
9b9c5355 50 error_to_compat_str,
d6983cb4 51 ExtractorError,
46b18f23 52 extract_attributes,
97f4aecf 53 fix_xml_ampersands,
b14f3a4c 54 float_or_none,
773f291d
S
55 GeoRestrictedError,
56 GeoUtils,
31bb8d3f 57 int_or_none,
a4a554a7 58 js_to_json,
0685d972 59 JSON_LD_RE,
46b18f23
JH
60 mimetype2ext,
61 orderedSet,
d493f15c 62 parse_bitrate,
46b18f23
JH
63 parse_codecs,
64 parse_duration,
4ca2a3cf 65 parse_iso8601,
46b18f23 66 parse_m3u8_attributes,
d493f15c 67 parse_resolution,
55b3e45b 68 RegexNotFoundError,
5c2266df 69 sanitized_Request,
46b18f23 70 sanitize_filename,
d493f15c 71 str_or_none,
ce5b9040 72 str_to_int,
f856816b 73 strip_or_none,
f38de77f 74 unescapeHTML,
647eab45 75 unified_strdate,
6b3a3098 76 unified_timestamp,
46b18f23
JH
77 update_Request,
78 update_url_query,
79 urljoin,
a107193e 80 url_basename,
bebef109 81 url_or_none,
a6571f10 82 xpath_element,
8d6765cf
S
83 xpath_text,
84 xpath_with_ns,
d6983cb4 85)
c342041f 86
d6983cb4
PH
87
88class InfoExtractor(object):
89 """Information Extractor class.
90
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
5d380852 95 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
98
cf0649f8 99 The type field determines the type of the result.
fed5d032
PH
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
102
103 For a video, the dictionaries must include the following fields:
d6983cb4
PH
104
105 id: Video identifier.
d6983cb4 106 title: Video title, unescaped.
d67b0b15 107
f49d89ee 108 Additionally, it must contain either a formats entry or a url one:
d67b0b15 109
f49d89ee
PH
110 formats: A list of dictionaries for each format available, ordered
111 from worst to best quality.
112
113 Potential fields:
c790e93a
S
114 * url The mandatory URL representing the media:
115 for plain file media - HTTP URL of this file,
116 for RTMP - RTMP URL,
117 for HLS - URL of the M3U8 media playlist,
118 for HDS - URL of the F4M manifest,
79d2077e
S
119 for DASH
120 - HTTP URL to plain file media (in case of
121 unfragmented media)
122 - URL of the MPD manifest or base URL
123 representing the media if MPD manifest
8ed7a233 124 is parsed from a string (in case of
79d2077e 125 fragmented media)
c790e93a 126 for MSS - URL of the ISM manifest.
86f4d14f
S
127 * manifest_url
128 The URL of the manifest file in case of
c790e93a
S
129 fragmented media:
130 for HLS - URL of the M3U8 master playlist,
131 for HDS - URL of the F4M manifest,
132 for DASH - URL of the MPD manifest,
133 for MSS - URL of the ISM manifest.
10952eb2 134 * ext Will be calculated from URL if missing
d67b0b15
PH
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height.
138 and format_note fields if missing.
139 * format_id A short description of the format
5d4f3985
PH
140 ("mp4_h264_opus" or "19").
141 Technically optional, but strongly recommended.
d67b0b15
PH
142 * format_note Additional info about the format
143 ("3D" or "DASH video")
144 * width Width of the video, if known
145 * height Height of the video, if known
f49d89ee 146 * resolution Textual description of width and height
7217e148 147 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
148 * abr Average audio bitrate in KBit/s
149 * acodec Name of the audio codec in use
dd27fd17 150 * asr Audio sampling rate in Hertz
d67b0b15 151 * vbr Average video bitrate in KBit/s
fbb21cf5 152 * fps Frame rate
d67b0b15 153 * vcodec Name of the video codec in use
1394ce65 154 * container Name of the container format
d67b0b15 155 * filesize The number of bytes, if known in advance
9732d77e 156 * filesize_approx An estimate for the number of bytes
d67b0b15 157 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
158 * protocol The protocol that will be used for the actual
159 download, lower-case.
b04b8852 160 "http", "https", "rtsp", "rtmp", "rtmpe",
af7d5a63 161 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
162 * fragment_base_url
163 Base URL for fragments. Each fragment's path
164 value (if present) will be relative to
165 this URL.
166 * fragments A list of fragments of a fragmented media.
167 Each fragment entry must contain either an url
168 or a path. If an url is present it should be
169 considered by a client. Otherwise both path and
170 fragment_base_url must be present. Here is
171 the list of all potential fields:
172 * "url" - fragment's URL
173 * "path" - fragment's path relative to
174 fragment_base_url
a0d5077c
S
175 * "duration" (optional, int or float)
176 * "filesize" (optional, int)
f49d89ee 177 * preference Order number of this format. If this field is
08d13955 178 present and not None, the formats get sorted
38d63d84 179 by this field, regardless of all other values.
f49d89ee
PH
180 -1 for default (order by other properties),
181 -2 or smaller for less than default.
e65566a9
PH
182 < -1000 to hide the format (if there is
183 another one which is strictly better)
32f90364
PH
184 * language Language code, e.g. "de" or "en-US".
185 * language_preference Is this in the language mentioned in
186 the URL?
aff2f4f4
PH
187 10 if it's what the URL is about,
188 -1 for default (don't know),
189 -10 otherwise, other values reserved for now.
5d73273f
PH
190 * quality Order number of the video quality of this
191 format, irrespective of the file format.
192 -1 for default (order by other properties),
193 -2 or smaller for less than default.
c64ed2a3
PH
194 * source_preference Order number for this video source
195 (quality takes higher priority)
196 -1 for default (order by other properties),
197 -2 or smaller for less than default.
d769be6c
PH
198 * http_headers A dictionary of additional HTTP headers
199 to add to the request.
6271f1ca 200 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
201 video's pixels are not square.
202 width : height ratio as float.
203 * no_resume The server does not support resuming the
204 (HTTP or RTMP) download. Boolean.
00c97e3e
S
205 * downloader_options A dictionary of downloader options as
206 described in FileDownloader
3dee7826 207
c0ba0f48 208 url: Final video URL.
d6983cb4 209 ext: Video filename extension.
d67b0b15
PH
210 format: The video format, defaults to ext (used for --get-format)
211 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 212
d6983cb4
PH
213 The following fields are optional:
214
f5e43bc6 215 alt_title: A secondary title of the video.
0afef30b
PH
216 display_id An alternative identifier for the video, not necessarily
217 unique, but available before title. Typically, id is
218 something like "4234987", title "Dancing naked mole rats",
219 and display_id "dancing-naked-mole-rats"
d5519808 220 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 221 * "id" (optional, string) - Thumbnail format ID
d5519808 222 * "url"
cfb56d1a 223 * "preference" (optional, int) - quality of the image
d5519808
PH
224 * "width" (optional, int)
225 * "height" (optional, int)
5e1c39ac 226 * "resolution" (optional, string "{width}x{height}",
d5519808 227 deprecated)
2de624fd 228 * "filesize" (optional, int)
d6983cb4 229 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 230 description: Full video description.
d6983cb4 231 uploader: Full name of the video uploader.
2bc0c46f 232 license: License name the video is licensed under.
8a92e51c 233 creator: The creator of the video.
8aab976b 234 release_date: The date (YYYYMMDD) when the video was released.
955c4514 235 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 236 upload_date: Video upload date (YYYYMMDD).
955c4514 237 If not explicitly set, calculated from timestamp.
d6983cb4 238 uploader_id: Nickname or id of the video uploader.
7bcd2830 239 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 240 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 241 Note that channel fields may or may not repeat uploader
6f1f59f3
S
242 fields. This depends on a particular extractor.
243 channel_id: Id of the channel.
244 channel_url: Full URL to a channel webpage.
da9ec3b9 245 location: Physical location where the video was filmed.
a504ced0 246 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
247 {tag: subformats}. "tag" is usually a language code, and
248 "subformats" is a list sorted from lower to higher
249 preference, each element is a dictionary with the "ext"
250 entry and one of:
a504ced0 251 * "data": The subtitles file contents
10952eb2 252 * "url": A URL pointing to the subtitles file
4bba3716 253 "ext" will be calculated from URL if missing
360e1ca5
JMF
254 automatic_captions: Like 'subtitles', used by the YoutubeIE for
255 automatically generated captions
62d231c0 256 duration: Length of the video in seconds, as an integer or float.
f3d29461 257 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
258 like_count: Number of positive ratings of the video
259 dislike_count: Number of negative ratings of the video
02835c6b 260 repost_count: Number of reposts of the video
2d30521a 261 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 262 comment_count: Number of comments on the video
dd622d7c
PH
263 comments: A list of comments, each with one or more of the following
264 properties (all but one of text or html optional):
265 * "author" - human-readable name of the comment author
266 * "author_id" - user ID of the comment author
267 * "id" - Comment ID
268 * "html" - Comment as HTML
269 * "text" - Plain text of the comment
270 * "timestamp" - UNIX timestamp of comment
271 * "parent" - ID of the comment this one is replying to.
272 Set to "root" to indicate that this is a
273 comment to the original video.
8dbe9899 274 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 275 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
276 should allow to get the same result again. (It will be set
277 by YoutubeDL if it's missing)
ad3bc6ac
PH
278 categories: A list of categories that the video falls in, for example
279 ["Sports", "Berlin"]
864f24bd 280 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
281 is_live: True, False, or None (=unknown). Whether this video is a
282 live stream that goes on instead of a fixed-length video.
f76ede8e 283 was_live: True, False, or None (=unknown). Whether this video was
284 originally a live stream.
7c80519c 285 start_time: Time in seconds where the reproduction should start, as
10952eb2 286 specified in the URL.
297a564b 287 end_time: Time in seconds where the reproduction should end, as
10952eb2 288 specified in the URL.
55949fed 289 chapters: A list of dictionaries, with the following entries:
290 * "start_time" - The start time of the chapter in seconds
291 * "end_time" - The end time of the chapter in seconds
292 * "title" (optional, string)
6cfda058 293 playable_in_embed: Whether this video is allowed to play in embedded
294 players on other sites. Can be True (=always allowed),
295 False (=never allowed), None (=unknown), or a string
296 specifying the criteria for embedability (Eg: 'whitelist').
277d6ff5 297 __post_extractor: A function to be called just before the metadata is
298 written to either disk, logger or console. The function
299 must return a dict which will be added to the info_dict.
300 This is useful for additional information that is
301 time-consuming to extract. Note that the fields thus
302 extracted will not be available to output template and
303 match_filter. So, only "comments" and "comment_count" are
304 currently allowed to be extracted via this method.
d6983cb4 305
7109903e
S
306 The following fields should only be used when the video belongs to some logical
307 chapter or section:
308
309 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
310 chapter_number: Number of the chapter the video belongs to, as an integer.
311 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
312
313 The following fields should only be used when the video is an episode of some
8d76bdf1 314 series, programme or podcast:
7109903e
S
315
316 series: Title of the series or programme the video episode belongs to.
317 season: Title of the season the video episode belongs to.
27bfd4e5
S
318 season_number: Number of the season the video episode belongs to, as an integer.
319 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
320 episode: Title of the video episode. Unlike mandatory video title field,
321 this field should denote the exact title of the video episode
322 without any kind of decoration.
27bfd4e5
S
323 episode_number: Number of the video episode within a season, as an integer.
324 episode_id: Id of the video episode, as a unicode string.
7109903e 325
7a93ab5f
S
326 The following fields should only be used when the media is a track or a part of
327 a music album:
328
329 track: Title of the track.
330 track_number: Number of the track within an album or a disc, as an integer.
331 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
332 as a unicode string.
333 artist: Artist(s) of the track.
334 genre: Genre(s) of the track.
335 album: Title of the album the track belongs to.
336 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
337 album_artist: List of all artists appeared on the album (e.g.
338 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
339 and compilations).
340 disc_number: Number of the disc or other physical medium the track belongs to,
341 as an integer.
342 release_year: Year (YYYY) when the album was released.
343
deefc05b 344 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 345
d838b1bd
PH
346 Unless mentioned otherwise, None is equivalent to absence of information.
347
fed5d032
PH
348
349 _type "playlist" indicates multiple videos.
b82f815f
PH
350 There must be a key "entries", which is a list, an iterable, or a PagedList
351 object, each element of which is a valid dictionary by this specification.
fed5d032 352
b60419c5 353 Additionally, playlists can have "id", "title", and any other relevant
354 attributes with the same semantics as videos (see above).
fed5d032
PH
355
356
357 _type "multi_video" indicates that there are multiple videos that
358 form a single show, for example, multiple acts of an opera or TV episode.
359 It must have an entries key like a playlist and contain all the keys
360 required for a video at the same time.
361
362
363 _type "url" indicates that the video must be extracted from another
364 location, possibly by a different extractor. Its only required key is:
365 "url" - the next URL to extract.
f58766ce
PH
366 The key "ie_key" can be set to the class name (minus the trailing "IE",
367 e.g. "Youtube") if the extractor class is known in advance.
368 Additionally, the dictionary may have any properties of the resolved entity
369 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
370 known ahead of time.
371
372
373 _type "url_transparent" entities have the same specification as "url", but
374 indicate that the given additional information is more precise than the one
375 associated with the resolved URL.
376 This is useful when a site employs a video service that hosts the video and
377 its technical metadata, but that video service does not embed a useful
378 title, description etc.
379
380
d6983cb4
PH
381 Subclasses of this one should re-define the _real_initialize() and
382 _real_extract() methods and define a _VALID_URL regexp.
383 Probably, they should also be added to the list of extractors.
384
4248dad9 385 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
386 geo restriction bypass mechanisms for a particular extractor.
387 Though it won't disable explicit geo restriction bypass based on
504f20dd 388 country code provided with geo_bypass_country.
4248dad9
S
389
390 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
391 countries for this extractor. One of these countries will be used by
392 geo restriction bypass mechanism right away in order to bypass
504f20dd 393 geo restriction, of course, if the mechanism is not disabled.
773f291d 394
5f95927a
S
395 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
396 IP blocks in CIDR notation for this extractor. One of these IP blocks
397 will be used by geo restriction bypass mechanism similarly
504f20dd 398 to _GEO_COUNTRIES.
3ccdde8c 399
d6983cb4
PH
400 Finally, the _WORKING attribute should be set to False for broken IEs
401 in order to warn the users and skip the tests.
402 """
403
404 _ready = False
405 _downloader = None
773f291d 406 _x_forwarded_for_ip = None
4248dad9
S
407 _GEO_BYPASS = True
408 _GEO_COUNTRIES = None
5f95927a 409 _GEO_IP_BLOCKS = None
d6983cb4
PH
410 _WORKING = True
411
412 def __init__(self, downloader=None):
413 """Constructor. Receives an optional downloader."""
414 self._ready = False
773f291d 415 self._x_forwarded_for_ip = None
d6983cb4
PH
416 self.set_downloader(downloader)
417
418 @classmethod
419 def suitable(cls, url):
420 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
421
422 # This does not use has/getattr intentionally - we want to know whether
423 # we have cached the regexp for *this* class, whereas getattr would also
424 # match the superclass
425 if '_VALID_URL_RE' not in cls.__dict__:
426 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
427 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 428
ed9266db
PH
429 @classmethod
430 def _match_id(cls, url):
431 if '_VALID_URL_RE' not in cls.__dict__:
432 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
433 m = cls._VALID_URL_RE.match(url)
434 assert m
1afd0b0d 435 return compat_str(m.group('id'))
ed9266db 436
d6983cb4
PH
437 @classmethod
438 def working(cls):
439 """Getter method for _WORKING."""
440 return cls._WORKING
441
442 def initialize(self):
443 """Initializes an instance (authentication, etc)."""
5f95927a
S
444 self._initialize_geo_bypass({
445 'countries': self._GEO_COUNTRIES,
446 'ip_blocks': self._GEO_IP_BLOCKS,
447 })
4248dad9
S
448 if not self._ready:
449 self._real_initialize()
450 self._ready = True
451
5f95927a 452 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
453 """
454 Initialize geo restriction bypass mechanism.
455
456 This method is used to initialize geo bypass mechanism based on faking
457 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 458 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
459 IP will be passed as X-Forwarded-For HTTP header in all subsequent
460 HTTP requests.
e39b5d4a
S
461
462 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
463 during the instance initialization with _GEO_COUNTRIES and
464 _GEO_IP_BLOCKS.
e39b5d4a 465
5f95927a 466 You may also manually call it from extractor's code if geo bypass
e39b5d4a 467 information is not available beforehand (e.g. obtained during
5f95927a
S
468 extraction) or due to some other reason. In this case you should pass
469 this information in geo bypass context passed as first argument. It may
470 contain following fields:
471
472 countries: List of geo unrestricted countries (similar
473 to _GEO_COUNTRIES)
474 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
475 (similar to _GEO_IP_BLOCKS)
476
e39b5d4a 477 """
773f291d 478 if not self._x_forwarded_for_ip:
5f95927a
S
479
480 # Geo bypass mechanism is explicitly disabled by user
481 if not self._downloader.params.get('geo_bypass', True):
482 return
483
484 if not geo_bypass_context:
485 geo_bypass_context = {}
486
487 # Backward compatibility: previously _initialize_geo_bypass
488 # expected a list of countries, some 3rd party code may still use
489 # it this way
490 if isinstance(geo_bypass_context, (list, tuple)):
491 geo_bypass_context = {
492 'countries': geo_bypass_context,
493 }
494
495 # The whole point of geo bypass mechanism is to fake IP
496 # as X-Forwarded-For HTTP header based on some IP block or
497 # country code.
498
499 # Path 1: bypassing based on IP block in CIDR notation
500
501 # Explicit IP block specified by user, use it right away
502 # regardless of whether extractor is geo bypassable or not
503 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
504
505 # Otherwise use random IP block from geo bypass context but only
506 # if extractor is known as geo bypassable
507 if not ip_block:
508 ip_blocks = geo_bypass_context.get('ip_blocks')
509 if self._GEO_BYPASS and ip_blocks:
510 ip_block = random.choice(ip_blocks)
511
512 if ip_block:
513 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
514 if self._downloader.params.get('verbose', False):
515 self._downloader.to_screen(
516 '[debug] Using fake IP %s as X-Forwarded-For.'
517 % self._x_forwarded_for_ip)
518 return
519
520 # Path 2: bypassing based on country code
521
522 # Explicit country code specified by user, use it right away
523 # regardless of whether extractor is geo bypassable or not
524 country = self._downloader.params.get('geo_bypass_country', None)
525
526 # Otherwise use random country code from geo bypass context but
527 # only if extractor is known as geo bypassable
528 if not country:
529 countries = geo_bypass_context.get('countries')
530 if self._GEO_BYPASS and countries:
531 country = random.choice(countries)
532
533 if country:
534 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
4248dad9 535 if self._downloader.params.get('verbose', False):
6a9cb295 536 self._downloader.to_screen(
eea0716c 537 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
5f95927a 538 % (self._x_forwarded_for_ip, country.upper()))
d6983cb4
PH
539
540 def extract(self, url):
541 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 542 try:
773f291d
S
543 for _ in range(2):
544 try:
545 self.initialize()
0016b84e
S
546 ie_result = self._real_extract(url)
547 if self._x_forwarded_for_ip:
548 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
549 return ie_result
773f291d 550 except GeoRestrictedError as e:
4248dad9
S
551 if self.__maybe_fake_ip_and_retry(e.countries):
552 continue
773f291d 553 raise
3a5bcd03
PH
554 except ExtractorError:
555 raise
556 except compat_http_client.IncompleteRead as e:
dfb1b146 557 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 558 except (KeyError, StopIteration) as e:
dfb1b146 559 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 560
4248dad9 561 def __maybe_fake_ip_and_retry(self, countries):
3089bc74
S
562 if (not self._downloader.params.get('geo_bypass_country', None)
563 and self._GEO_BYPASS
564 and self._downloader.params.get('geo_bypass', True)
565 and not self._x_forwarded_for_ip
566 and countries):
eea0716c
S
567 country_code = random.choice(countries)
568 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
569 if self._x_forwarded_for_ip:
570 self.report_warning(
eea0716c
S
571 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
572 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
573 return True
574 return False
575
d6983cb4
PH
576 def set_downloader(self, downloader):
577 """Sets the downloader for this IE."""
578 self._downloader = downloader
579
580 def _real_initialize(self):
581 """Real initialization process. Redefine in subclasses."""
582 pass
583
584 def _real_extract(self, url):
585 """Real extraction process. Redefine in subclasses."""
586 pass
587
56c73665
JMF
588 @classmethod
589 def ie_key(cls):
590 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 591 return compat_str(cls.__name__[:-2])
56c73665 592
d6983cb4
PH
593 @property
594 def IE_NAME(self):
dc519b54 595 return compat_str(type(self).__name__[:-2])
d6983cb4 596
d391b7e2
S
597 @staticmethod
598 def __can_accept_status_code(err, expected_status):
599 assert isinstance(err, compat_urllib_error.HTTPError)
600 if expected_status is None:
601 return False
602 if isinstance(expected_status, compat_integer_types):
603 return err.code == expected_status
604 elif isinstance(expected_status, (list, tuple)):
605 return err.code in expected_status
606 elif callable(expected_status):
607 return expected_status(err.code) is True
608 else:
609 assert False
610
611 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
612 """
613 Return the response handle.
614
615 See _download_webpage docstring for arguments specification.
616 """
1cf376f5 617 if not self._downloader._first_webpage_request:
618 sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
619 if sleep_interval > 0:
5ef7d9bd 620 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 621 time.sleep(sleep_interval)
622 else:
623 self._downloader._first_webpage_request = False
624
d6983cb4
PH
625 if note is None:
626 self.report_download_webpage(video_id)
627 elif note is not False:
7cc3570e 628 if video_id is None:
f1a9d64e 629 self.to_screen('%s' % (note,))
7cc3570e 630 else:
f1a9d64e 631 self.to_screen('%s: %s' % (video_id, note))
2132edaa
S
632
633 # Some sites check X-Forwarded-For HTTP header in order to figure out
634 # the origin of the client behind proxy. This allows bypassing geo
635 # restriction by faking this header's value to IP that belongs to some
636 # geo unrestricted country. We will do so once we encounter any
637 # geo restriction error.
638 if self._x_forwarded_for_ip:
639 if 'X-Forwarded-For' not in headers:
640 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
641
41d06b04
S
642 if isinstance(url_or_request, compat_urllib_request.Request):
643 url_or_request = update_Request(
644 url_or_request, data=data, headers=headers, query=query)
645 else:
cdfee168 646 if query:
647 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 648 if data is not None or headers:
41d06b04 649 url_or_request = sanitized_Request(url_or_request, data, headers)
f8c7bed1
S
650 exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
651 if hasattr(ssl, 'CertificateError'):
652 exceptions.append(ssl.CertificateError)
d6983cb4 653 try:
dca08720 654 return self._downloader.urlopen(url_or_request)
f8c7bed1 655 except tuple(exceptions) as err:
d391b7e2
S
656 if isinstance(err, compat_urllib_error.HTTPError):
657 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
658 # Retain reference to error to prevent file object from
659 # being closed before it can be read. Works around the
660 # effects of <https://bugs.python.org/issue15002>
661 # introduced in Python 3.4.1.
662 err.fp._error = err
d391b7e2
S
663 return err.fp
664
aa94a6d3
PH
665 if errnote is False:
666 return False
d6983cb4 667 if errnote is None:
f1a9d64e 668 errnote = 'Unable to download webpage'
7f8b2714 669
9b9c5355 670 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
671 if fatal:
672 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
673 else:
674 self._downloader.report_warning(errmsg)
675 return False
d6983cb4 676
d391b7e2
S
677 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
678 """
679 Return a tuple (page content as string, URL handle).
680
681 See _download_webpage docstring for arguments specification.
682 """
b9d3e163
PH
683 # Strip hashes from the URL (#1038)
684 if isinstance(url_or_request, (compat_str, str)):
685 url_or_request = url_or_request.partition('#')[0]
686
d391b7e2 687 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
688 if urlh is False:
689 assert not fatal
690 return False
c9a77969 691 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
692 return (content, urlh)
693
c9a77969
YCH
694 @staticmethod
695 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
696 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
697 if m:
698 encoding = m.group(1)
699 else:
0d75ae2c 700 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
701 webpage_bytes[:1024])
702 if m:
703 encoding = m.group(1).decode('ascii')
b60016e8
PH
704 elif webpage_bytes.startswith(b'\xff\xfe'):
705 encoding = 'utf-16'
f143d86a
PH
706 else:
707 encoding = 'utf-8'
c9a77969
YCH
708
709 return encoding
710
4457823d
S
    def __check_blocked(self, content):
        """
        Raise ExtractorError if the downloaded page is a known censorship or
        filtering interstitial (Websense, Indian DoT block page, Russian RKN
        blocklist). Returns None for ordinary content.
        """
        first_block = content[:512]
        # Websense corporate filtering appliance
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian RKN blocklist page (title is in Russian: "TTK :: Access to resource restricted")
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
738
c9a77969
YCH
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read and decode the body of an open URL handle.

        prefix -- optional bytes prepended to the body before decoding
        encoding -- decoding to use; guessed from headers/content when None

        Honours the dump_intermediate_pages and write_pages downloader
        options, and raises via __check_blocked on known censorship pages.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 so binary/garbled content survives the terminal
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the filename short but unique by appending an md5 of the full name
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name -- fall back to utf-8 rather than failing
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
d6983cb4 775
d391b7e2
S
776 def _download_webpage(
777 self, url_or_request, video_id, note=None, errnote=None,
778 fatal=True, tries=1, timeout=5, encoding=None, data=None,
779 headers={}, query={}, expected_status=None):
780 """
781 Return the data of the page as a string.
782
783 Arguments:
784 url_or_request -- plain text URL as a string or
785 a compat_urllib_request.Requestobject
786 video_id -- Video/playlist/item identifier (string)
787
788 Keyword arguments:
789 note -- note printed before downloading (string)
790 errnote -- note printed in case of an error (string)
791 fatal -- flag denoting whether error should be considered fatal,
792 i.e. whether it should cause ExtractionError to be raised,
793 otherwise a warning will be reported and extraction continued
794 tries -- number of tries
795 timeout -- sleep interval between tries
796 encoding -- encoding for a page content decoding, guessed automatically
797 when not explicitly specified
798 data -- POST data (bytes)
799 headers -- HTTP headers (dict)
800 query -- URL query (dict)
801 expected_status -- allows to accept failed HTTP requests (non 2xx
802 status code) by explicitly specifying a set of accepted status
803 codes. Can be any of the following entities:
804 - an integer type specifying an exact failed status code to
805 accept
806 - a list or a tuple of integer types specifying a list of
807 failed status codes to accept
808 - a callable accepting an actual failed status code and
809 returning True if it should be accepted
810 Note that this argument does not affect success status codes (2xx)
811 which are always accepted.
812 """
813
995ad69c
TF
814 success = False
815 try_count = 0
816 while success is False:
817 try:
d391b7e2
S
818 res = self._download_webpage_handle(
819 url_or_request, video_id, note, errnote, fatal,
820 encoding=encoding, data=data, headers=headers, query=query,
821 expected_status=expected_status)
995ad69c
TF
822 success = True
823 except compat_http_client.IncompleteRead as e:
824 try_count += 1
825 if try_count >= tries:
826 raise e
827 self._sleep(timeout, video_id)
7cc3570e
PH
828 if res is False:
829 return res
830 else:
831 content, _ = res
832 return content
d6983cb4 833
e0d198c1
S
834 def _download_xml_handle(
835 self, url_or_request, video_id, note='Downloading XML',
836 errnote='Unable to download XML', transform_source=None,
d391b7e2
S
837 fatal=True, encoding=None, data=None, headers={}, query={},
838 expected_status=None):
839 """
ee0ba927 840 Return a tuple (xml as an compat_etree_Element, URL handle).
d391b7e2
S
841
842 See _download_webpage docstring for arguments specification.
843 """
e0d198c1
S
844 res = self._download_webpage_handle(
845 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
846 encoding=encoding, data=data, headers=headers, query=query,
847 expected_status=expected_status)
e0d198c1
S
848 if res is False:
849 return res
850 xml_string, urlh = res
851 return self._parse_xml(
852 xml_string, video_id, transform_source=transform_source,
853 fatal=fatal), urlh
854
d391b7e2
S
855 def _download_xml(
856 self, url_or_request, video_id,
857 note='Downloading XML', errnote='Unable to download XML',
858 transform_source=None, fatal=True, encoding=None,
859 data=None, headers={}, query={}, expected_status=None):
860 """
ee0ba927 861 Return the xml as an compat_etree_Element.
d391b7e2
S
862
863 See _download_webpage docstring for arguments specification.
864 """
e0d198c1
S
865 res = self._download_xml_handle(
866 url_or_request, video_id, note=note, errnote=errnote,
867 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
868 data=data, headers=headers, query=query,
869 expected_status=expected_status)
e0d198c1 870 return res if res is False else res[0]
e01c3d2e
S
871
872 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
e2b38da9
PH
873 if transform_source:
874 xml_string = transform_source(xml_string)
e01c3d2e
S
875 try:
876 return compat_etree_fromstring(xml_string.encode('utf-8'))
877 except compat_xml_parse_error as ve:
878 errmsg = '%s: Failed to parse XML ' % video_id
879 if fatal:
880 raise ExtractorError(errmsg, cause=ve)
881 else:
882 self.report_warning(errmsg + str(ve))
267ed0c5 883
0fe7783e
S
884 def _download_json_handle(
885 self, url_or_request, video_id, note='Downloading JSON metadata',
886 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
887 fatal=True, encoding=None, data=None, headers={}, query={},
888 expected_status=None):
889 """
890 Return a tuple (JSON object, URL handle).
891
892 See _download_webpage docstring for arguments specification.
893 """
0fe7783e 894 res = self._download_webpage_handle(
c9a77969 895 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
896 encoding=encoding, data=data, headers=headers, query=query,
897 expected_status=expected_status)
0fe7783e
S
898 if res is False:
899 return res
900 json_string, urlh = res
ebb64199 901 return self._parse_json(
0fe7783e
S
902 json_string, video_id, transform_source=transform_source,
903 fatal=fatal), urlh
904
905 def _download_json(
906 self, url_or_request, video_id, note='Downloading JSON metadata',
907 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
908 fatal=True, encoding=None, data=None, headers={}, query={},
909 expected_status=None):
910 """
911 Return the JSON object as a dict.
912
913 See _download_webpage docstring for arguments specification.
914 """
0fe7783e
S
915 res = self._download_json_handle(
916 url_or_request, video_id, note=note, errnote=errnote,
917 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
918 data=data, headers=headers, query=query,
919 expected_status=expected_status)
0fe7783e 920 return res if res is False else res[0]
ebb64199
TF
921
922 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
923 if transform_source:
924 json_string = transform_source(json_string)
3d3538e4
PH
925 try:
926 return json.loads(json_string)
927 except ValueError as ve:
e7b6d122
PH
928 errmsg = '%s: Failed to parse JSON ' % video_id
929 if fatal:
930 raise ExtractorError(errmsg, cause=ve)
931 else:
932 self.report_warning(errmsg + str(ve))
3d3538e4 933
f45f96f8 934 def report_warning(self, msg, video_id=None):
f1a9d64e 935 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 936 self._downloader.report_warning(
f1a9d64e 937 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 938
d6983cb4
PH
939 def to_screen(self, msg):
940 """Print msg to screen, prefixing it with '[ie_name]'"""
f1a9d64e 941 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
942
943 def report_extraction(self, id_or_name):
944 """Report information extraction."""
f1a9d64e 945 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
946
947 def report_download_webpage(self, video_id):
948 """Report webpage download."""
f1a9d64e 949 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
950
951 def report_age_confirmation(self):
952 """Report attempt to confirm age."""
f1a9d64e 953 self.to_screen('Confirming age')
d6983cb4 954
fc79158d
JMF
955 def report_login(self):
956 """Report attempt to log in."""
f1a9d64e 957 self.to_screen('Logging in')
fc79158d 958
43e7d3c9
S
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """Abort extraction with an expected error telling the user how to
        provide account credentials."""
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
964
c430802e 965 @staticmethod
773f291d
S
966 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
967 raise GeoRestrictedError(msg, countries=countries)
c430802e 968
5f6a1245 969 # Methods for following #608
c0d0b01f 970 @staticmethod
830d53bf 971 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 972 """Returns a URL that points to a page that should be processed"""
5f6a1245 973 # TODO: ie should be the class used for getting the info
d6983cb4
PH
974 video_info = {'_type': 'url',
975 'url': url,
976 'ie_key': ie}
7012b23c
PH
977 if video_id is not None:
978 video_info['id'] = video_id
830d53bf
S
979 if video_title is not None:
980 video_info['title'] = video_title
d6983cb4 981 return video_info
5f6a1245 982
749ca5ec
S
983 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
984 urls = orderedSet(
46b18f23
JH
985 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
986 for m in matches)
987 return self.playlist_result(
749ca5ec 988 urls, playlist_id=playlist_id, playlist_title=playlist_title)
46b18f23 989
c0d0b01f 990 @staticmethod
b60419c5 991 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
d6983cb4
PH
992 """Returns a playlist"""
993 video_info = {'_type': 'playlist',
994 'entries': entries}
b60419c5 995 video_info.update(kwargs)
d6983cb4
PH
996 if playlist_id:
997 video_info['id'] = playlist_id
998 if playlist_title:
999 video_info['title'] = playlist_title
ecc97af3 1000 if playlist_description is not None:
acf5cbfe 1001 video_info['description'] = playlist_description
d6983cb4
PH
1002 return video_info
1003
c342041f 1004 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1005 """
1006 Perform a regex search on the given string, using a single or a list of
1007 patterns returning the first matching group.
1008 In case of failure return a default value or raise a WARNING or a
55b3e45b 1009 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4
PH
1010 """
1011 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1012 mobj = re.search(pattern, string, flags)
1013 else:
1014 for p in pattern:
1015 mobj = re.search(p, string, flags)
c3415d1b
PH
1016 if mobj:
1017 break
d6983cb4 1018
e9c0cdd3 1019 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
f1a9d64e 1020 _name = '\033[0;34m%s\033[0m' % name
d6983cb4
PH
1021 else:
1022 _name = name
1023
1024 if mobj:
711ede6e
PH
1025 if group is None:
1026 # return the first matching group
1027 return next(g for g in mobj.groups() if g is not None)
1028 else:
1029 return mobj.group(group)
c342041f 1030 elif default is not NO_DEFAULT:
d6983cb4
PH
1031 return default
1032 elif fatal:
f1a9d64e 1033 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 1034 else:
08f2a92c 1035 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
1036 return None
1037
c342041f 1038 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1039 """
1040 Like _search_regex, but strips HTML tags and unescapes entities.
1041 """
711ede6e 1042 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1043 if res:
1044 return clean_html(res).strip()
1045 else:
1046 return res
1047
2118fdd1
RA
1048 def _get_netrc_login_info(self, netrc_machine=None):
1049 username = None
1050 password = None
1051 netrc_machine = netrc_machine or self._NETRC_MACHINE
1052
1053 if self._downloader.params.get('usenetrc', False):
1054 try:
1055 info = netrc.netrc().authenticators(netrc_machine)
1056 if info is not None:
1057 username = info[0]
1058 password = info[2]
1059 else:
dcce092e
S
1060 raise netrc.NetrcParseError(
1061 'No authenticators for %s' % netrc_machine)
2118fdd1 1062 except (IOError, netrc.NetrcParseError) as err:
dcce092e
S
1063 self._downloader.report_warning(
1064 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1065
dcce092e 1066 return username, password
2118fdd1 1067
1b6712ab 1068 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
fc79158d 1069 """
cf0649f8 1070 Get the login info as (username, password)
32443dd3
S
1071 First look for the manually specified credentials using username_option
1072 and password_option as keys in params dictionary. If no such credentials
1073 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1074 value.
fc79158d
JMF
1075 If there's no info available, return (None, None)
1076 """
1077 if self._downloader is None:
1078 return (None, None)
1079
fc79158d
JMF
1080 downloader_params = self._downloader.params
1081
1082 # Attempt to use provided username and password or .netrc data
1b6712ab
RA
1083 if downloader_params.get(username_option) is not None:
1084 username = downloader_params[username_option]
1085 password = downloader_params[password_option]
2118fdd1 1086 else:
1b6712ab 1087 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1088
2133565c 1089 return username, password
fc79158d 1090
e64b7569 1091 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1092 """
1093 Get the two-factor authentication info
1094 TODO - asking the user will be required for sms/phone verify
1095 currently just uses the command line option
1096 If there's no info available, return None
1097 """
1098 if self._downloader is None:
83317f69 1099 return None
1100 downloader_params = self._downloader.params
1101
d800609c 1102 if downloader_params.get('twofactor') is not None:
83317f69 1103 return downloader_params['twofactor']
1104
e64b7569 1105 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1106
46720279
JMF
1107 # Helper functions for extracting OpenGraph info
1108 @staticmethod
ab2d5247 1109 def _og_regexes(prop):
448ef1f3 1110 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
22f5f5c6 1111 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
7a6d76a6 1112 % {'prop': re.escape(prop)})
78fb87b2 1113 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1114 return [
78fb87b2
JMF
1115 template % (property_re, content_re),
1116 template % (content_re, property_re),
ab2d5247 1117 ]
46720279 1118
864f24bd
S
    @staticmethod
    def _meta_regex(prop):
        """Return a verbose regex matching a <meta> tag for prop (matched via
        itemprop/name/property/id/http-equiv), capturing the content attribute
        in the 'content' named group."""
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1124
3c4e6d83 1125 def _og_search_property(self, prop, html, name=None, **kargs):
b070564e
S
1126 if not isinstance(prop, (list, tuple)):
1127 prop = [prop]
46720279 1128 if name is None:
b070564e
S
1129 name = 'OpenGraph %s' % prop[0]
1130 og_regexes = []
1131 for p in prop:
1132 og_regexes.extend(self._og_regexes(p))
1133 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1134 if escaped is None:
1135 return None
1136 return unescapeHTML(escaped)
46720279
JMF
1137
1138 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1139 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1140
1141 def _og_search_description(self, html, **kargs):
1142 return self._og_search_property('description', html, fatal=False, **kargs)
1143
1144 def _og_search_title(self, html, **kargs):
1145 return self._og_search_property('title', html, **kargs)
1146
8ffa13e0 1147 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1148 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1149 if secure:
1150 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1151 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1152
78338f71
JMF
1153 def _og_search_url(self, html, **kargs):
1154 return self._og_search_property('url', html, **kargs)
1155
40c696e5 1156 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
88d9f6c0
S
1157 if not isinstance(name, (list, tuple)):
1158 name = [name]
59040888 1159 if display_name is None:
88d9f6c0 1160 display_name = name[0]
59040888 1161 return self._html_search_regex(
88d9f6c0 1162 [self._meta_regex(n) for n in name],
711ede6e 1163 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1164
1165 def _dc_search_uploader(self, html):
1166 return self._html_search_meta('dc.creator', html, 'uploader')
1167
8dbe9899
PH
1168 def _rta_search(self, html):
1169 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1170 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1171 r' content="RTA-5042-1996-1400-1577-RTA"',
1172 html):
1173 return 18
1174 return 0
1175
59040888
PH
1176 def _media_rating_search(self, html):
1177 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1178 rating = self._html_search_meta('rating', html)
1179
1180 if not rating:
1181 return None
1182
1183 RATING_TABLE = {
1184 'safe for kids': 0,
1185 'general': 8,
1186 '14 years': 14,
1187 'mature': 17,
1188 'restricted': 19,
1189 }
d800609c 1190 return RATING_TABLE.get(rating.lower())
59040888 1191
69319969 1192 def _family_friendly_search(self, html):
6ca7732d 1193 # See http://schema.org/VideoObject
ac8491fc
S
1194 family_friendly = self._html_search_meta(
1195 'isFamilyFriendly', html, default=None)
69319969
NJ
1196
1197 if not family_friendly:
1198 return None
1199
1200 RATING_TABLE = {
1201 '1': 0,
1202 'true': 0,
1203 '0': 18,
1204 'false': 18,
1205 }
d800609c 1206 return RATING_TABLE.get(family_friendly.lower())
69319969 1207
0c708f11
JMF
1208 def _twitter_search_player(self, html):
1209 return self._html_search_meta('twitter:player', html,
9e1a5b84 1210 'twitter card player')
0c708f11 1211
95b31e26 1212 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
4433bb02 1213 json_ld_list = list(re.finditer(JSON_LD_RE, html))
321b5e08 1214 default = kwargs.get('default', NO_DEFAULT)
321b5e08
S
1215 # JSON-LD may be malformed and thus `fatal` should be respected.
1216 # At the same time `default` may be passed that assumes `fatal=False`
1217 # for _search_regex. Let's simulate the same behavior here as well.
1218 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
4433bb02
S
1219 json_ld = []
1220 for mobj in json_ld_list:
1221 json_ld_item = self._parse_json(
1222 mobj.group('json_ld'), video_id, fatal=fatal)
1223 if not json_ld_item:
1224 continue
1225 if isinstance(json_ld_item, dict):
1226 json_ld.append(json_ld_item)
1227 elif isinstance(json_ld_item, (list, tuple)):
1228 json_ld.extend(json_ld_item)
1229 if json_ld:
1230 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1231 if json_ld:
1232 return json_ld
1233 if default is not NO_DEFAULT:
1234 return default
1235 elif fatal:
1236 raise RegexNotFoundError('Unable to extract JSON-LD')
1237 else:
1238 self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1239 return {}
4ca2a3cf 1240
95b31e26 1241 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
4ca2a3cf
S
1242 if isinstance(json_ld, compat_str):
1243 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1244 if not json_ld:
1245 return {}
1246 info = {}
46933a15
S
1247 if not isinstance(json_ld, (list, tuple, dict)):
1248 return info
1249 if isinstance(json_ld, dict):
1250 json_ld = [json_ld]
bae14048 1251
e7e4a6e0
S
1252 INTERACTION_TYPE_MAP = {
1253 'CommentAction': 'comment',
1254 'AgreeAction': 'like',
1255 'DisagreeAction': 'dislike',
1256 'LikeAction': 'like',
1257 'DislikeAction': 'dislike',
1258 'ListenAction': 'view',
1259 'WatchAction': 'view',
1260 'ViewAction': 'view',
1261 }
1262
29f7c58a 1263 def extract_interaction_type(e):
1264 interaction_type = e.get('interactionType')
1265 if isinstance(interaction_type, dict):
1266 interaction_type = interaction_type.get('@type')
1267 return str_or_none(interaction_type)
1268
e7e4a6e0
S
1269 def extract_interaction_statistic(e):
1270 interaction_statistic = e.get('interactionStatistic')
29f7c58a 1271 if isinstance(interaction_statistic, dict):
1272 interaction_statistic = [interaction_statistic]
e7e4a6e0
S
1273 if not isinstance(interaction_statistic, list):
1274 return
1275 for is_e in interaction_statistic:
1276 if not isinstance(is_e, dict):
1277 continue
1278 if is_e.get('@type') != 'InteractionCounter':
1279 continue
29f7c58a 1280 interaction_type = extract_interaction_type(is_e)
1281 if not interaction_type:
e7e4a6e0 1282 continue
ce5b9040
S
1283 # For interaction count some sites provide string instead of
1284 # an integer (as per spec) with non digit characters (e.g. ",")
1285 # so extracting count with more relaxed str_to_int
1286 interaction_count = str_to_int(is_e.get('userInteractionCount'))
e7e4a6e0
S
1287 if interaction_count is None:
1288 continue
1289 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1290 if not count_kind:
1291 continue
1292 count_key = '%s_count' % count_kind
1293 if info.get(count_key) is not None:
1294 continue
1295 info[count_key] = interaction_count
1296
bae14048
S
1297 def extract_video_object(e):
1298 assert e['@type'] == 'VideoObject'
1299 info.update({
bebef109 1300 'url': url_or_none(e.get('contentUrl')),
bae14048
S
1301 'title': unescapeHTML(e.get('name')),
1302 'description': unescapeHTML(e.get('description')),
bebef109 1303 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
bae14048
S
1304 'duration': parse_duration(e.get('duration')),
1305 'timestamp': unified_timestamp(e.get('uploadDate')),
ad06b99d 1306 'uploader': str_or_none(e.get('author')),
bae14048
S
1307 'filesize': float_or_none(e.get('contentSize')),
1308 'tbr': int_or_none(e.get('bitrate')),
1309 'width': int_or_none(e.get('width')),
1310 'height': int_or_none(e.get('height')),
33a81c2c 1311 'view_count': int_or_none(e.get('interactionCount')),
bae14048 1312 })
e7e4a6e0 1313 extract_interaction_statistic(e)
bae14048 1314
46933a15 1315 for e in json_ld:
4433bb02 1316 if '@context' in e:
46933a15
S
1317 item_type = e.get('@type')
1318 if expected_type is not None and expected_type != item_type:
4433bb02 1319 continue
c69701c6 1320 if item_type in ('TVEpisode', 'Episode'):
440863ad 1321 episode_name = unescapeHTML(e.get('name'))
46933a15 1322 info.update({
440863ad 1323 'episode': episode_name,
46933a15
S
1324 'episode_number': int_or_none(e.get('episodeNumber')),
1325 'description': unescapeHTML(e.get('description')),
1326 })
440863ad
S
1327 if not info.get('title') and episode_name:
1328 info['title'] = episode_name
46933a15 1329 part_of_season = e.get('partOfSeason')
c69701c6 1330 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
458fd30f
S
1331 info.update({
1332 'season': unescapeHTML(part_of_season.get('name')),
1333 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1334 })
d16b3c66 1335 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
c69701c6 1336 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1337 info['series'] = unescapeHTML(part_of_series.get('name'))
391256dc
S
1338 elif item_type == 'Movie':
1339 info.update({
1340 'title': unescapeHTML(e.get('name')),
1341 'description': unescapeHTML(e.get('description')),
1342 'duration': parse_duration(e.get('duration')),
1343 'timestamp': unified_timestamp(e.get('dateCreated')),
1344 })
3931b845 1345 elif item_type in ('Article', 'NewsArticle'):
46933a15
S
1346 info.update({
1347 'timestamp': parse_iso8601(e.get('datePublished')),
1348 'title': unescapeHTML(e.get('headline')),
1349 'description': unescapeHTML(e.get('articleBody')),
1350 })
1351 elif item_type == 'VideoObject':
bae14048 1352 extract_video_object(e)
4433bb02
S
1353 if expected_type is None:
1354 continue
1355 else:
1356 break
c69701c6
S
1357 video = e.get('video')
1358 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1359 extract_video_object(video)
4433bb02
S
1360 if expected_type is None:
1361 continue
1362 else:
1363 break
4ca2a3cf
S
1364 return dict((k, v) for k, v in info.items() if v is not None)
1365
27713812 1366 @staticmethod
f8da79f8 1367 def _hidden_inputs(html):
586f1cc5 1368 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1369 hidden_inputs = {}
c8498368
S
1370 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1371 attrs = extract_attributes(input)
1372 if not input:
201ea3ee 1373 continue
c8498368 1374 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1375 continue
c8498368
S
1376 name = attrs.get('name') or attrs.get('id')
1377 value = attrs.get('value')
1378 if name and value is not None:
1379 hidden_inputs[name] = value
201ea3ee 1380 return hidden_inputs
27713812 1381
cf61d96d
S
1382 def _form_hidden_inputs(self, form_id, html):
1383 form = self._search_regex(
73eb13df 1384 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1385 html, '%s form' % form_id, group='form')
1386 return self._hidden_inputs(form)
1387
eb8a4433 1388 class FormatSort:
1389 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
1390
c10d0213 1391 default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
155d2b48 1392 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
c10d0213 1393 'proto', 'ext', 'has_audio', 'source', 'format_id') # These must not be aliases
eb8a4433 1394
1395 settings = {
1396 'vcodec': {'type': 'ordered', 'regex': True,
155d2b48 1397 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
eb8a4433 1398 'acodec': {'type': 'ordered', 'regex': True,
1399 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
f137c99e 1400 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
63be1aab 1401 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
eb8a4433 1402 'vext': {'type': 'ordered', 'field': 'video_ext',
91ebc640 1403 'order': ('mp4', 'webm', 'flv', '', 'none'),
eb8a4433 1404 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1405 'aext': {'type': 'ordered', 'field': 'audio_ext',
1406 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1407 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1408 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
f983b875 1409 'ie_pref': {'priority': True, 'type': 'extractor'},
63be1aab 1410 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1411 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
f983b875 1412 'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1413 'quality': {'convert': 'float_none', 'type': 'extractor'},
eb8a4433 1414 'filesize': {'convert': 'bytes'},
f137c99e 1415 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1416 'id': {'convert': 'string', 'field': 'format_id'},
eb8a4433 1417 'height': {'convert': 'float_none'},
1418 'width': {'convert': 'float_none'},
1419 'fps': {'convert': 'float_none'},
1420 'tbr': {'convert': 'float_none'},
1421 'vbr': {'convert': 'float_none'},
1422 'abr': {'convert': 'float_none'},
1423 'asr': {'convert': 'float_none'},
f983b875 1424 'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
63be1aab 1425
eb8a4433 1426 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
63be1aab 1427 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1428 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1429 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1430 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1431
1432 # Most of these exist only for compatibility reasons
1433 'dimension': {'type': 'alias', 'field': 'res'},
1434 'resolution': {'type': 'alias', 'field': 'res'},
1435 'extension': {'type': 'alias', 'field': 'ext'},
1436 'bitrate': {'type': 'alias', 'field': 'br'},
eb8a4433 1437 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1438 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1439 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1440 'framerate': {'type': 'alias', 'field': 'fps'},
63be1aab 1441 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1442 'protocol': {'type': 'alias', 'field': 'proto'},
1443 'source_preference': {'type': 'alias', 'field': 'source'},
1444 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1445 'filesize_estimate': {'type': 'alias', 'field': 'size'},
eb8a4433 1446 'samplerate': {'type': 'alias', 'field': 'asr'},
1447 'video_ext': {'type': 'alias', 'field': 'vext'},
1448 'audio_ext': {'type': 'alias', 'field': 'aext'},
1449 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1450 'audio_codec': {'type': 'alias', 'field': 'acodec'},
63be1aab 1451 'video': {'type': 'alias', 'field': 'hasvid'},
1452 'has_video': {'type': 'alias', 'field': 'hasvid'},
1453 'audio': {'type': 'alias', 'field': 'hasaud'},
1454 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1455 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1456 'preference': {'type': 'alias', 'field': 'ie_pref'},
1457 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1458 'format_id': {'type': 'alias', 'field': 'id'},
1459 }
eb8a4433 1460
1461 _order = []
1462
1463 def _get_field_setting(self, field, key):
1464 if field not in self.settings:
1465 self.settings[field] = {}
1466 propObj = self.settings[field]
1467 if key not in propObj:
1468 type = propObj.get('type')
1469 if key == 'field':
1470 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1471 elif key == 'convert':
1472 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
4bcc7bd1 1473 else:
eb8a4433 1474 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1475 propObj[key] = default
1476 return propObj[key]
1477
1478 def _resolve_field_value(self, field, value, convertNone=False):
1479 if value is None:
1480 if not convertNone:
1481 return None
4bcc7bd1 1482 else:
eb8a4433 1483 value = value.lower()
1484 conversion = self._get_field_setting(field, 'convert')
1485 if conversion == 'ignore':
1486 return None
1487 if conversion == 'string':
1488 return value
1489 elif conversion == 'float_none':
1490 return float_or_none(value)
1491 elif conversion == 'bytes':
1492 return FileDownloader.parse_bytes(value)
1493 elif conversion == 'order':
da9be05e 1494 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
eb8a4433 1495 use_regex = self._get_field_setting(field, 'regex')
1496 list_length = len(order_list)
1497 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1498 if use_regex and value is not None:
da9be05e 1499 for i, regex in enumerate(order_list):
eb8a4433 1500 if regex and re.match(regex, value):
1501 return list_length - i
1502 return list_length - empty_pos # not in list
1503 else: # not regex or value = None
1504 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1505 else:
1506 if value.isnumeric():
1507 return float(value)
4bcc7bd1 1508 else:
eb8a4433 1509 self.settings[field]['convert'] = 'string'
1510 return value
1511
    def evaluate_params(self, params, sort_extractor):
        """Build self._order (the effective list of sort fields) from, in
        priority order: forced defaults, optionally prioritized defaults,
        the user's --format-sort, the extractor-supplied order, and finally
        the remaining defaults. Also records per-field reverse/closest/limit
        settings into self.settings.
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one sort field; the first occurrence of a field wins.
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            # self.regex is expected to expose 'field', 'reverse',
            # 'seperator' and 'limit' named groups
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            # resolve aliases (e.g. 'framerate' -> 'fps') to the real field
            if self._get_field_setting(field, 'type') == 'alias':
                field = self._get_field_setting(field, 'field')
            reverse = match.group('reverse') is not None
            closest = match.group('seperator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # a 'combined' field expands into its constituent fields, with
            # either one shared limit or per-field colon-separated limits
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)
1564
1565 def print_verbose_info(self, to_screen):
1566 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1567 if self._sort_extractor:
f983b875 1568 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
eb8a4433 1569 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1570 '+' if self._get_field_setting(field, 'reverse') else '', field,
1571 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1572 self._get_field_setting(field, 'limit_text'),
1573 self._get_field_setting(field, 'limit'))
1574 if self._get_field_setting(field, 'limit_text') is not None else '')
1575 for field in self._order if self._get_field_setting(field, 'visible')]))
1576
1577 def _calculate_field_preference_from_value(self, format, field, type, value):
1578 reverse = self._get_field_setting(field, 'reverse')
1579 closest = self._get_field_setting(field, 'closest')
1580 limit = self._get_field_setting(field, 'limit')
1581
1582 if type == 'extractor':
1583 maximum = self._get_field_setting(field, 'max')
1584 if value is None or (maximum is not None and value >= maximum):
f983b875 1585 value = -1
eb8a4433 1586 elif type == 'boolean':
1587 in_list = self._get_field_setting(field, 'in_list')
1588 not_in_list = self._get_field_setting(field, 'not_in_list')
1589 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1590 elif type == 'ordered':
1591 value = self._resolve_field_value(field, value, True)
1592
1593 # try to convert to number
1594 val_num = float_or_none(value)
1595 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1596 if is_num:
1597 value = val_num
1598
1599 return ((-10, 0) if value is None
1600 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1601 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1602 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1603 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1604 else (-1, value, 0))
1605
1606 def _calculate_field_preference(self, format, field):
1607 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1608 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1609 if type == 'multiple':
1610 type = 'field' # Only 'field' is allowed in multiple for now
1611 actual_fields = self._get_field_setting(field, 'field')
1612
1613 def wrapped_function(values):
1614 values = tuple(filter(lambda x: x is not None, values))
1615 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1616 else values[0] if values
1617 else None)
1618
1619 value = wrapped_function((get_value(f) for f in actual_fields))
1620 else:
1621 value = get_value(field)
1622 return self._calculate_field_preference_from_value(format, field, type, value)
1623
1624 def calculate_preference(self, format):
1625 # Determine missing protocol
1626 if not format.get('protocol'):
1627 format['protocol'] = determine_protocol(format)
1628
1629 # Determine missing ext
1630 if not format.get('ext') and 'url' in format:
1631 format['ext'] = determine_ext(format['url'])
1632 if format.get('vcodec') == 'none':
1633 format['audio_ext'] = format['ext']
1634 format['video_ext'] = 'none'
1635 else:
1636 format['video_ext'] = format['ext']
1637 format['audio_ext'] = 'none'
1638 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1639 # format['preference'] = -1000
1640
1641 # Determine missing bitrates
1642 if format.get('tbr') is None:
1643 if format.get('vbr') is not None and format.get('abr') is not None:
1644 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1645 else:
1646 if format.get('vcodec') != "none" and format.get('vbr') is None:
1647 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1648 if format.get('acodec') != "none" and format.get('abr') is None:
1649 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1650
1651 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1652
1653 def _sort_formats(self, formats, field_preference=[]):
1654 if not formats:
1655 raise ExtractorError('No video formats found')
1656 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1657 format_sort.evaluate_params(self._downloader.params, field_preference)
1658 if self._downloader.params.get('verbose', False):
1659 format_sort.print_verbose_info(self._downloader.to_screen)
1660 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1661
96a53167
S
1662 def _check_formats(self, formats, video_id):
1663 if formats:
1664 formats[:] = filter(
1665 lambda f: self._is_valid_url(
1666 f['url'], video_id,
1667 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1668 formats)
1669
f5bdb444
S
1670 @staticmethod
1671 def _remove_duplicate_formats(formats):
1672 format_urls = set()
1673 unique_formats = []
1674 for f in formats:
1675 if f['url'] not in format_urls:
1676 format_urls.add(f['url'])
1677 unique_formats.append(f)
1678 formats[:] = unique_formats
1679
45024183 1680 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1681 url = self._proto_relative_url(url, scheme='http:')
1682 # For now assume non HTTP(S) URLs always valid
1683 if not (url.startswith('http://') or url.startswith('https://')):
1684 return True
96a53167 1685 try:
45024183 1686 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1687 return True
8bdd16b4 1688 except ExtractorError as e:
25e911a9 1689 self.to_screen(
8bdd16b4 1690 '%s: %s URL is invalid, skipping: %s'
1691 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1692 return False
96a53167 1693
20991253 1694 def http_scheme(self):
1ede5b24 1695 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1696 return (
1697 'http:'
1698 if self._downloader.params.get('prefer_insecure', False)
1699 else 'https:')
1700
57c7411f
PH
1701 def _proto_relative_url(self, url, scheme=None):
1702 if url is None:
1703 return url
1704 if url.startswith('//'):
1705 if scheme is None:
1706 scheme = self.http_scheme()
1707 return scheme + url
1708 else:
1709 return url
1710
4094b6e3
PH
1711 def _sleep(self, timeout, video_id, msg_template=None):
1712 if msg_template is None:
f1a9d64e 1713 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1714 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1715 self.to_screen(msg)
1716 time.sleep(timeout)
1717
f983b875 1718 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1719 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1720 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
f036a632
JMF
1721 manifest = self._download_xml(
1722 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1723 'Unable to download f4m manifest',
1724 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1725 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1726 transform_source=transform_source,
7360c06f 1727 fatal=fatal, data=data, headers=headers, query=query)
4de61310
S
1728
1729 if manifest is False:
8d29e47f 1730 return []
31bb8d3f 1731
0fdbb332 1732 return self._parse_f4m_formats(
f983b875 1733 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1734 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1735
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Parse an f4m (HDS) manifest XML element into a list of format dicts.

        Handles both 1.0 and 2.0 manifest namespaces, skips DRM-protected
        renditions, and recurses into referenced f4m/m3u8 sub-manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # audio-only manifests are detected via their mimeType
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
1837
f983b875 1838 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1839 return {
f207019c 1840 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1841 'url': m3u8_url,
1842 'ext': ext,
1843 'protocol': 'm3u8',
37768f92 1844 'preference': preference - 100 if preference else -100,
f983b875 1845 'quality': quality,
704df56d
PH
1846 'resolution': 'multiple',
1847 'format_note': 'Quality selection URL',
16da9bbc
YCH
1848 }
1849
1850 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
f983b875 1851 entry_protocol='m3u8', preference=None, quality=None,
310c2ed2 1852 m3u8_id=None, live=False, note=None, errnote=None,
1853 fatal=True, data=None, headers={}, query={}):
dbd82a1d 1854 res = self._download_webpage_handle(
81515ad9 1855 m3u8_url, video_id,
621ed9f5 1856 note=note or 'Downloading m3u8 information',
13af92fd 1857 errnote=errnote or 'Failed to download m3u8 information',
7360c06f 1858 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1859
dbd82a1d 1860 if res is False:
8d29e47f 1861 return []
cb252080 1862
dbd82a1d 1863 m3u8_doc, urlh = res
37113045 1864 m3u8_url = urlh.geturl()
9cdffeeb 1865
cb252080
S
1866 return self._parse_m3u8_formats(
1867 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
310c2ed2 1868 preference=preference, quality=quality, m3u8_id=m3u8_id,
1869 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1870 headers=headers, query=query, video_id=video_id)
cb252080
S
1871
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None, quality=None,
                            m3u8_id=None, live=False, note=None, errnote=None,
                            fatal=True, data=None, headers={}, query={}, video_id=None):
        """Parse an m3u8 (HLS) playlist document into a list of format dicts.

        Media playlists are returned as a single format pointing at
        *m3u8_url*; master playlists are expanded into one format per
        variant stream and per EXT-X-MEDIA rendition. DRM-protected
        playlists (Adobe Flash Access, Apple FairPlay) yield no formats.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None):
            # Split a media playlist into one entry per discontinuity
            # section when --hls-split-discontinuity is enabled; otherwise
            # return a single entry covering the whole playlist. Downloads
            # the playlist first when only its URL is given.
            if not m3u8_doc:
                if not format_url:
                    return []
                res = self._download_webpage_handle(
                    format_url, video_id,
                    note=False,
                    errnote=errnote or 'Failed to download m3u8 playlist information',
                    fatal=fatal, data=data, headers=headers, query=query)

                if res is False:
                    return []

                m3u8_doc, urlh = res
                format_url = urlh.geturl()

            playlist_formats = []
            i = (
                0
                if split_discontinuity
                else None)
            format_info = {
                'index': i,
                'key_data': None,
                'files': [],
            }
            for line in m3u8_doc.splitlines():
                if not line.startswith('#'):
                    format_info['files'].append(line)
                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
                    i += 1
                    playlist_formats.append(format_info)
                    format_info = {
                        'index': i,
                        'url': format_url,
                        'files': [],
                    }
            playlist_formats.append(format_info)
            return playlist_formats

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is

            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)

            for format in playlist_formats:
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                format_index = format.get('index')
                if format_index:
                    format_id.append(str(format_index))
                f = {
                    'format_id': '-'.join(format_id),
                    'format_index': format_index,
                    'url': m3u8_url,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                }
                formats.append(f)

            return formats

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Register one #EXT-X-MEDIA rendition; VIDEO/AUDIO renditions
            # with their own URI become standalone formats.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                format_id = []
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)

                for format in playlist_formats:
                    format_index = format.get('index')
                    for v in (m3u8_id, group_id, name):
                        if v:
                            format_id.append(v)
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'language': media.get('LANGUAGE'),
                        'ext': ext,
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'
                    formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # non-comment line: the variant URI for the preceding
                # #EXT-X-STREAM-INF tag
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)

                for format in playlist_formats:
                    format_id = []
                    if m3u8_id:
                        format_id.append(m3u8_id)
                    format_index = format.get('index')
                    stream_name = build_stream_name()
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats
2125
a107193e
S
2126 @staticmethod
2127 def _xpath_ns(path, namespace=None):
2128 if not namespace:
2129 return path
2130 out = []
2131 for c in path.split('/'):
2132 if not c or c == '.':
2133 out.append(c)
2134 else:
2135 out.append('{%s}%s' % (namespace, c))
2136 return '/'.join(out)
2137
09f572fb 2138 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2139 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2140
995029a1
PH
2141 if smil is False:
2142 assert not fatal
2143 return []
e89a2aab 2144
17712eeb 2145 namespace = self._parse_smil_namespace(smil)
a107193e
S
2146
2147 return self._parse_smil_formats(
2148 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2149
2150 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2151 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2152 if smil is False:
2153 return {}
2154 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2155
09f572fb 2156 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2157 return self._download_xml(
2158 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2159 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2160
2161 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2162 namespace = self._parse_smil_namespace(smil)
a107193e
S
2163
2164 formats = self._parse_smil_formats(
2165 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2166 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2167
2168 video_id = os.path.splitext(url_basename(smil_url))[0]
2169 title = None
2170 description = None
647eab45 2171 upload_date = None
a107193e
S
2172 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2173 name = meta.attrib.get('name')
2174 content = meta.attrib.get('content')
2175 if not name or not content:
2176 continue
2177 if not title and name == 'title':
2178 title = content
2179 elif not description and name in ('description', 'abstract'):
2180 description = content
647eab45
S
2181 elif not upload_date and name == 'date':
2182 upload_date = unified_strdate(content)
a107193e 2183
1e5bcdec
S
2184 thumbnails = [{
2185 'id': image.get('type'),
2186 'url': image.get('src'),
2187 'width': int_or_none(image.get('width')),
2188 'height': int_or_none(image.get('height')),
2189 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2190
a107193e
S
2191 return {
2192 'id': video_id,
2193 'title': title or video_id,
2194 'description': description,
647eab45 2195 'upload_date': upload_date,
1e5bcdec 2196 'thumbnails': thumbnails,
a107193e
S
2197 'formats': formats,
2198 'subtitles': subtitles,
2199 }
2200
17712eeb
S
    def _parse_smil_namespace(self, smil):
        """Extract the XML namespace from the root <smil> tag (None if absent)."""
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2204
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract format dicts from a parsed SMIL document.

        Each <video>/<audio> element is dispatched by protocol/extension:
        RTMP, m3u8 (HLS), f4m (HDS), MPD (DASH), ISM (Smooth Streaming) or
        plain progressive HTTP. transform_rtmp_url, when given, may rewrite
        (streamer, play_path) pairs for RTMP entries.
        """
        # <meta base=...> (or httpBase) overrides the SMIL URL as the base
        # for relative sources; the first one found wins
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # a single-entry m3u8 gets the quality metadata copied from
                # the SMIL element, which the playlist itself may lack
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
2299
ce00af87 2300 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2301 urls = []
a107193e
S
2302 subtitles = {}
2303 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2304 src = textstream.get('src')
d413095f 2305 if not src or src in urls:
a107193e 2306 continue
d413095f 2307 urls.append(src)
df634be2 2308 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2309 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2310 subtitles.setdefault(lang, []).append({
2311 'url': src,
2312 'ext': ext,
2313 })
2314 return subtitles
63757032 2315
47a5cb77 2316 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2317 xspf = self._download_xml(
47a5cb77 2318 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2319 'Unable to download xspf manifest', fatal=fatal)
2320 if xspf is False:
2321 return []
47a5cb77
S
2322 return self._parse_xspf(
2323 xspf, playlist_id, xspf_url=xspf_url,
2324 xspf_base_url=base_url(xspf_url))
8d6765cf 2325
47a5cb77 2326 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2327 NS_MAP = {
2328 'xspf': 'http://xspf.org/ns/0/',
2329 's1': 'http://static.streamone.nl/player/ns/0',
2330 }
2331
2332 entries = []
47a5cb77 2333 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2334 title = xpath_text(
98044462 2335 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2336 description = xpath_text(
2337 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2338 thumbnail = xpath_text(
2339 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2340 duration = float_or_none(
2341 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2342
47a5cb77
S
2343 formats = []
2344 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2345 format_url = urljoin(xspf_base_url, location.text)
2346 if not format_url:
2347 continue
2348 formats.append({
2349 'url': format_url,
2350 'manifest_url': xspf_url,
2351 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2352 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2353 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2354 })
8d6765cf
S
2355 self._sort_formats(formats)
2356
2357 entries.append({
2358 'id': playlist_id,
2359 'title': title,
2360 'description': description,
2361 'thumbnail': thumbnail,
2362 'duration': duration,
2363 'formats': formats,
2364 })
2365 return entries
2366
545cc85d 2367 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2368 res = self._download_xml_handle(
1bac3455 2369 mpd_url, video_id,
2370 note=note or 'Downloading MPD manifest',
2371 errnote=errnote or 'Failed to download MPD manifest',
7360c06f 2372 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2373 if res is False:
2d2fa82d 2374 return []
47a5cb77 2375 mpd_doc, urlh = res
c25720ef
RA
2376 if mpd_doc is None:
2377 return []
02dc0a36 2378 mpd_base_url = base_url(urlh.geturl())
1bac3455 2379
91cb6b50 2380 return self._parse_mpd_formats(
545cc85d 2381 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2382
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # Live ('dynamic') manifests are skipped unless explicitly enabled
        if not self._downloader.params.get('dynamic_mpd'):
            if mpd_doc.get('type') == 'dynamic':
                return []

        # namespace URI is embedded in the root tag as '{uri}MPD'
        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Segment info inherits down Period -> AdaptationSet -> Representation
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            # @r = number of additional repeats of this segment
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats = []
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # skip DRM-protected sets unless unplayable formats are allowed
                if skip_unplayable and is_drm_protected(adaptation_set):
                    continue
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if skip_unplayable and is_drm_protected(representation):
                        continue
                    # Representation attributes override AdaptationSet ones
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                        pass
                    elif content_type in ('video', 'audio'):
                        # resolve BaseURL chain from innermost element outwards
                        base_url = ''
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                                    break
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                                mpd_base_url += '/'
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        f = {
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # 'mul'/'und'/'zxx'/'mis' are ISO 639 "no useful language" codes
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        }
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            tmpl = representation_ms_info[template_name]
                            # First of, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                            t = ''
                            in_template = False
                            for c in tmpl:
                                t += c
                                if c == '$':
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                                    t += c
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string; the
                            # result is discarded here, so '$$' escapes are never
                            # unescaped — likely should be t = t.replace('$$', '$')
                            t.replace('$$', '$')
                            return t

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                'initialization',
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                                ('Bandwidth', ))
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,
                            }

                        def location_key(location):
                            # absolute URLs go under 'url', relative ones under 'path'
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    },
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            else:
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_time = 0
                                segment_d = None
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    segment_d = s['d']
                                    add_segment_url()
                                    segment_number += 1
                                    # expand @r repeats into individual fragments
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        add_segment_url()
                                        segment_number += 1
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # No media template
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            fragments = []
                            segment_index = 0
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                    fragments.append({
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                                    })
                                    segment_index += 1
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                            fragments = []
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                fragment = {
                                    location_key(segment_url): segment_url,
                                }
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # If there is a fragments key available then we correctly recognized fragmented media.
                        # Otherwise we will assume unfragmented media with direct access. Technically, such
                        # assumption is not necessarily correct since we may simply have no support for
                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                        if 'fragments' in representation_ms_info:
                            f.update({
                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                        else:
                            # Assuming direct URL to unfragmented media.
                            f['url'] = base_url
                        formats.append(f)
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats
2668
7360c06f 2669 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2670 res = self._download_xml_handle(
b2758123
RA
2671 ism_url, video_id,
2672 note=note or 'Downloading ISM manifest',
2673 errnote=errnote or 'Failed to download ISM manifest',
7360c06f 2674 fatal=fatal, data=data, headers=headers, query=query)
b2758123
RA
2675 if res is False:
2676 return []
47a5cb77 2677 ism_doc, urlh = res
13b08034
S
2678 if ism_doc is None:
2679 return []
b2758123 2680
7947a1f7 2681 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
b2758123
RA
2682
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # live streams are not supported
        if ism_doc.get('IsLive') == 'TRUE':
            return []
        # skip PlayReady-protected manifests unless unplayable formats allowed
        if (not self._downloader.params.get('allow_unplayable_formats')
                and ism_doc.find('Protection') is not None):
            return []

        duration = int(ism_doc.attrib['Duration'])
        # 10,000,000 ticks/second is the Smooth Streaming default timescale
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 denotes AAC when FourCC is absent
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                # substitute {bitrate}/{Bitrate} placeholder, then absolutize
                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    # @t restarts the clock; otherwise continue from previous fragment
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # derive duration from the next fragment's start time
                        try:
                            # NOTE(review): this indexes the single <c> element's
                            # children — presumably stream_fragments[...] (the list)
                            # was intended; confirm before relying on this path
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                format_id = []
                if ism_id:
                    format_id.append(ism_id)
                if stream_name:
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                formats.append({
                    'format_id': '-'.join(format_id),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    # parameters consumed by the ISM fragment downloader
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
        return formats
2780
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """
        Find HTML5 <video>/<audio> tags (and amp-/dl8- variants) in *webpage*
        and return a list of media_info dicts, each with 'formats', 'subtitles'
        and 'thumbnail' keys. Relative URLs are resolved against base_url.
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # extract ext/codec info from a MIME type like 'video/mp4; codecs="avc1"'
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # returns (is_plain_url, formats): manifest srcs (m3u8/mpd) are
            # expanded into multiple formats, plain srcs yield one entry
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # self-closing tags first (no inner content), padded with empty content
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                # nested <source> tags carry alternative renditions
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # fall back to parsing '1280x720'-style labels
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                # nested <track> tags carry subtitles/captions
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            # only emit entries that actually yielded something
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2903
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """
        Extract HDS, HLS and (derived) progressive HTTP formats from an Akamai
        manifest URL. *hosts* may map 'hds'/'hls'/'http' to per-protocol hosts.
        """
        # 'hdnea=' marks a token-signed URL; such URLs must not be rewritten
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []

        hdcore_sign = 'hdcore=3.7.0'
        # HDS lives under /z/ with a .f4m manifest; HLS under /i/ with .m3u8
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # the hdcore param must also be carried on every segment URL
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)

        http_host = hosts.get('http')
        # derive progressive HTTP(S) URLs from HLS ones (only for unsigned URLs)
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            # the comma-separated group in the csmil URL lists one quality per rendition
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # m3u8 list may contain one extra (audio-only) entry
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        # advance the quality index only for video renditions
                        i += 1

        return formats
2959
6ad02195 2960 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 2961 query = compat_urlparse.urlparse(url).query
6ad02195 2962 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
2963 mobj = re.search(
2964 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2965 url_base = mobj.group('url')
2966 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 2967 formats = []
044eeb14
S
2968
2969 def manifest_url(manifest):
2970 m_url = '%s/%s' % (http_base_url, manifest)
2971 if query:
2972 m_url += '?%s' % query
2973 return m_url
2974
6ad02195
RA
2975 if 'm3u8' not in skip_protocols:
2976 formats.extend(self._extract_m3u8_formats(
044eeb14 2977 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
2978 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2979 if 'f4m' not in skip_protocols:
2980 formats.extend(self._extract_f4m_formats(
044eeb14 2981 manifest_url('manifest.f4m'),
6ad02195 2982 video_id, f4m_id='hds', fatal=False))
0384932e
RA
2983 if 'dash' not in skip_protocols:
2984 formats.extend(self._extract_mpd_formats(
044eeb14 2985 manifest_url('manifest.mpd'),
0384932e 2986 video_id, mpd_id='dash', fatal=False))
6ad02195 2987 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
2988 if 'smil' not in skip_protocols:
2989 rtmp_formats = self._extract_smil_formats(
044eeb14 2990 manifest_url('jwplayer.smil'),
6ad02195
RA
2991 video_id, fatal=False)
2992 for rtmp_format in rtmp_formats:
2993 rtsp_format = rtmp_format.copy()
2994 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2995 del rtsp_format['play_path']
2996 del rtsp_format['ext']
2997 rtsp_format.update({
2998 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2999 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3000 'protocol': 'rtsp',
3001 })
3002 formats.extend([rtmp_format, rtsp_format])
3003 else:
3004 for protocol in ('rtmp', 'rtsp'):
3005 if protocol not in skip_protocols:
3006 formats.append({
f2e2f0c7 3007 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
3008 'format_id': protocol,
3009 'protocol': protocol,
3010 })
3011 return formats
3012
c73e330e 3013 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3014 mobj = re.search(
ac9c69ac 3015 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3016 webpage)
3017 if mobj:
c73e330e
RU
3018 try:
3019 jwplayer_data = self._parse_json(mobj.group('options'),
3020 video_id=video_id,
3021 transform_source=transform_source)
3022 except ExtractorError:
3023 pass
3024 else:
3025 if isinstance(jwplayer_data, dict):
3026 return jwplayer_data
a4a554a7
YCH
3027
3028 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3029 jwplayer_data = self._find_jwplayer_data(
3030 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3031 return self._parse_jwplayer_data(
3032 jwplayer_data, video_id, *args, **kwargs)
3033
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer setup/config dict into an info dict.

        Normalizes the several historical JWPlayer config layouts (flattened
        playlist, single playlist item, flattened sources) before parsing.

        @param jwplayer_data  JWPlayer setup options (as found by
                              _find_jwplayer_data)
        @param video_id       fallback id; otherwise 'mediaid' is used
        @param require_title  if true, a missing 'title' raises KeyError
        @param rtmp_params    extra fields merged into RTMP formats
        @param base_url       base for resolving relative media/track URLs
        @returns a single info dict when there is exactly one entry,
                 otherwise a playlist result
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by label (default 'en').
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    # Other kinds (e.g. chapters, thumbnails) are not subtitles.
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # A lone YouTube URL is delegated to the YouTube extractor.
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3101
ed0cf9b3
S
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a JWPlayer 'sources' list into yt-dlp format dicts.

        Dispatches on each source's MIME type / URL extension: HLS, DASH and
        SMIL sources are delegated to the corresponding manifest extractors;
        audio sources become audio-only formats; everything else becomes a
        progressive (or RTMP) format. Duplicate URLs are skipped.

        @param jwplayer_sources_data  list of JWPlayer source dicts
        @param rtmp_params            extra fields merged into RTMP formats
        @param base_url               base for resolving relative file URLs
        @returns a list of format dicts
        """
        urls = []  # already-seen source URLs, used for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # Split "rtmp://host/app/mp4:path" into the RTMP URL and
                    # the play path (prefix kept on the play path).
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3165
f4b1c7ad
PH
3166 def _live_title(self, name):
3167 """ Generate the title for a live video """
3168 now = datetime.datetime.now()
611c1dd9 3169 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3170 return name + ' ' + now_str
3171
b14f3a4c
PH
3172 def _int(self, v, name, fatal=False, **kwargs):
3173 res = int_or_none(v, **kwargs)
3174 if 'get_attr' in kwargs:
3175 print(getattr(v, kwargs['get_attr']))
3176 if res is None:
3177 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3178 if fatal:
3179 raise ExtractorError(msg)
3180 else:
3181 self._downloader.report_warning(msg)
3182 return res
3183
3184 def _float(self, v, name, fatal=False, **kwargs):
3185 res = float_or_none(v, **kwargs)
3186 if res is None:
3187 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3188 if fatal:
3189 raise ExtractorError(msg)
3190 else:
3191 self._downloader.report_warning(msg)
3192 return res
3193
40e41780
TF
3194 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3195 path='/', secure=False, discard=False, rest={}, **kwargs):
6c22cee6 3196 cookie = compat_cookiejar_Cookie(
4ed2d7b7 3197 0, name, value, port, port is not None, domain, True,
40e41780
TF
3198 domain.startswith('.'), path, True, secure, expire_time,
3199 discard, None, None, rest)
42939b61
JMF
3200 self._downloader.cookiejar.set_cookie(cookie)
3201
799207e8 3202 def _get_cookies(self, url):
3203 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
5c2266df 3204 req = sanitized_Request(url)
799207e8 3205 self._downloader.cookiejar.add_cookie_header(req)
3206 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
3207
e3c1266f 3208 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3209 """
3210 Apply first Set-Cookie header instead of the last. Experimental.
3211
3212 Some sites (e.g. [1-3]) may serve two cookies under the same name
3213 in Set-Cookie header and expect the first (old) one to be set rather
3214 than second (new). However, as of RFC6265 the newer one cookie
3215 should be set into cookie store what actually happens.
3216 We will workaround this issue by resetting the cookie to
3217 the first one manually.
3218 1. https://new.vk.com/
3219 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3220 3. https://learning.oreilly.com/
3221 """
e3c1266f
S
3222 for header, cookies in url_handle.headers.items():
3223 if header.lower() != 'set-cookie':
3224 continue
3225 if sys.version_info[0] >= 3:
3226 cookies = cookies.encode('iso-8859-1')
3227 cookies = cookies.decode('utf-8')
3228 cookie_value = re.search(
3229 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3230 if cookie_value:
3231 value, domain = cookie_value.groups()
3232 self._set_cookie(domain, cookie, value)
3233 break
3234
05900629
PH
3235 def get_testcases(self, include_onlymatching=False):
3236 t = getattr(self, '_TEST', None)
3237 if t:
3238 assert not hasattr(self, '_TESTS'), \
3239 '%s has _TEST and _TESTS' % type(self).__name__
3240 tests = [t]
3241 else:
3242 tests = getattr(self, '_TESTS', [])
3243 for t in tests:
3244 if not include_onlymatching and t.get('only_matching', False):
3245 continue
3246 t['name'] = type(self).__name__[:-len('IE')]
3247 yield t
3248
3249 def is_suitable(self, age_limit):
3250 """ Test whether the extractor is generally suitable for the given
3251 age limit (i.e. pornographic sites are not, all others usually are) """
3252
3253 any_restricted = False
3254 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3255 if tc.get('playlist', []):
05900629
PH
3256 tc = tc['playlist'][0]
3257 is_restricted = age_restricted(
3258 tc.get('info_dict', {}).get('age_limit'), age_limit)
3259 if not is_restricted:
3260 return True
3261 any_restricted = any_restricted or is_restricted
3262 return not any_restricted
3263
a504ced0 3264 def extract_subtitles(self, *args, **kwargs):
3089bc74
S
3265 if (self._downloader.params.get('writesubtitles', False)
3266 or self._downloader.params.get('listsubtitles')):
9868ea49
JMF
3267 return self._get_subtitles(*args, **kwargs)
3268 return {}
a504ced0
JMF
3269
    def _get_subtitles(self, *args, **kwargs):
        # Subclass hook invoked by extract_subtitles(); must return the
        # subtitles dict for the video.
        raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3272
912e0b7e
YCH
3273 @staticmethod
3274 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3275 """ Merge subtitle items for one language. Items with duplicated URLs
3276 will be dropped. """
3277 list1_urls = set([item['url'] for item in subtitle_list1])
3278 ret = list(subtitle_list1)
3279 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3280 return ret
3281
3282 @classmethod
8c97f819 3283 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 3284 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
3285 ret = dict(subtitle_dict1)
3286 for lang in subtitle_dict2:
8c97f819 3287 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
3288 return ret
3289
360e1ca5 3290 def extract_automatic_captions(self, *args, **kwargs):
3089bc74
S
3291 if (self._downloader.params.get('writeautomaticsub', False)
3292 or self._downloader.params.get('listsubtitles')):
9868ea49
JMF
3293 return self._get_automatic_captions(*args, **kwargs)
3294 return {}
360e1ca5
JMF
3295
    def _get_automatic_captions(self, *args, **kwargs):
        # Subclass hook invoked by extract_automatic_captions(); must return
        # the automatic captions dict for the video.
        raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3298
d77ab8e2 3299 def mark_watched(self, *args, **kwargs):
3089bc74
S
3300 if (self._downloader.params.get('mark_watched', False)
3301 and (self._get_login_info()[0] is not None
3302 or self._downloader.params.get('cookiefile') is not None)):
d77ab8e2
S
3303 self._mark_watched(*args, **kwargs)
3304
    def _mark_watched(self, *args, **kwargs):
        # Subclass hook invoked by mark_watched() once the option and
        # authentication checks have passed.
        raise NotImplementedError('This method must be implemented by subclasses')
3307
38cce791
YCH
3308 def geo_verification_headers(self):
3309 headers = {}
3310 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3311 if geo_verification_proxy:
3312 headers['Ytdl-request-proxy'] = geo_verification_proxy
3313 return headers
3314
98763ee3
YCH
3315 def _generic_id(self, url):
3316 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3317
3318 def _generic_title(self, url):
3319 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3320
8dbe9899 3321
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Accepted shapes: '<key>:q', '<key><n>:q' (n >= 1) or '<key>all:q'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        # This extractor handles exactly the URLs matching its search pattern.
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        term = match.group('query')
        if prefix == '':
            # Bare search key: fetch only the first result.
            return self._get_n_results(term, 1)
        if prefix == 'all':
            return self._get_n_results(term, self._MAX_RESULTS)
        # Numeric prefix: fetch that many results, clamped to _MAX_RESULTS.
        count = int(prefix)
        if count <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (count, term))
        if count > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(term, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY