]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/common.py
Fix HLS playlist downloading (#127)
[yt-dlp.git] / yt_dlp / extractor / common.py
CommitLineData
fd475508 1# coding: utf-8
6a3828fd 2from __future__ import unicode_literals
f1a9d64e 3
d6983cb4 4import base64
f4b1c7ad 5import datetime
3ec05685 6import hashlib
3d3538e4 7import json
4094b6e3 8import netrc
d6983cb4 9import os
773f291d 10import random
d6983cb4
PH
11import re
12import socket
f8c7bed1 13import ssl
d6983cb4 14import sys
4094b6e3 15import time
1bac3455 16import math
d6983cb4 17
8c25f81b 18from ..compat import (
6c22cee6 19 compat_cookiejar_Cookie,
799207e8 20 compat_cookies,
ee0ba927 21 compat_etree_Element,
e9c0cdd3 22 compat_etree_fromstring,
e64b7569 23 compat_getpass,
d391b7e2 24 compat_integer_types,
d6983cb4 25 compat_http_client,
e9c0cdd3
YCH
26 compat_os_name,
27 compat_str,
d6983cb4 28 compat_urllib_error,
98763ee3 29 compat_urllib_parse_unquote,
15707c7e 30 compat_urllib_parse_urlencode,
41d06b04 31 compat_urllib_request,
f0b5d6af 32 compat_urlparse,
e01c3d2e 33 compat_xml_parse_error,
8c25f81b 34)
eb8a4433 35from ..downloader import FileDownloader
48107c19
S
36from ..downloader.f4m import (
37 get_base_url,
38 remove_encrypted_media,
39)
8c25f81b 40from ..utils import (
c342041f 41 NO_DEFAULT,
05900629 42 age_restricted,
02dc0a36 43 base_url,
08f2a92c 44 bug_reports_message,
d6983cb4
PH
45 clean_html,
46 compiled_regex_type,
70f0f5a8 47 determine_ext,
46b18f23 48 determine_protocol,
d493f15c 49 dict_get,
9b9c5355 50 error_to_compat_str,
d6983cb4 51 ExtractorError,
46b18f23 52 extract_attributes,
97f4aecf 53 fix_xml_ampersands,
b14f3a4c 54 float_or_none,
773f291d
S
55 GeoRestrictedError,
56 GeoUtils,
31bb8d3f 57 int_or_none,
a4a554a7 58 js_to_json,
0685d972 59 JSON_LD_RE,
46b18f23
JH
60 mimetype2ext,
61 orderedSet,
d493f15c 62 parse_bitrate,
46b18f23
JH
63 parse_codecs,
64 parse_duration,
4ca2a3cf 65 parse_iso8601,
46b18f23 66 parse_m3u8_attributes,
d493f15c 67 parse_resolution,
55b3e45b 68 RegexNotFoundError,
5c2266df 69 sanitized_Request,
46b18f23 70 sanitize_filename,
d493f15c 71 str_or_none,
ce5b9040 72 str_to_int,
f856816b 73 strip_or_none,
f38de77f 74 unescapeHTML,
647eab45 75 unified_strdate,
6b3a3098 76 unified_timestamp,
46b18f23
JH
77 update_Request,
78 update_url_query,
79 urljoin,
a107193e 80 url_basename,
bebef109 81 url_or_none,
a6571f10 82 xpath_element,
8d6765cf
S
83 xpath_text,
84 xpath_with_ns,
d6983cb4 85)
c342041f 86
d6983cb4
PH
87
88class InfoExtractor(object):
89 """Information Extractor class.
90
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
5d380852 95 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
98
cf0649f8 99 The type field determines the type of the result.
fed5d032
PH
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
102
103 For a video, the dictionaries must include the following fields:
d6983cb4
PH
104
105 id: Video identifier.
d6983cb4 106 title: Video title, unescaped.
d67b0b15 107
f49d89ee 108 Additionally, it must contain either a formats entry or a url one:
d67b0b15 109
f49d89ee
PH
110 formats: A list of dictionaries for each format available, ordered
111 from worst to best quality.
112
113 Potential fields:
c790e93a
S
114 * url The mandatory URL representing the media:
115 for plain file media - HTTP URL of this file,
116 for RTMP - RTMP URL,
117 for HLS - URL of the M3U8 media playlist,
118 for HDS - URL of the F4M manifest,
79d2077e
S
119 for DASH
120 - HTTP URL to plain file media (in case of
121 unfragmented media)
122 - URL of the MPD manifest or base URL
123 representing the media if MPD manifest
8ed7a233 124 is parsed from a string (in case of
79d2077e 125 fragmented media)
c790e93a 126 for MSS - URL of the ISM manifest.
86f4d14f
S
127 * manifest_url
128 The URL of the manifest file in case of
c790e93a
S
129 fragmented media:
130 for HLS - URL of the M3U8 master playlist,
131 for HDS - URL of the F4M manifest,
132 for DASH - URL of the MPD manifest,
133 for MSS - URL of the ISM manifest.
10952eb2 134 * ext Will be calculated from URL if missing
d67b0b15
PH
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height.
138 and format_note fields if missing.
139 * format_id A short description of the format
5d4f3985
PH
140 ("mp4_h264_opus" or "19").
141 Technically optional, but strongly recommended.
d67b0b15
PH
142 * format_note Additional info about the format
143 ("3D" or "DASH video")
144 * width Width of the video, if known
145 * height Height of the video, if known
f49d89ee 146 * resolution Textual description of width and height
7217e148 147 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
148 * abr Average audio bitrate in KBit/s
149 * acodec Name of the audio codec in use
dd27fd17 150 * asr Audio sampling rate in Hertz
d67b0b15 151 * vbr Average video bitrate in KBit/s
fbb21cf5 152 * fps Frame rate
d67b0b15 153 * vcodec Name of the video codec in use
1394ce65 154 * container Name of the container format
d67b0b15 155 * filesize The number of bytes, if known in advance
9732d77e 156 * filesize_approx An estimate for the number of bytes
d67b0b15 157 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
158 * protocol The protocol that will be used for the actual
159 download, lower-case.
b04b8852 160 "http", "https", "rtsp", "rtmp", "rtmpe",
af7d5a63 161 "m3u8", "m3u8_native" or "http_dash_segments".
c58c2d63
S
162 * fragment_base_url
163 Base URL for fragments. Each fragment's path
164 value (if present) will be relative to
165 this URL.
166 * fragments A list of fragments of a fragmented media.
167 Each fragment entry must contain either an url
168 or a path. If an url is present it should be
169 considered by a client. Otherwise both path and
170 fragment_base_url must be present. Here is
171 the list of all potential fields:
172 * "url" - fragment's URL
173 * "path" - fragment's path relative to
174 fragment_base_url
a0d5077c
S
175 * "duration" (optional, int or float)
176 * "filesize" (optional, int)
f49d89ee 177 * preference Order number of this format. If this field is
08d13955 178 present and not None, the formats get sorted
38d63d84 179 by this field, regardless of all other values.
f49d89ee
PH
180 -1 for default (order by other properties),
181 -2 or smaller for less than default.
e65566a9
PH
182 < -1000 to hide the format (if there is
183 another one which is strictly better)
32f90364
PH
184 * language Language code, e.g. "de" or "en-US".
185 * language_preference Is this in the language mentioned in
186 the URL?
aff2f4f4
PH
187 10 if it's what the URL is about,
188 -1 for default (don't know),
189 -10 otherwise, other values reserved for now.
5d73273f
PH
190 * quality Order number of the video quality of this
191 format, irrespective of the file format.
192 -1 for default (order by other properties),
193 -2 or smaller for less than default.
c64ed2a3
PH
194 * source_preference Order number for this video source
195 (quality takes higher priority)
196 -1 for default (order by other properties),
197 -2 or smaller for less than default.
d769be6c
PH
198 * http_headers A dictionary of additional HTTP headers
199 to add to the request.
6271f1ca 200 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
201 video's pixels are not square.
202 width : height ratio as float.
203 * no_resume The server does not support resuming the
204 (HTTP or RTMP) download. Boolean.
00c97e3e
S
205 * downloader_options A dictionary of downloader options as
206 described in FileDownloader
3dee7826 207
c0ba0f48 208 url: Final video URL.
d6983cb4 209 ext: Video filename extension.
d67b0b15
PH
210 format: The video format, defaults to ext (used for --get-format)
211 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 212
d6983cb4
PH
213 The following fields are optional:
214
f5e43bc6 215 alt_title: A secondary title of the video.
0afef30b
PH
216 display_id An alternative identifier for the video, not necessarily
217 unique, but available before title. Typically, id is
218 something like "4234987", title "Dancing naked mole rats",
219 and display_id "dancing-naked-mole-rats"
d5519808 220 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 221 * "id" (optional, string) - Thumbnail format ID
d5519808 222 * "url"
cfb56d1a 223 * "preference" (optional, int) - quality of the image
d5519808
PH
224 * "width" (optional, int)
225 * "height" (optional, int)
5e1c39ac 226 * "resolution" (optional, string "{width}x{height}",
d5519808 227 deprecated)
2de624fd 228 * "filesize" (optional, int)
d6983cb4 229 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 230 description: Full video description.
d6983cb4 231 uploader: Full name of the video uploader.
2bc0c46f 232 license: License name the video is licensed under.
8a92e51c 233 creator: The creator of the video.
8aab976b 234 release_date: The date (YYYYMMDD) when the video was released.
955c4514 235 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 236 upload_date: Video upload date (YYYYMMDD).
955c4514 237 If not explicitly set, calculated from timestamp.
d6983cb4 238 uploader_id: Nickname or id of the video uploader.
7bcd2830 239 uploader_url: Full URL to a personal webpage of the video uploader.
6f1f59f3 240 channel: Full name of the channel the video is uploaded on.
0e7b8d3e 241 Note that channel fields may or may not repeat uploader
6f1f59f3
S
242 fields. This depends on a particular extractor.
243 channel_id: Id of the channel.
244 channel_url: Full URL to a channel webpage.
da9ec3b9 245 location: Physical location where the video was filmed.
a504ced0 246 subtitles: The available subtitles as a dictionary in the format
4606c34e
YCH
247 {tag: subformats}. "tag" is usually a language code, and
248 "subformats" is a list sorted from lower to higher
249 preference, each element is a dictionary with the "ext"
250 entry and one of:
a504ced0 251 * "data": The subtitles file contents
10952eb2 252 * "url": A URL pointing to the subtitles file
4bba3716 253 "ext" will be calculated from URL if missing
360e1ca5
JMF
254 automatic_captions: Like 'subtitles', used by the YoutubeIE for
255 automatically generated captions
62d231c0 256 duration: Length of the video in seconds, as an integer or float.
f3d29461 257 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
258 like_count: Number of positive ratings of the video
259 dislike_count: Number of negative ratings of the video
02835c6b 260 repost_count: Number of reposts of the video
2d30521a 261 average_rating: Average rating give by users, the scale used depends on the webpage
19e3dfc9 262 comment_count: Number of comments on the video
dd622d7c
PH
263 comments: A list of comments, each with one or more of the following
264 properties (all but one of text or html optional):
265 * "author" - human-readable name of the comment author
266 * "author_id" - user ID of the comment author
267 * "id" - Comment ID
268 * "html" - Comment as HTML
269 * "text" - Plain text of the comment
270 * "timestamp" - UNIX timestamp of comment
271 * "parent" - ID of the comment this one is replying to.
272 Set to "root" to indicate that this is a
273 comment to the original video.
8dbe9899 274 age_limit: Age restriction for the video, as an integer (years)
7a5c1cfe 275 webpage_url: The URL to the video webpage, if given to yt-dlp it
9103bbc5
JMF
276 should allow to get the same result again. (It will be set
277 by YoutubeDL if it's missing)
ad3bc6ac
PH
278 categories: A list of categories that the video falls in, for example
279 ["Sports", "Berlin"]
864f24bd 280 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
281 is_live: True, False, or None (=unknown). Whether this video is a
282 live stream that goes on instead of a fixed-length video.
f76ede8e 283 was_live: True, False, or None (=unknown). Whether this video was
284 originally a live stream.
7c80519c 285 start_time: Time in seconds where the reproduction should start, as
10952eb2 286 specified in the URL.
297a564b 287 end_time: Time in seconds where the reproduction should end, as
10952eb2 288 specified in the URL.
55949fed 289 chapters: A list of dictionaries, with the following entries:
290 * "start_time" - The start time of the chapter in seconds
291 * "end_time" - The end time of the chapter in seconds
292 * "title" (optional, string)
6cfda058 293 playable_in_embed: Whether this video is allowed to play in embedded
294 players on other sites. Can be True (=always allowed),
295 False (=never allowed), None (=unknown), or a string
296 specifying the criteria for embedability (Eg: 'whitelist').
277d6ff5 297 __post_extractor: A function to be called just before the metadata is
298 written to either disk, logger or console. The function
299 must return a dict which will be added to the info_dict.
300 This is usefull for additional information that is
301 time-consuming to extract. Note that the fields thus
302 extracted will not be available to output template and
303 match_filter. So, only "comments" and "comment_count" are
304 currently allowed to be extracted via this method.
d6983cb4 305
7109903e
S
306 The following fields should only be used when the video belongs to some logical
307 chapter or section:
308
309 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
310 chapter_number: Number of the chapter the video belongs to, as an integer.
311 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
312
313 The following fields should only be used when the video is an episode of some
8d76bdf1 314 series, programme or podcast:
7109903e
S
315
316 series: Title of the series or programme the video episode belongs to.
317 season: Title of the season the video episode belongs to.
27bfd4e5
S
318 season_number: Number of the season the video episode belongs to, as an integer.
319 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
320 episode: Title of the video episode. Unlike mandatory video title field,
321 this field should denote the exact title of the video episode
322 without any kind of decoration.
27bfd4e5
S
323 episode_number: Number of the video episode within a season, as an integer.
324 episode_id: Id of the video episode, as a unicode string.
7109903e 325
7a93ab5f
S
326 The following fields should only be used when the media is a track or a part of
327 a music album:
328
329 track: Title of the track.
330 track_number: Number of the track within an album or a disc, as an integer.
331 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
332 as a unicode string.
333 artist: Artist(s) of the track.
334 genre: Genre(s) of the track.
335 album: Title of the album the track belongs to.
336 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
337 album_artist: List of all artists appeared on the album (e.g.
338 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
339 and compilations).
340 disc_number: Number of the disc or other physical medium the track belongs to,
341 as an integer.
342 release_year: Year (YYYY) when the album was released.
343
deefc05b 344 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 345
d838b1bd
PH
346 Unless mentioned otherwise, None is equivalent to absence of information.
347
fed5d032
PH
348
349 _type "playlist" indicates multiple videos.
b82f815f
PH
350 There must be a key "entries", which is a list, an iterable, or a PagedList
351 object, each element of which is a valid dictionary by this specification.
fed5d032 352
b60419c5 353 Additionally, playlists can have "id", "title", and any other relevent
354 attributes with the same semantics as videos (see above).
fed5d032
PH
355
356
357 _type "multi_video" indicates that there are multiple videos that
358 form a single show, for examples multiple acts of an opera or TV episode.
359 It must have an entries key like a playlist and contain all the keys
360 required for a video at the same time.
361
362
363 _type "url" indicates that the video must be extracted from another
364 location, possibly by a different extractor. Its only required key is:
365 "url" - the next URL to extract.
f58766ce
PH
366 The key "ie_key" can be set to the class name (minus the trailing "IE",
367 e.g. "Youtube") if the extractor class is known in advance.
368 Additionally, the dictionary may have any properties of the resolved entity
369 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
370 known ahead of time.
371
372
373 _type "url_transparent" entities have the same specification as "url", but
374 indicate that the given additional information is more precise than the one
375 associated with the resolved URL.
376 This is useful when a site employs a video service that hosts the video and
377 its technical metadata, but that video service does not embed a useful
378 title, description etc.
379
380
d6983cb4
PH
381 Subclasses of this one should re-define the _real_initialize() and
382 _real_extract() methods and define a _VALID_URL regexp.
383 Probably, they should also be added to the list of extractors.
384
4248dad9 385 _GEO_BYPASS attribute may be set to False in order to disable
773f291d
S
386 geo restriction bypass mechanisms for a particular extractor.
387 Though it won't disable explicit geo restriction bypass based on
504f20dd 388 country code provided with geo_bypass_country.
4248dad9
S
389
390 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
391 countries for this extractor. One of these countries will be used by
392 geo restriction bypass mechanism right away in order to bypass
504f20dd 393 geo restriction, of course, if the mechanism is not disabled.
773f291d 394
5f95927a
S
395 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
396 IP blocks in CIDR notation for this extractor. One of these IP blocks
397 will be used by geo restriction bypass mechanism similarly
504f20dd 398 to _GEO_COUNTRIES.
3ccdde8c 399
d6983cb4
PH
400 Finally, the _WORKING attribute should be set to False for broken IEs
401 in order to warn the users and skip the tests.
402 """
403
404 _ready = False
405 _downloader = None
773f291d 406 _x_forwarded_for_ip = None
4248dad9
S
407 _GEO_BYPASS = True
408 _GEO_COUNTRIES = None
5f95927a 409 _GEO_IP_BLOCKS = None
d6983cb4
PH
410 _WORKING = True
411
412 def __init__(self, downloader=None):
413 """Constructor. Receives an optional downloader."""
414 self._ready = False
773f291d 415 self._x_forwarded_for_ip = None
d6983cb4
PH
416 self.set_downloader(downloader)
417
418 @classmethod
419 def suitable(cls, url):
420 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
421
422 # This does not use has/getattr intentionally - we want to know whether
423 # we have cached the regexp for *this* class, whereas getattr would also
424 # match the superclass
425 if '_VALID_URL_RE' not in cls.__dict__:
426 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
427 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 428
ed9266db
PH
429 @classmethod
430 def _match_id(cls, url):
431 if '_VALID_URL_RE' not in cls.__dict__:
432 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
433 m = cls._VALID_URL_RE.match(url)
434 assert m
1afd0b0d 435 return compat_str(m.group('id'))
ed9266db 436
d6983cb4
PH
437 @classmethod
438 def working(cls):
439 """Getter method for _WORKING."""
440 return cls._WORKING
441
442 def initialize(self):
443 """Initializes an instance (authentication, etc)."""
5f95927a
S
444 self._initialize_geo_bypass({
445 'countries': self._GEO_COUNTRIES,
446 'ip_blocks': self._GEO_IP_BLOCKS,
447 })
4248dad9
S
448 if not self._ready:
449 self._real_initialize()
450 self._ready = True
451
5f95927a 452 def _initialize_geo_bypass(self, geo_bypass_context):
e39b5d4a
S
453 """
454 Initialize geo restriction bypass mechanism.
455
456 This method is used to initialize geo bypass mechanism based on faking
457 X-Forwarded-For HTTP header. A random country from provided country list
dc0a869e 458 is selected and a random IP belonging to this country is generated. This
e39b5d4a
S
459 IP will be passed as X-Forwarded-For HTTP header in all subsequent
460 HTTP requests.
e39b5d4a
S
461
462 This method will be used for initial geo bypass mechanism initialization
5f95927a
S
463 during the instance initialization with _GEO_COUNTRIES and
464 _GEO_IP_BLOCKS.
e39b5d4a 465
5f95927a 466 You may also manually call it from extractor's code if geo bypass
e39b5d4a 467 information is not available beforehand (e.g. obtained during
5f95927a
S
468 extraction) or due to some other reason. In this case you should pass
469 this information in geo bypass context passed as first argument. It may
470 contain following fields:
471
472 countries: List of geo unrestricted countries (similar
473 to _GEO_COUNTRIES)
474 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
475 (similar to _GEO_IP_BLOCKS)
476
e39b5d4a 477 """
773f291d 478 if not self._x_forwarded_for_ip:
5f95927a
S
479
480 # Geo bypass mechanism is explicitly disabled by user
481 if not self._downloader.params.get('geo_bypass', True):
482 return
483
484 if not geo_bypass_context:
485 geo_bypass_context = {}
486
487 # Backward compatibility: previously _initialize_geo_bypass
488 # expected a list of countries, some 3rd party code may still use
489 # it this way
490 if isinstance(geo_bypass_context, (list, tuple)):
491 geo_bypass_context = {
492 'countries': geo_bypass_context,
493 }
494
495 # The whole point of geo bypass mechanism is to fake IP
496 # as X-Forwarded-For HTTP header based on some IP block or
497 # country code.
498
499 # Path 1: bypassing based on IP block in CIDR notation
500
501 # Explicit IP block specified by user, use it right away
502 # regardless of whether extractor is geo bypassable or not
503 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
504
505 # Otherwise use random IP block from geo bypass context but only
506 # if extractor is known as geo bypassable
507 if not ip_block:
508 ip_blocks = geo_bypass_context.get('ip_blocks')
509 if self._GEO_BYPASS and ip_blocks:
510 ip_block = random.choice(ip_blocks)
511
512 if ip_block:
513 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
514 if self._downloader.params.get('verbose', False):
515 self._downloader.to_screen(
516 '[debug] Using fake IP %s as X-Forwarded-For.'
517 % self._x_forwarded_for_ip)
518 return
519
520 # Path 2: bypassing based on country code
521
522 # Explicit country code specified by user, use it right away
523 # regardless of whether extractor is geo bypassable or not
524 country = self._downloader.params.get('geo_bypass_country', None)
525
526 # Otherwise use random country code from geo bypass context but
527 # only if extractor is known as geo bypassable
528 if not country:
529 countries = geo_bypass_context.get('countries')
530 if self._GEO_BYPASS and countries:
531 country = random.choice(countries)
532
533 if country:
534 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
4248dad9 535 if self._downloader.params.get('verbose', False):
6a9cb295 536 self._downloader.to_screen(
eea0716c 537 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
5f95927a 538 % (self._x_forwarded_for_ip, country.upper()))
d6983cb4
PH
539
540 def extract(self, url):
541 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 542 try:
773f291d
S
543 for _ in range(2):
544 try:
545 self.initialize()
0016b84e
S
546 ie_result = self._real_extract(url)
547 if self._x_forwarded_for_ip:
548 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
549 return ie_result
773f291d 550 except GeoRestrictedError as e:
4248dad9
S
551 if self.__maybe_fake_ip_and_retry(e.countries):
552 continue
773f291d 553 raise
3a5bcd03
PH
554 except ExtractorError:
555 raise
556 except compat_http_client.IncompleteRead as e:
dfb1b146 557 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 558 except (KeyError, StopIteration) as e:
dfb1b146 559 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 560
4248dad9 561 def __maybe_fake_ip_and_retry(self, countries):
3089bc74
S
562 if (not self._downloader.params.get('geo_bypass_country', None)
563 and self._GEO_BYPASS
564 and self._downloader.params.get('geo_bypass', True)
565 and not self._x_forwarded_for_ip
566 and countries):
eea0716c
S
567 country_code = random.choice(countries)
568 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
569 if self._x_forwarded_for_ip:
570 self.report_warning(
eea0716c
S
571 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
572 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
573 return True
574 return False
575
d6983cb4
PH
576 def set_downloader(self, downloader):
577 """Sets the downloader for this IE."""
578 self._downloader = downloader
579
580 def _real_initialize(self):
581 """Real initialization process. Redefine in subclasses."""
582 pass
583
584 def _real_extract(self, url):
585 """Real extraction process. Redefine in subclasses."""
586 pass
587
56c73665
JMF
588 @classmethod
589 def ie_key(cls):
590 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 591 return compat_str(cls.__name__[:-2])
56c73665 592
d6983cb4
PH
593 @property
594 def IE_NAME(self):
dc519b54 595 return compat_str(type(self).__name__[:-2])
d6983cb4 596
d391b7e2
S
597 @staticmethod
598 def __can_accept_status_code(err, expected_status):
599 assert isinstance(err, compat_urllib_error.HTTPError)
600 if expected_status is None:
601 return False
602 if isinstance(expected_status, compat_integer_types):
603 return err.code == expected_status
604 elif isinstance(expected_status, (list, tuple)):
605 return err.code in expected_status
606 elif callable(expected_status):
607 return expected_status(err.code) is True
608 else:
609 assert False
610
611 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
612 """
613 Return the response handle.
614
615 See _download_webpage docstring for arguments specification.
616 """
1cf376f5 617 if not self._downloader._first_webpage_request:
618 sleep_interval = float_or_none(self._downloader.params.get('sleep_interval_requests')) or 0
619 if sleep_interval > 0:
5ef7d9bd 620 self.to_screen('Sleeping %s seconds ...' % sleep_interval)
1cf376f5 621 time.sleep(sleep_interval)
622 else:
623 self._downloader._first_webpage_request = False
624
d6983cb4
PH
625 if note is None:
626 self.report_download_webpage(video_id)
627 elif note is not False:
7cc3570e 628 if video_id is None:
f1a9d64e 629 self.to_screen('%s' % (note,))
7cc3570e 630 else:
f1a9d64e 631 self.to_screen('%s: %s' % (video_id, note))
2132edaa
S
632
633 # Some sites check X-Forwarded-For HTTP header in order to figure out
634 # the origin of the client behind proxy. This allows bypassing geo
635 # restriction by faking this header's value to IP that belongs to some
636 # geo unrestricted country. We will do so once we encounter any
637 # geo restriction error.
638 if self._x_forwarded_for_ip:
639 if 'X-Forwarded-For' not in headers:
640 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
641
41d06b04
S
642 if isinstance(url_or_request, compat_urllib_request.Request):
643 url_or_request = update_Request(
644 url_or_request, data=data, headers=headers, query=query)
645 else:
cdfee168 646 if query:
647 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 648 if data is not None or headers:
41d06b04 649 url_or_request = sanitized_Request(url_or_request, data, headers)
f8c7bed1
S
650 exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
651 if hasattr(ssl, 'CertificateError'):
652 exceptions.append(ssl.CertificateError)
d6983cb4 653 try:
dca08720 654 return self._downloader.urlopen(url_or_request)
f8c7bed1 655 except tuple(exceptions) as err:
d391b7e2
S
656 if isinstance(err, compat_urllib_error.HTTPError):
657 if self.__can_accept_status_code(err, expected_status):
95e42d73
XDG
658 # Retain reference to error to prevent file object from
659 # being closed before it can be read. Works around the
660 # effects of <https://bugs.python.org/issue15002>
661 # introduced in Python 3.4.1.
662 err.fp._error = err
d391b7e2
S
663 return err.fp
664
aa94a6d3
PH
665 if errnote is False:
666 return False
d6983cb4 667 if errnote is None:
f1a9d64e 668 errnote = 'Unable to download webpage'
7f8b2714 669
9b9c5355 670 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
671 if fatal:
672 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
673 else:
674 self._downloader.report_warning(errmsg)
675 return False
d6983cb4 676
d391b7e2
S
677 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
678 """
679 Return a tuple (page content as string, URL handle).
680
681 See _download_webpage docstring for arguments specification.
682 """
b9d3e163
PH
683 # Strip hashes from the URL (#1038)
684 if isinstance(url_or_request, (compat_str, str)):
685 url_or_request = url_or_request.partition('#')[0]
686
d391b7e2 687 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
688 if urlh is False:
689 assert not fatal
690 return False
c9a77969 691 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
692 return (content, urlh)
693
c9a77969
YCH
694 @staticmethod
695 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
696 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
697 if m:
698 encoding = m.group(1)
699 else:
0d75ae2c 700 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
701 webpage_bytes[:1024])
702 if m:
703 encoding = m.group(1).decode('ascii')
b60016e8
PH
704 elif webpage_bytes.startswith(b'\xff\xfe'):
705 encoding = 'utf-16'
f143d86a
PH
706 else:
707 encoding = 'utf-8'
c9a77969
YCH
708
709 return encoding
710
4457823d
S
    def __check_blocked(self, content):
        """Raise ExtractorError (expected=True) if `content` is a known
        network-filtering or censorship block page rather than the real page."""
        first_block = content[:512]
        # Websense corporate filtering: title appears anywhere, vendor name
        # must be near the top of the page
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian RKN (Roskomnadzor) blocklist page served by the TTK ISP
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)
738
c9a77969
YCH
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read and decode the body of an already-open URL handle.

        Honors the dump_intermediate_pages and write_pages options, guesses
        the encoding when not given, and raises for known block pages.
        prefix, when given, is prepended to the raw bytes before decoding.
        note/errnote/fatal are accepted for signature symmetry with the
        _download_* helpers but are not used here.
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 so binary/long pages survive terminal output
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep filenames within filesystem limits but still unique
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # 'replace' so undecodable bytes never abort extraction; fall back
        # to utf-8 if the guessed codec name is unknown to Python
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
d6983cb4 775
d391b7e2
S
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        # Only a truncated response (IncompleteRead) is retried here, up to
        # `tries` attempts with `timeout` seconds between them; every other
        # error propagates straight out of _download_webpage_handle.
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        # res is False only when fatal=False and the download failed
        if res is False:
            return res
        else:
            content, _ = res
            return content
d6983cb4 833
e0d198c1
S
834 def _download_xml_handle(
835 self, url_or_request, video_id, note='Downloading XML',
836 errnote='Unable to download XML', transform_source=None,
d391b7e2
S
837 fatal=True, encoding=None, data=None, headers={}, query={},
838 expected_status=None):
839 """
ee0ba927 840 Return a tuple (xml as an compat_etree_Element, URL handle).
d391b7e2
S
841
842 See _download_webpage docstring for arguments specification.
843 """
e0d198c1
S
844 res = self._download_webpage_handle(
845 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
846 encoding=encoding, data=data, headers=headers, query=query,
847 expected_status=expected_status)
e0d198c1
S
848 if res is False:
849 return res
850 xml_string, urlh = res
851 return self._parse_xml(
852 xml_string, video_id, transform_source=transform_source,
853 fatal=fatal), urlh
854
d391b7e2
S
855 def _download_xml(
856 self, url_or_request, video_id,
857 note='Downloading XML', errnote='Unable to download XML',
858 transform_source=None, fatal=True, encoding=None,
859 data=None, headers={}, query={}, expected_status=None):
860 """
ee0ba927 861 Return the xml as an compat_etree_Element.
d391b7e2
S
862
863 See _download_webpage docstring for arguments specification.
864 """
e0d198c1
S
865 res = self._download_xml_handle(
866 url_or_request, video_id, note=note, errnote=errnote,
867 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
868 data=data, headers=headers, query=query,
869 expected_status=expected_status)
e0d198c1 870 return res if res is False else res[0]
e01c3d2e
S
871
872 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
e2b38da9
PH
873 if transform_source:
874 xml_string = transform_source(xml_string)
e01c3d2e
S
875 try:
876 return compat_etree_fromstring(xml_string.encode('utf-8'))
877 except compat_xml_parse_error as ve:
878 errmsg = '%s: Failed to parse XML ' % video_id
879 if fatal:
880 raise ExtractorError(errmsg, cause=ve)
881 else:
882 self.report_warning(errmsg + str(ve))
267ed0c5 883
0fe7783e
S
884 def _download_json_handle(
885 self, url_or_request, video_id, note='Downloading JSON metadata',
886 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
887 fatal=True, encoding=None, data=None, headers={}, query={},
888 expected_status=None):
889 """
890 Return a tuple (JSON object, URL handle).
891
892 See _download_webpage docstring for arguments specification.
893 """
0fe7783e 894 res = self._download_webpage_handle(
c9a77969 895 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
896 encoding=encoding, data=data, headers=headers, query=query,
897 expected_status=expected_status)
0fe7783e
S
898 if res is False:
899 return res
900 json_string, urlh = res
ebb64199 901 return self._parse_json(
0fe7783e
S
902 json_string, video_id, transform_source=transform_source,
903 fatal=fatal), urlh
904
905 def _download_json(
906 self, url_or_request, video_id, note='Downloading JSON metadata',
907 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
908 fatal=True, encoding=None, data=None, headers={}, query={},
909 expected_status=None):
910 """
911 Return the JSON object as a dict.
912
913 See _download_webpage docstring for arguments specification.
914 """
0fe7783e
S
915 res = self._download_json_handle(
916 url_or_request, video_id, note=note, errnote=errnote,
917 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
918 data=data, headers=headers, query=query,
919 expected_status=expected_status)
0fe7783e 920 return res if res is False else res[0]
ebb64199
TF
921
922 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
923 if transform_source:
924 json_string = transform_source(json_string)
3d3538e4
PH
925 try:
926 return json.loads(json_string)
927 except ValueError as ve:
e7b6d122
PH
928 errmsg = '%s: Failed to parse JSON ' % video_id
929 if fatal:
930 raise ExtractorError(errmsg, cause=ve)
931 else:
932 self.report_warning(errmsg + str(ve))
3d3538e4 933
    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, prefixed with '[IE_NAME]'
        and, when given, the video_id."""
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 938
d6983cb4
PH
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]' (IE_NAME of this extractor)"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
942
    def report_extraction(self, id_or_name):
        """Report that information extraction for id_or_name has started."""
        self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
946
    def report_download_webpage(self, video_id):
        """Report that the webpage for video_id is being downloaded."""
        self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
950
    def report_age_confirmation(self):
        """Report attempt to confirm age restriction."""
        self.to_screen('Confirming age')
d6983cb4 954
fc79158d
JMF
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')
fc79158d 958
43e7d3c9
S
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """Abort extraction, telling the user how to supply credentials."""
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)
964
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Abort extraction with a GeoRestrictedError; `countries` (a list of
        country codes, if known) is passed through to the error."""
        raise GeoRestrictedError(msg, countries=countries)
c430802e 968
5f6a1245 969 # Methods for following #608
c0d0b01f 970 @staticmethod
830d53bf 971 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 972 """Returns a URL that points to a page that should be processed"""
5f6a1245 973 # TODO: ie should be the class used for getting the info
d6983cb4
PH
974 video_info = {'_type': 'url',
975 'url': url,
976 'ie_key': ie}
7012b23c
PH
977 if video_id is not None:
978 video_info['id'] = video_id
830d53bf
S
979 if video_title is not None:
980 video_info['title'] = video_title
d6983cb4 981 return video_info
5f6a1245 982
749ca5ec
S
983 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
984 urls = orderedSet(
46b18f23
JH
985 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
986 for m in matches)
987 return self.playlist_result(
749ca5ec 988 urls, playlist_id=playlist_id, playlist_title=playlist_title)
46b18f23 989
c0d0b01f 990 @staticmethod
b60419c5 991 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
d6983cb4
PH
992 """Returns a playlist"""
993 video_info = {'_type': 'playlist',
994 'entries': entries}
b60419c5 995 video_info.update(kwargs)
d6983cb4
PH
996 if playlist_id:
997 video_info['id'] = playlist_id
998 if playlist_title:
999 video_info['title'] = playlist_title
ecc97af3 1000 if playlist_description is not None:
acf5cbfe 1001 video_info['description'] = playlist_description
d6983cb4
PH
1002 return video_info
1003
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # List of patterns: first one that matches wins
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Color the field name blue in error output on capable terminals
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
1037
c342041f 1038 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
1039 """
1040 Like _search_regex, but strips HTML tags and unescapes entities.
1041 """
711ede6e 1042 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
1043 if res:
1044 return clean_html(res).strip()
1045 else:
1046 return res
1047
2118fdd1
RA
    def _get_netrc_login_info(self, netrc_machine=None):
        """Look up (username, password) for netrc_machine (default:
        self._NETRC_MACHINE) in the user's .netrc when --netrc is enabled.

        Returns (None, None) when disabled or unavailable; .netrc parse
        errors and missing entries only produce a warning.
        """
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password
2118fdd1 1067
    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).

        First look for manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password
fc79158d 1090
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently just uses the command line option.
        If there's no info available, return None.
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        # Fall back to interactively prompting the user
        return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1106
46720279
JMF
1107 # Helper functions for extracting OpenGraph info
1108 @staticmethod
ab2d5247 1109 def _og_regexes(prop):
448ef1f3 1110 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
22f5f5c6 1111 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
7a6d76a6 1112 % {'prop': re.escape(prop)})
78fb87b2 1113 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1114 return [
78fb87b2
JMF
1115 template % (property_re, content_re),
1116 template % (content_re, property_re),
ab2d5247 1117 ]
46720279 1118
864f24bd
S
    @staticmethod
    def _meta_regex(prop):
        """Build a regex matching a <meta> tag whose itemprop/name/property/
        id/http-equiv attribute equals `prop`, capturing its content value
        in the named group 'content'."""
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1124
3c4e6d83 1125 def _og_search_property(self, prop, html, name=None, **kargs):
b070564e
S
1126 if not isinstance(prop, (list, tuple)):
1127 prop = [prop]
46720279 1128 if name is None:
b070564e
S
1129 name = 'OpenGraph %s' % prop[0]
1130 og_regexes = []
1131 for p in prop:
1132 og_regexes.extend(self._og_regexes(p))
1133 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1134 if escaped is None:
1135 return None
1136 return unescapeHTML(escaped)
46720279
JMF
1137
    def _og_search_thumbnail(self, html, **kargs):
        # Non-fatal by default: many pages simply lack an og:image tag
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1140
    def _og_search_description(self, html, **kargs):
        # Non-fatal by default: descriptions are optional metadata
        return self._og_search_property('description', html, fatal=False, **kargs)
1143
    def _og_search_title(self, html, **kargs):
        # Fatal by default (unless overridden via kargs): a title is expected
        return self._og_search_property('title', html, **kargs)
1146
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Extract the og:video URL; with secure=True, og:video:secure_url
        is tried first."""
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
46720279 1152
78338f71
JMF
    def _og_search_url(self, html, **kargs):
        # Fatal by default (unless overridden via kargs)
        return self._og_search_property('url', html, **kargs)
1155
40c696e5 1156 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
88d9f6c0
S
1157 if not isinstance(name, (list, tuple)):
1158 name = [name]
59040888 1159 if display_name is None:
88d9f6c0 1160 display_name = name[0]
59040888 1161 return self._html_search_regex(
88d9f6c0 1162 [self._meta_regex(n) for n in name],
711ede6e 1163 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1164
    def _dc_search_uploader(self, html):
        # Dublin Core creator metadata
        return self._html_search_meta('dc.creator', html, 'uploader')
1167
8dbe9899
PH
1168 def _rta_search(self, html):
1169 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1170 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1171 r' content="RTA-5042-1996-1400-1577-RTA"',
1172 html):
1173 return 18
1174 return 0
1175
59040888
PH
1176 def _media_rating_search(self, html):
1177 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1178 rating = self._html_search_meta('rating', html)
1179
1180 if not rating:
1181 return None
1182
1183 RATING_TABLE = {
1184 'safe for kids': 0,
1185 'general': 8,
1186 '14 years': 14,
1187 'mature': 17,
1188 'restricted': 19,
1189 }
d800609c 1190 return RATING_TABLE.get(rating.lower())
59040888 1191
69319969 1192 def _family_friendly_search(self, html):
6ca7732d 1193 # See http://schema.org/VideoObject
ac8491fc
S
1194 family_friendly = self._html_search_meta(
1195 'isFamilyFriendly', html, default=None)
69319969
NJ
1196
1197 if not family_friendly:
1198 return None
1199
1200 RATING_TABLE = {
1201 '1': 0,
1202 'true': 0,
1203 '0': 18,
1204 'false': 18,
1205 }
d800609c 1206 return RATING_TABLE.get(family_friendly.lower())
69319969 1207
0c708f11
JMF
    def _twitter_search_player(self, html):
        # Twitter card player URL metadata
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')
0c708f11 1211
95b31e26 1212 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
4433bb02 1213 json_ld_list = list(re.finditer(JSON_LD_RE, html))
321b5e08 1214 default = kwargs.get('default', NO_DEFAULT)
321b5e08
S
1215 # JSON-LD may be malformed and thus `fatal` should be respected.
1216 # At the same time `default` may be passed that assumes `fatal=False`
1217 # for _search_regex. Let's simulate the same behavior here as well.
1218 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
4433bb02
S
1219 json_ld = []
1220 for mobj in json_ld_list:
1221 json_ld_item = self._parse_json(
1222 mobj.group('json_ld'), video_id, fatal=fatal)
1223 if not json_ld_item:
1224 continue
1225 if isinstance(json_ld_item, dict):
1226 json_ld.append(json_ld_item)
1227 elif isinstance(json_ld_item, (list, tuple)):
1228 json_ld.extend(json_ld_item)
1229 if json_ld:
1230 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1231 if json_ld:
1232 return json_ld
1233 if default is not NO_DEFAULT:
1234 return default
1235 elif fatal:
1236 raise RegexNotFoundError('Unable to extract JSON-LD')
1237 else:
1238 self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1239 return {}
4ca2a3cf 1240
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert JSON-LD data into an info dict.

        json_ld may be a JSON string, a dict, or a list/tuple of dicts.
        Only top-level items carrying '@context' are considered; when
        expected_type is given, items of other '@type's are skipped.
        Keys with None values are dropped from the result.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        # schema.org interaction @type suffix -> info-dict count kind
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested object
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Fill *_count keys in `info` from InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            # Map a schema.org VideoObject onto the info dict
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'uploader': str_or_none(e.get('author')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    # Without an expected_type keep scanning further items
                    if expected_type is None:
                        continue
                    else:
                        break
                # Item of another type may still embed a VideoObject
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
1365
27713812 1366 @staticmethod
f8da79f8 1367 def _hidden_inputs(html):
586f1cc5 1368 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1369 hidden_inputs = {}
c8498368
S
1370 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1371 attrs = extract_attributes(input)
1372 if not input:
201ea3ee 1373 continue
c8498368 1374 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1375 continue
c8498368
S
1376 name = attrs.get('name') or attrs.get('id')
1377 value = attrs.get('value')
1378 if name and value is not None:
1379 hidden_inputs[name] = value
201ea3ee 1380 return hidden_inputs
27713812 1381
cf61d96d
S
1382 def _form_hidden_inputs(self, form_id, html):
1383 form = self._search_regex(
73eb13df 1384 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1385 html, '%s form' % form_id, group='form')
1386 return self._hidden_inputs(form)
1387
eb8a4433 1388 class FormatSort:
1389 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<seperator>[~:])(?P<limit>.*?))?)? *$'
1390
c10d0213 1391 default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
155d2b48 1392 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
c10d0213 1393 'proto', 'ext', 'has_audio', 'source', 'format_id') # These must not be aliases
eb8a4433 1394
1395 settings = {
1396 'vcodec': {'type': 'ordered', 'regex': True,
155d2b48 1397 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
eb8a4433 1398 'acodec': {'type': 'ordered', 'regex': True,
1399 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
f137c99e 1400 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
63be1aab 1401 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
eb8a4433 1402 'vext': {'type': 'ordered', 'field': 'video_ext',
91ebc640 1403 'order': ('mp4', 'webm', 'flv', '', 'none'),
eb8a4433 1404 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1405 'aext': {'type': 'ordered', 'field': 'audio_ext',
1406 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1407 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1408 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
f983b875 1409 'ie_pref': {'priority': True, 'type': 'extractor'},
63be1aab 1410 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1411 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
f983b875 1412 'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1413 'quality': {'convert': 'float_none', 'type': 'extractor'},
eb8a4433 1414 'filesize': {'convert': 'bytes'},
f137c99e 1415 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1416 'id': {'convert': 'string', 'field': 'format_id'},
eb8a4433 1417 'height': {'convert': 'float_none'},
1418 'width': {'convert': 'float_none'},
1419 'fps': {'convert': 'float_none'},
1420 'tbr': {'convert': 'float_none'},
1421 'vbr': {'convert': 'float_none'},
1422 'abr': {'convert': 'float_none'},
1423 'asr': {'convert': 'float_none'},
f983b875 1424 'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
63be1aab 1425
eb8a4433 1426 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
63be1aab 1427 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1428 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1429 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1430 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1431
1432 # Most of these exist only for compatibility reasons
1433 'dimension': {'type': 'alias', 'field': 'res'},
1434 'resolution': {'type': 'alias', 'field': 'res'},
1435 'extension': {'type': 'alias', 'field': 'ext'},
1436 'bitrate': {'type': 'alias', 'field': 'br'},
eb8a4433 1437 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1438 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1439 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1440 'framerate': {'type': 'alias', 'field': 'fps'},
63be1aab 1441 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1442 'protocol': {'type': 'alias', 'field': 'proto'},
1443 'source_preference': {'type': 'alias', 'field': 'source'},
1444 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1445 'filesize_estimate': {'type': 'alias', 'field': 'size'},
eb8a4433 1446 'samplerate': {'type': 'alias', 'field': 'asr'},
1447 'video_ext': {'type': 'alias', 'field': 'vext'},
1448 'audio_ext': {'type': 'alias', 'field': 'aext'},
1449 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1450 'audio_codec': {'type': 'alias', 'field': 'acodec'},
63be1aab 1451 'video': {'type': 'alias', 'field': 'hasvid'},
1452 'has_video': {'type': 'alias', 'field': 'hasvid'},
1453 'audio': {'type': 'alias', 'field': 'hasaud'},
1454 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1455 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1456 'preference': {'type': 'alias', 'field': 'ie_pref'},
1457 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1458 'format_id': {'type': 'alias', 'field': 'id'},
1459 }
eb8a4433 1460
1461 _order = []
1462
1463 def _get_field_setting(self, field, key):
1464 if field not in self.settings:
1465 self.settings[field] = {}
1466 propObj = self.settings[field]
1467 if key not in propObj:
1468 type = propObj.get('type')
1469 if key == 'field':
1470 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1471 elif key == 'convert':
1472 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
4bcc7bd1 1473 else:
eb8a4433 1474 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1475 propObj[key] = default
1476 return propObj[key]
1477
    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field *value* into a comparable value using the
        field's configured 'convert' setting.

        ``None`` is returned unchanged unless *convertNone* is set (used for
        ordered fields, where None must still be ranked in the order list).
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return FileDownloader.parse_bytes(value)
        elif conversion == 'order':
            # Rank by position in the preference list; an '' entry marks the
            # rank assigned to values that are not in the list at all.
            # Higher return value = earlier (more preferred) position.
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                # Entries are regexes; first matching entry wins
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string' conversion: numeric strings become floats; once a
            # non-numeric value is seen the whole field degrades to string
            # comparison for subsequent values
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value
1511
    def evaluate_params(self, params, sort_extractor):
        """Build the effective ordered list of sort fields (self._order) from
        downloader *params*, the extractor-supplied *sort_extractor* order and
        the class defaults.

        Each entry in the combined sort list is parsed with ``self.regex``
        into field name, reverse flag ('+'), proximity flag ('~') and limit.
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one sort field; the first occurrence of a field wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # 'forced' default fields always come first; 'priority' defaults are
        # skipped when format_sort_force is set, letting the user order win
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                # Resolve alias names (e.g. 'framerate' -> 'fps')
                field = self._get_field_setting(field, 'field')
            reverse = match.group('reverse') is not None
            closest = match.group('seperator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # Combined fields (e.g. 'res') expand to several sub-fields, each
            # optionally with its own ':'-separated limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)
1564
    def print_verbose_info(self, to_screen):
        """Print (via *to_screen*) the user/extractor sort orders and the
        final resolved field order, for --verbose output."""
        to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
        if self._sort_extractor:
            to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        # Each field is rendered as [+]name[~|:limit_text(limit)]
        to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))
1576
    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map a single field *value* to a comparison tuple that forms one
        component of the format's overall sort key."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            # Extractor preference is capped at 'max'; unknown or over-limit
            # values are demoted to -1
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            # 0 when the value is acceptable per in_list/not_in_list, else -1
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value)
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))
1605
    def _calculate_field_preference(self, format, field):
        """Compute the sort-key component contributed by *field* for *format*."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            def wrapped_function(values):
                # Combine the sub-field values with the configured 'function'
                # (e.g. max); a single non-None value passes through, and
                # all-None yields None
                values = tuple(filter(lambda x: x is not None, values))
                return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
                        else values[0] if values
                        else None)

            value = wrapped_function((get_value(f) for f in actual_fields))
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)
1623
1624 def calculate_preference(self, format):
1625 # Determine missing protocol
1626 if not format.get('protocol'):
1627 format['protocol'] = determine_protocol(format)
1628
1629 # Determine missing ext
1630 if not format.get('ext') and 'url' in format:
1631 format['ext'] = determine_ext(format['url'])
1632 if format.get('vcodec') == 'none':
1633 format['audio_ext'] = format['ext']
1634 format['video_ext'] = 'none'
1635 else:
1636 format['video_ext'] = format['ext']
1637 format['audio_ext'] = 'none'
1638 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1639 # format['preference'] = -1000
1640
1641 # Determine missing bitrates
1642 if format.get('tbr') is None:
1643 if format.get('vbr') is not None and format.get('abr') is not None:
1644 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1645 else:
1646 if format.get('vcodec') != "none" and format.get('vbr') is None:
1647 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1648 if format.get('acodec') != "none" and format.get('abr') is None:
1649 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1650
1651 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1652
1653 def _sort_formats(self, formats, field_preference=[]):
1654 if not formats:
1655 raise ExtractorError('No video formats found')
1656 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1657 format_sort.evaluate_params(self._downloader.params, field_preference)
1658 if self._downloader.params.get('verbose', False):
1659 format_sort.print_verbose_info(self._downloader.to_screen)
1660 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1661
96a53167
S
1662 def _check_formats(self, formats, video_id):
1663 if formats:
1664 formats[:] = filter(
1665 lambda f: self._is_valid_url(
1666 f['url'], video_id,
1667 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1668 formats)
1669
f5bdb444
S
1670 @staticmethod
1671 def _remove_duplicate_formats(formats):
1672 format_urls = set()
1673 unique_formats = []
1674 for f in formats:
1675 if f['url'] not in format_urls:
1676 format_urls.add(f['url'])
1677 unique_formats.append(f)
1678 formats[:] = unique_formats
1679
45024183 1680 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1681 url = self._proto_relative_url(url, scheme='http:')
1682 # For now assume non HTTP(S) URLs always valid
1683 if not (url.startswith('http://') or url.startswith('https://')):
1684 return True
96a53167 1685 try:
45024183 1686 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1687 return True
8bdd16b4 1688 except ExtractorError as e:
25e911a9 1689 self.to_screen(
8bdd16b4 1690 '%s: %s URL is invalid, skipping: %s'
1691 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1692 return False
96a53167 1693
20991253 1694 def http_scheme(self):
1ede5b24 1695 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1696 return (
1697 'http:'
1698 if self._downloader.params.get('prefer_insecure', False)
1699 else 'https:')
1700
57c7411f
PH
1701 def _proto_relative_url(self, url, scheme=None):
1702 if url is None:
1703 return url
1704 if url.startswith('//'):
1705 if scheme is None:
1706 scheme = self.http_scheme()
1707 return scheme + url
1708 else:
1709 return url
1710
4094b6e3
PH
1711 def _sleep(self, timeout, video_id, msg_template=None):
1712 if msg_template is None:
f1a9d64e 1713 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1714 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1715 self.to_screen(msg)
1716 time.sleep(timeout)
1717
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        """Download an Adobe HDS (f4m) manifest and return the extracted
        format dicts; [] when the download fails and *fatal* is False."""
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1735
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """Extract formats from a parsed f4m (Adobe HDS) manifest element.

        Handles both the 1.0 and 2.0 manifest namespaces, skips DRM-protected
        media, and recurses into referenced set-level f4m/m3u8 manifests.
        """
        if not isinstance(manifest, compat_etree_Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio-only mimeType means every media node is audio-only
        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats
1837
f983b875 1838 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1839 return {
f207019c 1840 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1841 'url': m3u8_url,
1842 'ext': ext,
1843 'protocol': 'm3u8',
37768f92 1844 'preference': preference - 100 if preference else -100,
f983b875 1845 'quality': quality,
704df56d
PH
1846 'resolution': 'multiple',
1847 'format_note': 'Quality selection URL',
16da9bbc
YCH
1848 }
1849
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None, quality=None,
                              m3u8_id=None, live=False, note=None, errnote=None,
                              fatal=True, data=None, headers={}, query={}):
        """Download an HLS (m3u8) playlist and return the extracted format
        dicts; [] when the download fails and *fatal* is False."""
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
            return []

        m3u8_doc, urlh = res
        # Use the final URL (after redirects) as the base for relative URLs
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)
cb252080
S
1871
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None, quality=None,
                            m3u8_id=None, live=False, note=None, errnote=None,
                            fatal=True, data=None, headers={}, query={}, video_id=None):
        """Extract formats from an HLS playlist document.

        Master playlists yield one format per variant stream plus renditions
        from EXT-X-MEDIA tags; a media playlist is returned as a single
        format.  With the 'hls_split_discontinuity' option, each section
        delimited by #EXT-X-DISCONTINUITY becomes its own format,
        distinguished by 'format_index'.  DRM-protected playlists (Adobe
        Flash Access, Apple FairPlay) yield no formats.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
                                           fatal=True, data=None, headers={}):
            # Split a media playlist into per-discontinuity sections (or keep
            # it as one section), downloading the playlist first when only a
            # URL is given.
            # NOTE(review): the first section dict carries 'key_data' but no
            # 'url', while later sections carry 'url' but no 'key_data' —
            # kept as-is; confirm against the consumers of these dicts.
            if not m3u8_doc:
                if not format_url:
                    return []
                res = self._download_webpage_handle(
                    format_url, video_id,
                    note=False,
                    errnote='Failed to download m3u8 playlist information',
                    fatal=fatal, data=data, headers=headers)

                if res is False:
                    return []

                m3u8_doc, urlh = res
                format_url = urlh.geturl()

            playlist_formats = []
            i = (
                0
                if split_discontinuity
                else None)
            format_info = {
                'index': i,
                'key_data': None,
                'files': [],
            }
            for line in m3u8_doc.splitlines():
                if not line.startswith('#'):
                    format_info['files'].append(line)
                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
                    i += 1
                    playlist_formats.append(format_info)
                    format_info = {
                        'index': i,
                        'url': format_url,
                        'files': [],
                    }
            playlist_formats.append(format_info)
            return playlist_formats

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is

            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)

            for format in playlist_formats:
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                format_index = format.get('index')
                if format_index:
                    format_id.append(str(format_index))
                f = {
                    'format_id': '-'.join(format_id),
                    'format_index': format_index,
                    'url': m3u8_url,
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                }
                formats.append(f)

            return formats

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handle an #EXT-X-MEDIA tag: remember the rendition group and,
            # for AUDIO/VIDEO renditions that have their own URI, emit formats
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                format_id = []
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
                                                                  fatal=fatal, data=data, headers=headers)

                for format in playlist_formats:
                    format_index = format.get('index')
                    for v in (m3u8_id, group_id, name):
                        if v:
                            format_id.append(v)
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'language': media.get('LANGUAGE'),
                        'ext': ext,
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'
                    formats.append(f)

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # A non-comment line right after EXT-X-STREAM-INF is the
                # variant playlist URL [1, 4.3.4.2]
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
                                                                  fatal=fatal, data=data, headers=headers)

                for format in playlist_formats:
                    format_id = []
                    if m3u8_id:
                        format_id.append(m3u8_id)
                    format_index = format.get('index')
                    stream_name = build_stream_name()
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                    if format_index:
                        format_id.append(str(format_index))
                    f = {
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats
2128
a107193e
S
2129 @staticmethod
2130 def _xpath_ns(path, namespace=None):
2131 if not namespace:
2132 return path
2133 out = []
2134 for c in path.split('/'):
2135 if not c or c == '.':
2136 out.append(c)
2137 else:
2138 out.append('{%s}%s' % (namespace, c))
2139 return '/'.join(out)
2140
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        """Download a SMIL document and extract its formats.

        Returns [] when the download failed (only reachable with fatal=False).
        """
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2152
2153 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2154 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2155 if smil is False:
2156 return {}
2157 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2158
    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        """Download and parse a SMIL document as XML.

        Callers check the result against False, which signals a failed
        non-fatal download (see the `smil is False` tests in the callers).
        """
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2163
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict (id, title, formats, subtitles, thumbnails, ...)
        from a parsed SMIL document.

        Note: the *video_id* argument is overridden below by the basename of
        *smil_url*.
        """
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        # First matching <meta> of each kind wins
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
2203
17712eeb
S
    def _parse_smil_namespace(self, smil):
        """Return the XML namespace of the <smil> root tag, or None."""
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2207
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract formats from the <video>/<audio> elements of a SMIL
        document, dispatching on each medium's protocol/extension:
        RTMP, HLS (m3u8), HDS (f4m), DASH (mpd), MSS (ism) or plain HTTP.

        *transform_rtmp_url*, when given, maps (streamer, play_path) to a
        corrected pair for RTMP entries.
        """
        # Resolve the base URL for relative sources from <meta base=...>
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single m3u8 format often lacks quality metadata; enrich it
                # from the SMIL medium attributes
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })

        return formats
2302
ce00af87 2303 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2304 urls = []
a107193e
S
2305 subtitles = {}
2306 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2307 src = textstream.get('src')
d413095f 2308 if not src or src in urls:
a107193e 2309 continue
d413095f 2310 urls.append(src)
df634be2 2311 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2312 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2313 subtitles.setdefault(lang, []).append({
2314 'url': src,
2315 'ext': ext,
2316 })
2317 return subtitles
63757032 2318
47a5cb77 2319 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2320 xspf = self._download_xml(
47a5cb77 2321 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
2322 'Unable to download xspf manifest', fatal=fatal)
2323 if xspf is False:
2324 return []
47a5cb77
S
2325 return self._parse_xspf(
2326 xspf, playlist_id, xspf_url=xspf_url,
2327 xspf_base_url=base_url(xspf_url))
8d6765cf 2328
47a5cb77 2329 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2330 NS_MAP = {
2331 'xspf': 'http://xspf.org/ns/0/',
2332 's1': 'http://static.streamone.nl/player/ns/0',
2333 }
2334
2335 entries = []
47a5cb77 2336 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2337 title = xpath_text(
98044462 2338 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2339 description = xpath_text(
2340 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2341 thumbnail = xpath_text(
2342 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2343 duration = float_or_none(
2344 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2345
47a5cb77
S
2346 formats = []
2347 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2348 format_url = urljoin(xspf_base_url, location.text)
2349 if not format_url:
2350 continue
2351 formats.append({
2352 'url': format_url,
2353 'manifest_url': xspf_url,
2354 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2355 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2356 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2357 })
8d6765cf
S
2358 self._sort_formats(formats)
2359
2360 entries.append({
2361 'id': playlist_id,
2362 'title': title,
2363 'description': description,
2364 'thumbnail': thumbnail,
2365 'duration': duration,
2366 'formats': formats,
2367 })
2368 return entries
2369
545cc85d 2370 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2371 res = self._download_xml_handle(
1bac3455 2372 mpd_url, video_id,
2373 note=note or 'Downloading MPD manifest',
2374 errnote=errnote or 'Failed to download MPD manifest',
7360c06f 2375 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2376 if res is False:
2d2fa82d 2377 return []
47a5cb77 2378 mpd_doc, urlh = res
c25720ef
RA
2379 if mpd_doc is None:
2380 return []
02dc0a36 2381 mpd_base_url = base_url(urlh.geturl())
1bac3455 2382
91cb6b50 2383 return self._parse_mpd_formats(
545cc85d 2384 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2385
545cc85d 2386 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2387 """
2388 Parse formats from MPD manifest.
2389 References:
2390 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2391 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2392 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2393 """
78895bd3
U
2394 if not self._downloader.params.get('dynamic_mpd'):
2395 if mpd_doc.get('type') == 'dynamic':
2396 return []
2d2fa82d 2397
91cb6b50 2398 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2399
2400 def _add_ns(path):
2401 return self._xpath_ns(path, namespace)
2402
675d0016 2403 def is_drm_protected(element):
2404 return element.find(_add_ns('ContentProtection')) is not None
2405
1bac3455 2406 def extract_multisegment_info(element, ms_parent_info):
2407 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2408
2409 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2410 # common attributes and elements. We will only extract relevant
2411 # for us.
2412 def extract_common(source):
2413 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2414 if segment_timeline is not None:
2415 s_e = segment_timeline.findall(_add_ns('S'))
2416 if s_e:
2417 ms_info['total_number'] = 0
2418 ms_info['s'] = []
2419 for s in s_e:
2420 r = int(s.get('r', 0))
2421 ms_info['total_number'] += 1 + r
2422 ms_info['s'].append({
2423 't': int(s.get('t', 0)),
2424 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2425 'd': int(s.attrib['d']),
2426 'r': r,
2427 })
2428 start_number = source.get('startNumber')
2429 if start_number:
2430 ms_info['start_number'] = int(start_number)
2431 timescale = source.get('timescale')
2432 if timescale:
2433 ms_info['timescale'] = int(timescale)
2434 segment_duration = source.get('duration')
2435 if segment_duration:
48504785 2436 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2437
2438 def extract_Initialization(source):
2439 initialization = source.find(_add_ns('Initialization'))
2440 if initialization is not None:
2441 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2442
f14be228 2443 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2444 if segment_list is not None:
b4c1d6e8
S
2445 extract_common(segment_list)
2446 extract_Initialization(segment_list)
f14be228 2447 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2448 if segment_urls_e:
2449 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2450 else:
f14be228 2451 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2452 if segment_template is not None:
b4c1d6e8 2453 extract_common(segment_template)
e228616c
S
2454 media = segment_template.get('media')
2455 if media:
2456 ms_info['media'] = media
1bac3455 2457 initialization = segment_template.get('initialization')
2458 if initialization:
e228616c 2459 ms_info['initialization'] = initialization
1bac3455 2460 else:
b4c1d6e8 2461 extract_Initialization(segment_template)
1bac3455 2462 return ms_info
b323e170 2463
06869367 2464 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
63ad4d43 2465
1bac3455 2466 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 2467 formats = []
f14be228 2468 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2469 period_duration = parse_duration(period.get('duration')) or mpd_duration
2470 period_ms_info = extract_multisegment_info(period, {
2471 'start_number': 1,
2472 'timescale': 1,
2473 })
f14be228 2474 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
06869367 2475 if skip_unplayable and is_drm_protected(adaptation_set):
675d0016 2476 continue
1bac3455 2477 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2478 for representation in adaptation_set.findall(_add_ns('Representation')):
06869367 2479 if skip_unplayable and is_drm_protected(representation):
675d0016 2480 continue
1bac3455 2481 representation_attrib = adaptation_set.attrib.copy()
2482 representation_attrib.update(representation.attrib)
f0948348 2483 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759
YCH
2484 mime_type = representation_attrib['mimeType']
2485 content_type = mime_type.split('/')[0]
1bac3455 2486 if content_type == 'text':
2487 # TODO implement WebVTT downloading
2488 pass
40fcba5e 2489 elif content_type in ('video', 'audio'):
1bac3455 2490 base_url = ''
2491 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 2492 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 2493 if base_url_e is not None:
2494 base_url = base_url_e.text + base_url
2495 if re.match(r'^https?://', base_url):
2496 break
bb20526b
S
2497 if mpd_base_url and not re.match(r'^https?://', base_url):
2498 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2499 mpd_base_url += '/'
1bac3455 2500 base_url = mpd_base_url + base_url
2501 representation_id = representation_attrib.get('id')
d577c796 2502 lang = representation_attrib.get('lang')
51e9094f 2503 url_el = representation.find(_add_ns('BaseURL'))
2504 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2505 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1bac3455 2506 f = {
154c209e 2507 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
86f4d14f 2508 'manifest_url': mpd_url,
a6c8b759 2509 'ext': mimetype2ext(mime_type),
1bac3455 2510 'width': int_or_none(representation_attrib.get('width')),
2511 'height': int_or_none(representation_attrib.get('height')),
9c99bef7 2512 'tbr': float_or_none(bandwidth, 1000),
1bac3455 2513 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2514 'fps': int_or_none(representation_attrib.get('frameRate')),
d577c796 2515 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1bac3455 2516 'format_note': 'DASH %s' % content_type,
51e9094f 2517 'filesize': filesize,
126f225b 2518 'container': mimetype2ext(mime_type) + '_dash',
1bac3455 2519 }
7fe15920 2520 f.update(parse_codecs(representation_attrib.get('codecs')))
1bac3455 2521 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2522
e228616c 2523 def prepare_template(template_name, identifiers):
eca1f0d1
S
2524 tmpl = representation_ms_info[template_name]
2525 # First of, % characters outside $...$ templates
2526 # must be escaped by doubling for proper processing
2527 # by % operator string formatting used further (see
067aa17e 2528 # https://github.com/ytdl-org/youtube-dl/issues/16867).
eca1f0d1
S
2529 t = ''
2530 in_template = False
2531 for c in tmpl:
2532 t += c
2533 if c == '$':
2534 in_template = not in_template
2535 elif c == '%' and not in_template:
2536 t += c
2537 # Next, $...$ templates are translated to their
2538 # %(...) counterparts to be used with % operator
e228616c
S
2539 t = t.replace('$RepresentationID$', representation_id)
2540 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2541 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2542 t.replace('$$', '$')
2543 return t
2544
2545 # @initialization is a regular template like @media one
2546 # so it should be handled just the same way (see
067aa17e 2547 # https://github.com/ytdl-org/youtube-dl/issues/11605)
e228616c
S
2548 if 'initialization' in representation_ms_info:
2549 initialization_template = prepare_template(
2550 'initialization',
2551 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2552 # $Time$ shall not be included for @initialization thus
2553 # only $Bandwidth$ remains
2554 ('Bandwidth', ))
2555 representation_ms_info['initialization_url'] = initialization_template % {
2556 'Bandwidth': bandwidth,
2557 }
2558
1141e910
S
2559 def location_key(location):
2560 return 'url' if re.match(r'^https?://', location) else 'path'
2561
e228616c
S
2562 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2563
2564 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2565 media_location_key = location_key(media_template)
f0948348
S
2566
2567 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2568 # can't be used at the same time
b4c1d6e8
S
2569 if '%(Number' in media_template and 's' not in representation_ms_info:
2570 segment_duration = None
c110944f 2571 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2572 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2573 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2574 representation_ms_info['fragments'] = [{
1141e910 2575 media_location_key: media_template % {
b4c1d6e8 2576 'Number': segment_number,
e228616c 2577 'Bandwidth': bandwidth,
b4c1d6e8
S
2578 },
2579 'duration': segment_duration,
2580 } for segment_number in range(
2581 representation_ms_info['start_number'],
2582 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2583 else:
b4c1d6e8
S
2584 # $Number*$ or $Time$ in media template with S list available
2585 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2586 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
b4c1d6e8 2587 representation_ms_info['fragments'] = []
f0948348 2588 segment_time = 0
b4c1d6e8
S
2589 segment_d = None
2590 segment_number = representation_ms_info['start_number']
f0948348
S
2591
2592 def add_segment_url():
b4c1d6e8
S
2593 segment_url = media_template % {
2594 'Time': segment_time,
e228616c 2595 'Bandwidth': bandwidth,
b4c1d6e8
S
2596 'Number': segment_number,
2597 }
b4c1d6e8 2598 representation_ms_info['fragments'].append({
1141e910 2599 media_location_key: segment_url,
b4c1d6e8
S
2600 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2601 })
f0948348
S
2602
2603 for num, s in enumerate(representation_ms_info['s']):
2604 segment_time = s.get('t') or segment_time
b4c1d6e8 2605 segment_d = s['d']
f0948348 2606 add_segment_url()
b4c1d6e8 2607 segment_number += 1
f0948348 2608 for r in range(s.get('r', 0)):
b4c1d6e8 2609 segment_time += segment_d
f0948348 2610 add_segment_url()
b4c1d6e8
S
2611 segment_number += 1
2612 segment_time += segment_d
2613 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2614 # No media template
2615 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2616 # or any YouTube dashsegments video
2617 fragments = []
d04621da
S
2618 segment_index = 0
2619 timescale = representation_ms_info['timescale']
2620 for s in representation_ms_info['s']:
2621 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2622 for r in range(s.get('r', 0) + 1):
1141e910 2623 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2624 fragments.append({
1141e910 2625 location_key(segment_uri): segment_uri,
d04621da 2626 'duration': duration,
b4c1d6e8 2627 })
d04621da 2628 segment_index += 1
b4c1d6e8 2629 representation_ms_info['fragments'] = fragments
41bf647e
PN
2630 elif 'segment_urls' in representation_ms_info:
2631 # Segment URLs with no SegmentTimeline
2632 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
067aa17e 2633 # https://github.com/ytdl-org/youtube-dl/pull/14844
41bf647e 2634 fragments = []
603fc4e0
S
2635 segment_duration = float_or_none(
2636 representation_ms_info['segment_duration'],
2637 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2638 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2639 fragment = {
41bf647e 2640 location_key(segment_url): segment_url,
603fc4e0
S
2641 }
2642 if segment_duration:
2643 fragment['duration'] = segment_duration
2644 fragments.append(fragment)
41bf647e 2645 representation_ms_info['fragments'] = fragments
79d2077e
S
2646 # If there is a fragments key available then we correctly recognized fragmented media.
2647 # Otherwise we will assume unfragmented media with direct access. Technically, such
2648 # assumption is not necessarily correct since we may simply have no support for
2649 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
86f4d14f 2650 if 'fragments' in representation_ms_info:
1bac3455 2651 f.update({
79d2077e
S
2652 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2653 'url': mpd_url or base_url,
1141e910 2654 'fragment_base_url': base_url,
b4c1d6e8 2655 'fragments': [],
1bac3455 2656 'protocol': 'http_dash_segments',
df374b52 2657 })
1bac3455 2658 if 'initialization_url' in representation_ms_info:
e228616c 2659 initialization_url = representation_ms_info['initialization_url']
1bac3455 2660 if not f.get('url'):
2661 f['url'] = initialization_url
1141e910 2662 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2663 f['fragments'].extend(representation_ms_info['fragments'])
79d2077e
S
2664 else:
2665 # Assuming direct URL to unfragmented media.
2666 f['url'] = base_url
545cc85d 2667 formats.append(f)
17b598d3 2668 else:
1bac3455 2669 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
17b598d3
YCH
2670 return formats
2671
7360c06f 2672 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2673 res = self._download_xml_handle(
b2758123
RA
2674 ism_url, video_id,
2675 note=note or 'Downloading ISM manifest',
2676 errnote=errnote or 'Failed to download ISM manifest',
7360c06f 2677 fatal=fatal, data=data, headers=headers, query=query)
b2758123
RA
2678 if res is False:
2679 return []
47a5cb77 2680 ism_doc, urlh = res
13b08034
S
2681 if ism_doc is None:
2682 return []
b2758123 2683
7947a1f7 2684 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
b2758123
RA
2685
2686 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2687 """
2688 Parse formats from ISM manifest.
2689 References:
2690 1. [MS-SSTR]: Smooth Streaming Protocol,
2691 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2692 """
06869367 2693 if ism_doc.get('IsLive') == 'TRUE':
2694 return []
2695 if (not self._downloader.params.get('allow_unplayable_formats')
2696 and ism_doc.find('Protection') is not None):
b2758123
RA
2697 return []
2698
b2758123
RA
2699 duration = int(ism_doc.attrib['Duration'])
2700 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2701
2702 formats = []
2703 for stream in ism_doc.findall('StreamIndex'):
2704 stream_type = stream.get('Type')
2705 if stream_type not in ('video', 'audio'):
2706 continue
2707 url_pattern = stream.attrib['Url']
2708 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2709 stream_name = stream.get('Name')
2710 for track in stream.findall('QualityLevel'):
2501d41e 2711 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
b2758123
RA
2712 # TODO: add support for WVC1 and WMAP
2713 if fourcc not in ('H264', 'AVC1', 'AACL'):
2714 self.report_warning('%s is not a supported codec' % fourcc)
2715 continue
2716 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2717 # [1] does not mention Width and Height attributes. However,
2718 # they're often present while MaxWidth and MaxHeight are
2719 # missing, so should be used as fallbacks
2720 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2721 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2722 sampling_rate = int_or_none(track.get('SamplingRate'))
2723
2724 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2725 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2726
2727 fragments = []
2728 fragment_ctx = {
2729 'time': 0,
2730 }
2731 stream_fragments = stream.findall('c')
2732 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2733 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2734 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2735 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2736 if not fragment_ctx['duration']:
2737 try:
2738 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2739 except IndexError:
2740 next_fragment_time = duration
1616f9b4 2741 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2742 for _ in range(fragment_repeat):
2743 fragments.append({
1616f9b4 2744 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2745 'duration': fragment_ctx['duration'] / stream_timescale,
2746 })
2747 fragment_ctx['time'] += fragment_ctx['duration']
2748
2749 format_id = []
2750 if ism_id:
2751 format_id.append(ism_id)
2752 if stream_name:
2753 format_id.append(stream_name)
2754 format_id.append(compat_str(tbr))
2755
2756 formats.append({
2757 'format_id': '-'.join(format_id),
2758 'url': ism_url,
2759 'manifest_url': ism_url,
2760 'ext': 'ismv' if stream_type == 'video' else 'isma',
2761 'width': width,
2762 'height': height,
2763 'tbr': tbr,
2764 'asr': sampling_rate,
2765 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2766 'acodec': 'none' if stream_type == 'video' else fourcc,
2767 'protocol': 'ism',
2768 'fragments': fragments,
2769 '_download_params': {
2770 'duration': duration,
2771 'timescale': stream_timescale,
2772 'width': width or 0,
2773 'height': height or 0,
2774 'fourcc': fourcc,
2775 'codec_private_data': track.get('CodecPrivateData'),
2776 'sampling_rate': sampling_rate,
2777 'channels': int_or_none(track.get('Channels', 2)),
2778 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2779 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2780 },
2781 })
2782 return formats
2783
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Scrape HTML5 <video>/<audio> (and amp-/dl8- variants) tags out of
        a webpage and build a list of entry dicts with formats and subtitles.

        base_url -- page URL; relative src/poster/track URLs resolve against
            it and it is sent as the Referer header for all formats
        m3u8_id/mpd_id -- format_id prefixes for HLS/DASH sub-formats
        """
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # 'video/mp4; codecs="avc1.42E01E"' -> {'ext': ..., codec info}
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info={}):
            # Returns (is_plain_url, formats): manifests (m3u8/mpd) expand to
            # multiple formats, everything else is a single direct URL.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        # Self-closing tags first (no inner content), ...
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        # ... then open/close pairs, whose inner content may hold <source>
        # and <track> children.
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            # Fall back to parsing '1080p'-style labels.
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Media served to the page usually requires the page as referer.
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
2906
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """Extract HDS, HLS and (when possible) progressive HTTP formats from
        an Akamai HD/HDS manifest URL.

        hosts -- optional per-protocol host overrides with keys
            'hds', 'hls' and 'http'
        """
        # 'hdnea' is Akamai's signed-token parameter; signed URLs must not
        # be rewritten and get no derived HTTP formats.
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []

        hdcore_sign = 'hdcore=3.7.0'
        # Akamai convention: /i/ + master.m3u8 is the HLS flavor, /z/ +
        # manifest.f4m the HDS flavor of the same stream.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # The hdcore parameter must also be carried on segment URLs.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            # The .csmil URL embeds a comma-separated quality list; each HLS
            # video format maps (in order) onto one of those qualities, which
            # can be fetched directly over HTTP(S).
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            # +1 accounts for a possible extra audio-only rendition.
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            http_url = re.sub(
                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': http_url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        # Only video formats consume a quality slot.
                        i += 1

        return formats
2962
6ad02195 2963 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 2964 query = compat_urlparse.urlparse(url).query
6ad02195 2965 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
2966 mobj = re.search(
2967 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2968 url_base = mobj.group('url')
2969 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 2970 formats = []
044eeb14
S
2971
2972 def manifest_url(manifest):
2973 m_url = '%s/%s' % (http_base_url, manifest)
2974 if query:
2975 m_url += '?%s' % query
2976 return m_url
2977
6ad02195
RA
2978 if 'm3u8' not in skip_protocols:
2979 formats.extend(self._extract_m3u8_formats(
044eeb14 2980 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
2981 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2982 if 'f4m' not in skip_protocols:
2983 formats.extend(self._extract_f4m_formats(
044eeb14 2984 manifest_url('manifest.f4m'),
6ad02195 2985 video_id, f4m_id='hds', fatal=False))
0384932e
RA
2986 if 'dash' not in skip_protocols:
2987 formats.extend(self._extract_mpd_formats(
044eeb14 2988 manifest_url('manifest.mpd'),
0384932e 2989 video_id, mpd_id='dash', fatal=False))
6ad02195 2990 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
2991 if 'smil' not in skip_protocols:
2992 rtmp_formats = self._extract_smil_formats(
044eeb14 2993 manifest_url('jwplayer.smil'),
6ad02195
RA
2994 video_id, fatal=False)
2995 for rtmp_format in rtmp_formats:
2996 rtsp_format = rtmp_format.copy()
2997 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2998 del rtsp_format['play_path']
2999 del rtsp_format['ext']
3000 rtsp_format.update({
3001 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3002 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3003 'protocol': 'rtsp',
3004 })
3005 formats.extend([rtmp_format, rtsp_format])
3006 else:
3007 for protocol in ('rtmp', 'rtsp'):
3008 if protocol not in skip_protocols:
3009 formats.append({
f2e2f0c7 3010 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
3011 'format_id': protocol,
3012 'protocol': protocol,
3013 })
3014 return formats
3015
c73e330e 3016 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 3017 mobj = re.search(
ac9c69ac 3018 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
3019 webpage)
3020 if mobj:
c73e330e
RU
3021 try:
3022 jwplayer_data = self._parse_json(mobj.group('options'),
3023 video_id=video_id,
3024 transform_source=transform_source)
3025 except ExtractorError:
3026 pass
3027 else:
3028 if isinstance(jwplayer_data, dict):
3029 return jwplayer_data
a4a554a7
YCH
3030
3031 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
3032 jwplayer_data = self._find_jwplayer_data(
3033 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
3034 return self._parse_jwplayer_data(
3035 jwplayer_data, video_id, *args, **kwargs)
3036
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Turn JWPlayer setup data into an info dict (single entry) or a
        playlist result (multiple entries).

        Normalizes the three historical JWPlayer config shapes (flattened
        config, single playlist item, flattened sources) before parsing.
        Raises KeyError if require_title is true and an item has no 'title'.
        NOTE(review): the base_url parameter shadows the module-level
        base_url() helper imported at the top of the file.
        """
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            # Fall back to JWPlayer's own media id when no id was supplied
            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks keyed by label (default 'en')
            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # A single YouTube source is delegated to the YouTube extractor
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
3104
ed0cf9b3
S
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """
        Convert a JWPlayer 'sources' list into yt-dlp format dicts.

        Dispatches each source by its MIME type/extension: HLS, DASH and
        SMIL manifests are expanded via the dedicated extractors; audio-only
        and progressive/RTMP sources become single formats. Duplicate URLs
        are dropped.
        """
        urls = []  # URLs already emitted, used for de-duplication
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # Split the RTMP URL into app URL and play path at the
                    # container prefix (mp4:/mp3:/flv:).
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
3168
f4b1c7ad
PH
3169 def _live_title(self, name):
3170 """ Generate the title for a live video """
3171 now = datetime.datetime.now()
611c1dd9 3172 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3173 return name + ' ' + now_str
3174
b14f3a4c
PH
3175 def _int(self, v, name, fatal=False, **kwargs):
3176 res = int_or_none(v, **kwargs)
3177 if 'get_attr' in kwargs:
3178 print(getattr(v, kwargs['get_attr']))
3179 if res is None:
3180 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3181 if fatal:
3182 raise ExtractorError(msg)
3183 else:
3184 self._downloader.report_warning(msg)
3185 return res
3186
3187 def _float(self, v, name, fatal=False, **kwargs):
3188 res = float_or_none(v, **kwargs)
3189 if res is None:
3190 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3191 if fatal:
3192 raise ExtractorError(msg)
3193 else:
3194 self._downloader.report_warning(msg)
3195 return res
3196
40e41780
TF
3197 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3198 path='/', secure=False, discard=False, rest={}, **kwargs):
6c22cee6 3199 cookie = compat_cookiejar_Cookie(
4ed2d7b7 3200 0, name, value, port, port is not None, domain, True,
40e41780
TF
3201 domain.startswith('.'), path, True, secure, expire_time,
3202 discard, None, None, rest)
42939b61
JMF
3203 self._downloader.cookiejar.set_cookie(cookie)
3204
    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        # Let the cookie jar fill in the Cookie header for a dummy request,
        # then parse that header back into a SimpleCookie.
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
3210
e3c1266f 3211 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3212 """
3213 Apply first Set-Cookie header instead of the last. Experimental.
3214
3215 Some sites (e.g. [1-3]) may serve two cookies under the same name
3216 in Set-Cookie header and expect the first (old) one to be set rather
3217 than second (new). However, as of RFC6265 the newer one cookie
3218 should be set into cookie store what actually happens.
3219 We will workaround this issue by resetting the cookie to
3220 the first one manually.
3221 1. https://new.vk.com/
3222 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3223 3. https://learning.oreilly.com/
3224 """
e3c1266f
S
3225 for header, cookies in url_handle.headers.items():
3226 if header.lower() != 'set-cookie':
3227 continue
3228 if sys.version_info[0] >= 3:
3229 cookies = cookies.encode('iso-8859-1')
3230 cookies = cookies.decode('utf-8')
3231 cookie_value = re.search(
3232 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3233 if cookie_value:
3234 value, domain = cookie_value.groups()
3235 self._set_cookie(domain, cookie, value)
3236 break
3237
05900629
PH
3238 def get_testcases(self, include_onlymatching=False):
3239 t = getattr(self, '_TEST', None)
3240 if t:
3241 assert not hasattr(self, '_TESTS'), \
3242 '%s has _TEST and _TESTS' % type(self).__name__
3243 tests = [t]
3244 else:
3245 tests = getattr(self, '_TESTS', [])
3246 for t in tests:
3247 if not include_onlymatching and t.get('only_matching', False):
3248 continue
3249 t['name'] = type(self).__name__[:-len('IE')]
3250 yield t
3251
3252 def is_suitable(self, age_limit):
3253 """ Test whether the extractor is generally suitable for the given
3254 age limit (i.e. pornographic sites are not, all others usually are) """
3255
3256 any_restricted = False
3257 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3258 if tc.get('playlist', []):
05900629
PH
3259 tc = tc['playlist'][0]
3260 is_restricted = age_restricted(
3261 tc.get('info_dict', {}).get('age_limit'), age_limit)
3262 if not is_restricted:
3263 return True
3264 any_restricted = any_restricted or is_restricted
3265 return not any_restricted
3266
a504ced0 3267 def extract_subtitles(self, *args, **kwargs):
3089bc74
S
3268 if (self._downloader.params.get('writesubtitles', False)
3269 or self._downloader.params.get('listsubtitles')):
9868ea49
JMF
3270 return self._get_subtitles(*args, **kwargs)
3271 return {}
a504ced0
JMF
3272
    def _get_subtitles(self, *args, **kwargs):
        # Meant to be overridden by subclasses that support subtitles.
        raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3275
912e0b7e
YCH
3276 @staticmethod
3277 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3278 """ Merge subtitle items for one language. Items with duplicated URLs
3279 will be dropped. """
3280 list1_urls = set([item['url'] for item in subtitle_list1])
3281 ret = list(subtitle_list1)
3282 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3283 return ret
3284
3285 @classmethod
8c97f819 3286 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 3287 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
3288 ret = dict(subtitle_dict1)
3289 for lang in subtitle_dict2:
8c97f819 3290 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
3291 return ret
3292
360e1ca5 3293 def extract_automatic_captions(self, *args, **kwargs):
3089bc74
S
3294 if (self._downloader.params.get('writeautomaticsub', False)
3295 or self._downloader.params.get('listsubtitles')):
9868ea49
JMF
3296 return self._get_automatic_captions(*args, **kwargs)
3297 return {}
360e1ca5
JMF
3298
    def _get_automatic_captions(self, *args, **kwargs):
        # Meant to be overridden by subclasses that support automatic captions.
        raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3301
d77ab8e2 3302 def mark_watched(self, *args, **kwargs):
3089bc74
S
3303 if (self._downloader.params.get('mark_watched', False)
3304 and (self._get_login_info()[0] is not None
3305 or self._downloader.params.get('cookiefile') is not None)):
d77ab8e2
S
3306 self._mark_watched(*args, **kwargs)
3307
    def _mark_watched(self, *args, **kwargs):
        # Meant to be overridden by subclasses that support marking as watched.
        raise NotImplementedError('This method must be implemented by subclasses')
3310
38cce791
YCH
3311 def geo_verification_headers(self):
3312 headers = {}
3313 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3314 if geo_verification_proxy:
3315 headers['Ytdl-request-proxy'] = geo_verification_proxy
3316 return headers
3317
98763ee3
YCH
3318 def _generic_id(self, url):
3319 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3320
3321 def _generic_title(self, url):
3322 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3323
8dbe9899 3324
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (one result), a positive result count, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3335
3336 @classmethod
3337 def suitable(cls, url):
3338 return re.match(cls._make_valid_url(), url) is not None
3339
3340 def _real_extract(self, query):
3341 mobj = re.match(self._make_valid_url(), query)
3342 if mobj is None:
f1a9d64e 3343 raise ExtractorError('Invalid search query "%s"' % query)
d6983cb4
PH
3344
3345 prefix = mobj.group('prefix')
3346 query = mobj.group('query')
3347 if prefix == '':
3348 return self._get_n_results(query, 1)
3349 elif prefix == 'all':
3350 return self._get_n_results(query, self._MAX_RESULTS)
3351 else:
3352 n = int(prefix)
3353 if n <= 0:
f1a9d64e 3354 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 3355 elif n > self._MAX_RESULTS:
f1a9d64e 3356 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
3357 n = self._MAX_RESULTS
3358 return self._get_n_results(query, n)
3359
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Must be overridden by concrete search extractors.
        raise NotImplementedError('This method must be implemented by subclasses')
0f818663
PH
3363
    @property
    def SEARCH_KEY(self):
        # Public read-only accessor for the class-level _SEARCH_KEY.
        return self._SEARCH_KEY