# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader import FileDownloader
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
89 """Information Extractor class.
90
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
5d380852 95 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
98
cf0649f8 99 The type field determines the type of the result.
fed5d032
PH
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
102
103 For a video, the dictionaries must include the following fields:
d6983cb4
PH
104
105 id: Video identifier.
d6983cb4 106 title: Video title, unescaped.
d67b0b15 107
f49d89ee 108 Additionally, it must contain either a formats entry or a url one:
d67b0b15 109
f49d89ee
PH
110 formats: A list of dictionaries for each format available, ordered
111 from worst to best quality.
112
113 Potential fields:
c790e93a
S
114 * url The mandatory URL representing the media:
115 for plain file media - HTTP URL of this file,
116 for RTMP - RTMP URL,
117 for HLS - URL of the M3U8 media playlist,
118 for HDS - URL of the F4M manifest,
79d2077e
S
119 for DASH
120 - HTTP URL to plain file media (in case of
121 unfragmented media)
122 - URL of the MPD manifest or base URL
123 representing the media if MPD manifest
8ed7a233 124 is parsed from a string (in case of
79d2077e 125 fragmented media)
c790e93a 126 for MSS - URL of the ISM manifest.
86f4d14f
S
127 * manifest_url
128 The URL of the manifest file in case of
c790e93a
S
129 fragmented media:
130 for HLS - URL of the M3U8 master playlist,
131 for HDS - URL of the F4M manifest,
132 for DASH - URL of the MPD manifest,
133 for MSS - URL of the ISM manifest.
10952eb2 134 * ext Will be calculated from URL if missing
d67b0b15
PH
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height.
138 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to youtube-dlc it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return compat_str(m.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))
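    # Illustrative sketch only (not part of the original source): an extractor
    # that only learns its geo information during extraction could call the
    # method above manually. The country code and CIDR block are hypothetical.
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['DE'],
    #         'ip_blocks': ['5.9.0.0/16'],
    #     })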

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        if isinstance(expected_status, compat_integer_types):
            return err.code == expected_status
        elif isinstance(expected_status, (list, tuple)):
            return err.code in expected_status
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content
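    # Illustrative sketch only (not part of the original source): downloading a
    # page from a site that deliberately answers 404 for unavailable items,
    # without treating that status as a hard error. The URL is hypothetical.
    #
    #     webpage = self._download_webpage(
    #         'https://example.com/api/video/%s' % video_id, video_id,
    #         note='Downloading video page', expected_status=404)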

    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as a compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as a compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except compat_xml_parse_error as ve:
            errmsg = '%s: Failed to parse XML ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        return res if res is False else res[0]

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        urls = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urls, playlist_id=playlist_id, playlist_title=playlist_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        video_info.update(kwargs)
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description is not None:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res
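    # Illustrative sketch only (not part of the original source): typical use
    # of _search_regex in a subclass. The page markup matched here is
    # hypothetical.
    #
    #     title = self._search_regex(
    #         r'<h1[^>]+class="title"[^>]*>([^<]+)</h1>', webpage, 'title',
    #         default=None)
    #     video_id = self._search_regex(
    #         r'data-video-id=["\'](?P<id>\d+)', webpage, 'video id',
    #         group='id', fatal=False)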

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or
        _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)
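    # Illustrative sketch only (not part of the original source): how the
    # OpenGraph and <meta> helpers above are typically combined in a subclass,
    # falling back from one source of metadata to another.
    #
    #     title = self._og_search_title(webpage, default=None) \
    #         or self._html_search_meta('twitter:title', webpage, 'title')
    #     description = self._og_search_description(webpage) \
    #         or self._html_search_meta('description', webpage, default=None)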

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
        default = kwargs.get('default', NO_DEFAULT)
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        json_ld = []
        for mobj in json_ld_list:
            json_ld_item = self._parse_json(
                mobj.group('json_ld'), video_id, fatal=fatal)
            if not json_ld_item:
                continue
            if isinstance(json_ld_item, dict):
                json_ld.append(json_ld_item)
            elif isinstance(json_ld_item, (list, tuple)):
                json_ld.extend(json_ld_item)
        if json_ld:
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
        if json_ld:
            return json_ld
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                'uploader': str_or_none(e.get('author')),
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)

        for e in json_ld:
            if '@context' in e:
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        return dict((k, v) for k, v in info.items() if v is not None)
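    # Illustrative sketch only (not part of the original source): merging
    # JSON-LD metadata extracted by the helper above into an extractor's
    # result dict.
    #
    #     info = self._search_json_ld(
    #         webpage, video_id, expected_type='VideoObject', default={})
    #     info.update({
    #         'id': video_id,
    #         'title': info.get('title') or self._og_search_title(webpage),
    #     })
    #     return info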
1345
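# Illustrative sketch (not part of the original source): roughly what
# extract_video_object() above contributes for a minimal VideoObject; the URL
# and values are hypothetical.
#   {"@context": "https://schema.org", "@type": "VideoObject",
#    "name": "Sample clip", "uploadDate": "2020-05-01T12:00:00+00:00",
#    "contentUrl": "https://example.com/clip.mp4", "duration": "PT1M30S"}
#   -> {'title': 'Sample clip', 'url': 'https://example.com/clip.mp4',
#       'timestamp': 1588334400, 'duration': 90}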
27713812 1346 @staticmethod
f8da79f8 1347 def _hidden_inputs(html):
586f1cc5 1348 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1349 hidden_inputs = {}
c8498368
S
1350 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1351 if not input:
1352 continue
201ea3ee 1353 attrs = extract_attributes(input)
c8498368 1354 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1355 continue
c8498368
S
1356 name = attrs.get('name') or attrs.get('id')
1357 value = attrs.get('value')
1358 if name and value is not None:
1359 hidden_inputs[name] = value
201ea3ee 1360 return hidden_inputs
27713812 1361
cf61d96d
S
1362 def _form_hidden_inputs(self, form_id, html):
1363 form = self._search_regex(
73eb13df 1364 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1365 html, '%s form' % form_id, group='form')
1366 return self._hidden_inputs(form)
1367
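# Illustrative sketch (not part of the original source): how an extractor might
# use the two helpers above; the form id and field names are hypothetical.
#   webpage = ('<form id="login"><input type="hidden" name="csrf" value="abc123">'
#              '<input type="text" name="user"></form>')
#   self._form_hidden_inputs('login', webpage)  # -> {'csrf': 'abc123'}
# (the type="text" input is dropped because only hidden/submit inputs are kept)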
eb8a4433 1368 class FormatSort:
1369 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1370
c10d0213 1371 default = ('hidden', 'hasvid', 'ie_pref', 'lang', 'quality',
155d2b48 1372 'res', 'fps', 'codec:vp9.2', 'size', 'br', 'asr',
c10d0213 1373 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
eb8a4433 1374
1375 settings = {
1376 'vcodec': {'type': 'ordered', 'regex': True,
155d2b48 1377 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
eb8a4433 1378 'acodec': {'type': 'ordered', 'regex': True,
1379 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
f137c99e 1380 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
63be1aab 1381 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', 'm3u8', '.*dash', '', 'mms|rtsp', 'none', 'f4']},
eb8a4433 1382 'vext': {'type': 'ordered', 'field': 'video_ext',
91ebc640 1383 'order': ('mp4', 'webm', 'flv', '', 'none'),
eb8a4433 1384 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1385 'aext': {'type': 'ordered', 'field': 'audio_ext',
1386 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1387 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
1388 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
f983b875 1389 'ie_pref': {'priority': True, 'type': 'extractor'},
63be1aab 1390 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1391 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
f983b875 1392 'lang': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1393 'quality': {'convert': 'float_none', 'type': 'extractor'},
eb8a4433 1394 'filesize': {'convert': 'bytes'},
f137c99e 1395 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1396 'id': {'convert': 'string', 'field': 'format_id'},
eb8a4433 1397 'height': {'convert': 'float_none'},
1398 'width': {'convert': 'float_none'},
1399 'fps': {'convert': 'float_none'},
1400 'tbr': {'convert': 'float_none'},
1401 'vbr': {'convert': 'float_none'},
1402 'abr': {'convert': 'float_none'},
1403 'asr': {'convert': 'float_none'},
f983b875 1404 'source': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
63be1aab 1405
eb8a4433 1406 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
63be1aab 1407 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1408 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1409 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1410 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1411
1412 # Most of these exist only for compatibility reasons
1413 'dimension': {'type': 'alias', 'field': 'res'},
1414 'resolution': {'type': 'alias', 'field': 'res'},
1415 'extension': {'type': 'alias', 'field': 'ext'},
1416 'bitrate': {'type': 'alias', 'field': 'br'},
eb8a4433 1417 'total_bitrate': {'type': 'alias', 'field': 'tbr'},
1418 'video_bitrate': {'type': 'alias', 'field': 'vbr'},
1419 'audio_bitrate': {'type': 'alias', 'field': 'abr'},
1420 'framerate': {'type': 'alias', 'field': 'fps'},
63be1aab 1421 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists
1422 'protocol': {'type': 'alias', 'field': 'proto'},
1423 'source_preference': {'type': 'alias', 'field': 'source'},
1424 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1425 'filesize_estimate': {'type': 'alias', 'field': 'size'},
eb8a4433 1426 'samplerate': {'type': 'alias', 'field': 'asr'},
1427 'video_ext': {'type': 'alias', 'field': 'vext'},
1428 'audio_ext': {'type': 'alias', 'field': 'aext'},
1429 'video_codec': {'type': 'alias', 'field': 'vcodec'},
1430 'audio_codec': {'type': 'alias', 'field': 'acodec'},
63be1aab 1431 'video': {'type': 'alias', 'field': 'hasvid'},
1432 'has_video': {'type': 'alias', 'field': 'hasvid'},
1433 'audio': {'type': 'alias', 'field': 'hasaud'},
1434 'has_audio': {'type': 'alias', 'field': 'hasaud'},
1435 'extractor': {'type': 'alias', 'field': 'ie_pref'},
1436 'preference': {'type': 'alias', 'field': 'ie_pref'},
1437 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
1438 'format_id': {'type': 'alias', 'field': 'id'},
1439 }
eb8a4433 1440
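# Illustrative sketch (not part of the original source): a few sort strings this
# table accepts, roughly as interpreted by evaluate_params() below.
#   'res:1080'  - prefer the largest resolution that does not exceed 1080
#   'br~3000'   - prefer the bitrate closest to 3000 ('~' enables proximity)
#   '+size'     - prefer the smallest size ('+' reverses the order)
#   'ext'       - combined field, resolved via the 'vext'/'aext' tables above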
1441 _order = []
1442
1443 def _get_field_setting(self, field, key):
1444 if field not in self.settings:
1445 self.settings[field] = {}
1446 propObj = self.settings[field]
1447 if key not in propObj:
1448 type = propObj.get('type')
1449 if key == 'field':
1450 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1451 elif key == 'convert':
1452 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
4bcc7bd1 1453 else:
eb8a4433 1454 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1455 propObj[key] = default
1456 return propObj[key]
1457
1458 def _resolve_field_value(self, field, value, convertNone=False):
1459 if value is None:
1460 if not convertNone:
1461 return None
4bcc7bd1 1462 else:
eb8a4433 1463 value = value.lower()
1464 conversion = self._get_field_setting(field, 'convert')
1465 if conversion == 'ignore':
1466 return None
1467 if conversion == 'string':
1468 return value
1469 elif conversion == 'float_none':
1470 return float_or_none(value)
1471 elif conversion == 'bytes':
1472 return FileDownloader.parse_bytes(value)
1473 elif conversion == 'order':
da9be05e 1474 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
eb8a4433 1475 use_regex = self._get_field_setting(field, 'regex')
1476 list_length = len(order_list)
1477 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1478 if use_regex and value is not None:
da9be05e 1479 for i, regex in enumerate(order_list):
eb8a4433 1480 if regex and re.match(regex, value):
1481 return list_length - i
1482 return list_length - empty_pos # not in list
1483 else: # not regex or value = None
1484 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1485 else:
1486 if value.isnumeric():
1487 return float(value)
4bcc7bd1 1488 else:
eb8a4433 1489 self.settings[field]['convert'] = 'string'
1490 return value
1491
1492 def evaluate_params(self, params, sort_extractor):
1493 self._use_free_order = params.get('prefer_free_formats', False)
1494 self._sort_user = params.get('format_sort', [])
1495 self._sort_extractor = sort_extractor
1496
1497 def add_item(field, reverse, closest, limit_text):
1498 field = field.lower()
1499 if field in self._order:
1500 return
1501 self._order.append(field)
1502 limit = self._resolve_field_value(field, limit_text)
1503 data = {
1504 'reverse': reverse,
1505 'closest': False if limit is None else closest,
1506 'limit_text': limit_text,
1507 'limit': limit}
1508 if field in self.settings:
1509 self.settings[field].update(data)
1510 else:
1511 self.settings[field] = data
1512
1513 sort_list = (
1514 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
1515 + (tuple() if params.get('format_sort_force', False)
1516 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
1517 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1518
1519 for item in sort_list:
1520 match = re.match(self.regex, item)
1521 if match is None:
1522 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
1523 field = match.group('field')
1524 if field is None:
1525 continue
1526 if self._get_field_setting(field, 'type') == 'alias':
1527 field = self._get_field_setting(field, 'field')
1528 reverse = match.group('reverse') is not None
1529 closest = match.group('separator') == '~'
1530 limit_text = match.group('limit')
1531
1532 has_limit = limit_text is not None
1533 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
1534 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
1535
1536 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
1537 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1538 limit_count = len(limits)
1539 for (i, f) in enumerate(fields):
1540 add_item(f, reverse, closest,
1541 limits[i] if i < limit_count
1542 else limits[0] if has_limit and not has_multiple_limits
1543 else None)
1544
1545 def print_verbose_info(self, to_screen):
1546 to_screen('[debug] Sort order given by user: %s' % ','.join(self._sort_user))
1547 if self._sort_extractor:
f983b875 1548 to_screen('[debug] Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
eb8a4433 1549 to_screen('[debug] Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1550 '+' if self._get_field_setting(field, 'reverse') else '', field,
1551 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1552 self._get_field_setting(field, 'limit_text'),
1553 self._get_field_setting(field, 'limit'))
1554 if self._get_field_setting(field, 'limit_text') is not None else '')
1555 for field in self._order if self._get_field_setting(field, 'visible')]))
1556
1557 def _calculate_field_preference_from_value(self, format, field, type, value):
1558 reverse = self._get_field_setting(field, 'reverse')
1559 closest = self._get_field_setting(field, 'closest')
1560 limit = self._get_field_setting(field, 'limit')
1561
1562 if type == 'extractor':
1563 maximum = self._get_field_setting(field, 'max')
1564 if value is None or (maximum is not None and value >= maximum):
f983b875 1565 value = -1
eb8a4433 1566 elif type == 'boolean':
1567 in_list = self._get_field_setting(field, 'in_list')
1568 not_in_list = self._get_field_setting(field, 'not_in_list')
1569 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1570 elif type == 'ordered':
1571 value = self._resolve_field_value(field, value, True)
1572
1573 # try to convert to number
1574 val_num = float_or_none(value)
1575 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1576 if is_num:
1577 value = val_num
1578
1579 return ((-10, 0) if value is None
1580 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1581 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1582 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1583 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1584 else (-1, value, 0))
1585
1586 def _calculate_field_preference(self, format, field):
1587 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1588 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1589 if type == 'multiple':
1590 type = 'field' # Only 'field' is allowed in multiple for now
1591 actual_fields = self._get_field_setting(field, 'field')
1592
1593 def wrapped_function(values):
1594 values = tuple(filter(lambda x: x is not None, values))
1595 return (self._get_field_setting(field, 'function')(*values) if len(values) > 1
1596 else values[0] if values
1597 else None)
1598
1599 value = wrapped_function((get_value(f) for f in actual_fields))
1600 else:
1601 value = get_value(field)
1602 return self._calculate_field_preference_from_value(format, field, type, value)
1603
1604 def calculate_preference(self, format):
1605 # Determine missing protocol
1606 if not format.get('protocol'):
1607 format['protocol'] = determine_protocol(format)
1608
1609 # Determine missing ext
1610 if not format.get('ext') and 'url' in format:
1611 format['ext'] = determine_ext(format['url'])
1612 if format.get('vcodec') == 'none':
1613 format['audio_ext'] = format['ext']
1614 format['video_ext'] = 'none'
1615 else:
1616 format['video_ext'] = format['ext']
1617 format['audio_ext'] = 'none'
1618 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1619 # format['preference'] = -1000
1620
1621 # Determine missing bitrates
1622 if format.get('tbr') is None:
1623 if format.get('vbr') is not None and format.get('abr') is not None:
1624 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1625 else:
1626 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1627 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1628 if format.get('acodec') != 'none' and format.get('abr') is None:
1629 format['abr'] = format.get('tbr') - format.get('vbr', 0)
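# Illustrative numbers (not from the original source): with vbr=1500 and
# abr=128 the block above derives tbr=1628; conversely, with tbr=1628 and
# abr=128 it derives vbr=1500.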
1630
1631 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1632
1633 def _sort_formats(self, formats, field_preference=[]):
1634 if not formats:
1635 raise ExtractorError('No video formats found')
1636 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1637 format_sort.evaluate_params(self._downloader.params, field_preference)
1638 if self._downloader.params.get('verbose', False):
1639 format_sort.print_verbose_info(self._downloader.to_screen)
1640 formats.sort(key=lambda f: format_sort.calculate_preference(f))
59040888 1641
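# Illustrative sketch (not part of the original source): extractors typically
# finish with one of
#   self._sort_formats(formats)                    # default ordering
#   self._sort_formats(formats, ('res', 'proto'))  # extractor-supplied preference
# before returning the info dict.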
96a53167
S
1642 def _check_formats(self, formats, video_id):
1643 if formats:
1644 formats[:] = filter(
1645 lambda f: self._is_valid_url(
1646 f['url'], video_id,
1647 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1648 formats)
1649
f5bdb444
S
1650 @staticmethod
1651 def _remove_duplicate_formats(formats):
1652 format_urls = set()
1653 unique_formats = []
1654 for f in formats:
1655 if f['url'] not in format_urls:
1656 format_urls.add(f['url'])
1657 unique_formats.append(f)
1658 formats[:] = unique_formats
1659
45024183 1660 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1661 url = self._proto_relative_url(url, scheme='http:')
1662 # For now assume non HTTP(S) URLs always valid
1663 if not (url.startswith('http://') or url.startswith('https://')):
1664 return True
96a53167 1665 try:
45024183 1666 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167 1667 return True
8bdd16b4 1668 except ExtractorError as e:
25e911a9 1669 self.to_screen(
8bdd16b4 1670 '%s: %s URL is invalid, skipping: %s'
1671 % (video_id, item, error_to_compat_str(e.cause)))
25e911a9 1672 return False
96a53167 1673
20991253 1674 def http_scheme(self):
1ede5b24 1675 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1676 return (
1677 'http:'
1678 if self._downloader.params.get('prefer_insecure', False)
1679 else 'https:')
1680
57c7411f
PH
1681 def _proto_relative_url(self, url, scheme=None):
1682 if url is None:
1683 return url
1684 if url.startswith('//'):
1685 if scheme is None:
1686 scheme = self.http_scheme()
1687 return scheme + url
1688 else:
1689 return url
1690
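# Illustrative sketch (not part of the original source):
#   self._proto_relative_url('//example.com/video.mp4')
#   -> 'https://example.com/video.mp4' (or 'http://...' with --prefer-insecure)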
4094b6e3
PH
1691 def _sleep(self, timeout, video_id, msg_template=None):
1692 if msg_template is None:
f1a9d64e 1693 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1694 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1695 self.to_screen(msg)
1696 time.sleep(timeout)
1697
f983b875 1698 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
4de61310 1699 transform_source=lambda s: fix_xml_ampersands(s).strip(),
7360c06f 1700 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
f036a632
JMF
1701 manifest = self._download_xml(
1702 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1703 'Unable to download f4m manifest',
1704 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
067aa17e 1705 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
4de61310 1706 transform_source=transform_source,
7360c06f 1707 fatal=fatal, data=data, headers=headers, query=query)
4de61310
S
1708
1709 if manifest is False:
8d29e47f 1710 return []
31bb8d3f 1711
0fdbb332 1712 return self._parse_f4m_formats(
f983b875 1713 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
448bb5f3 1714 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332 1715
f983b875 1716 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
0fdbb332 1717 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1718 fatal=True, m3u8_id=None):
ee0ba927 1719 if not isinstance(manifest, compat_etree_Element) and not fatal:
d9eb580a
S
1720 return []
1721
cefecac1 1722 # currently youtube-dlc cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
fb72ec58 1723 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1724 if akamai_pv is not None and ';' in akamai_pv.text:
1725 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1726 if playerVerificationChallenge.strip() != '':
1727 return []
1728
31bb8d3f 1729 formats = []
7a47d07c 1730 manifest_version = '1.0'
b2527359 1731 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1732 if not media_nodes:
7a47d07c 1733 manifest_version = '2.0'
34e48bed 1734 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762 1735 # Remove unsupported DRM-protected media renditions from the final
067aa17e 1736 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
b22ca762
S
1737 media_nodes = remove_encrypted_media(media_nodes)
1738 if not media_nodes:
1739 return formats
48107c19
S
1740
1741 manifest_base_url = get_base_url(manifest)
0a5685b2 1742
a6571f10 1743 bootstrap_info = xpath_element(
0a5685b2
YCH
1744 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1745 'bootstrap info', default=None)
1746
edd6074c
RA
1747 vcodec = None
1748 mime_type = xpath_text(
1749 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1750 'MIME type', default=None)
1751 if mime_type and mime_type.startswith('audio/'):
1752 vcodec = 'none'
1753
b2527359 1754 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1755 tbr = int_or_none(media_el.attrib.get('bitrate'))
1756 width = int_or_none(media_el.attrib.get('width'))
1757 height = int_or_none(media_el.attrib.get('height'))
1758 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
448bb5f3
YCH
1759 # If <bootstrapInfo> is present, the specified f4m is a
1760 # stream-level manifest, and only set-level manifests may refer to
1761 # external resources. See section 11.4 and section 4 of F4M spec
1762 if bootstrap_info is None:
1763 media_url = None
1764 # @href is introduced in 2.0, see section 11.6 of F4M spec
1765 if manifest_version == '2.0':
1766 media_url = media_el.attrib.get('href')
1767 if media_url is None:
1768 media_url = media_el.attrib.get('url')
31c746e5
S
1769 if not media_url:
1770 continue
cc357c4d
S
1771 manifest_url = (
1772 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 1773 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
1774 # If media_url is itself a f4m manifest do the recursive extraction
1775 # since bitrates in parent manifest (this one) and media_url manifest
1776 # may differ leading to inability to resolve the format by requested
1777 # bitrate in f4m downloader
240b6045
YCH
1778 ext = determine_ext(manifest_url)
1779 if ext == 'f4m':
77b8b4e6 1780 f4m_formats = self._extract_f4m_formats(
f983b875 1781 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
77b8b4e6
S
1782 transform_source=transform_source, fatal=fatal)
1783 # Sometimes stream-level manifest contains single media entry that
1784 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1785 # At the same time parent's media entry in set-level manifest may
1786 # contain it. We will copy it from parent in such cases.
1787 if len(f4m_formats) == 1:
1788 f = f4m_formats[0]
1789 f.update({
1790 'tbr': f.get('tbr') or tbr,
1791 'width': f.get('width') or width,
1792 'height': f.get('height') or height,
1793 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 1794 'vcodec': vcodec,
77b8b4e6
S
1795 })
1796 formats.extend(f4m_formats)
70f0f5a8 1797 continue
240b6045
YCH
1798 elif ext == 'm3u8':
1799 formats.extend(self._extract_m3u8_formats(
1800 manifest_url, video_id, 'mp4', preference=preference,
f983b875 1801 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
240b6045 1802 continue
31bb8d3f 1803 formats.append({
77b8b4e6 1804 'format_id': format_id,
31bb8d3f 1805 'url': manifest_url,
30d0b549 1806 'manifest_url': manifest_url,
a6571f10 1807 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 1808 'protocol': 'f4m',
b2527359 1809 'tbr': tbr,
77b8b4e6
S
1810 'width': width,
1811 'height': height,
edd6074c 1812 'vcodec': vcodec,
60ca389c 1813 'preference': preference,
f983b875 1814 'quality': quality,
31bb8d3f 1815 })
31bb8d3f
JMF
1816 return formats
1817
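# Illustrative sketch (not part of the original source): a typical HDS call;
# the manifest URL is hypothetical.
#   formats.extend(self._extract_f4m_formats(
#       'https://example.com/stream/manifest.f4m?hdcore=3.2.0', video_id,
#       f4m_id='hds', fatal=False))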
f983b875 1818 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
16da9bbc 1819 return {
f207019c 1820 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1821 'url': m3u8_url,
1822 'ext': ext,
1823 'protocol': 'm3u8',
37768f92 1824 'preference': preference - 100 if preference else -100,
f983b875 1825 'quality': quality,
704df56d
PH
1826 'resolution': 'multiple',
1827 'format_note': 'Quality selection URL',
16da9bbc
YCH
1828 }
1829
1830 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
f983b875 1831 entry_protocol='m3u8', preference=None, quality=None,
16da9bbc 1832 m3u8_id=None, note=None, errnote=None,
7360c06f
S
1833 fatal=True, live=False, data=None, headers={},
1834 query={}):
dbd82a1d 1835 res = self._download_webpage_handle(
81515ad9 1836 m3u8_url, video_id,
621ed9f5 1837 note=note or 'Downloading m3u8 information',
13af92fd 1838 errnote=errnote or 'Failed to download m3u8 information',
7360c06f 1839 fatal=fatal, data=data, headers=headers, query=query)
cb252080 1840
dbd82a1d 1841 if res is False:
8d29e47f 1842 return []
cb252080 1843
dbd82a1d 1844 m3u8_doc, urlh = res
37113045 1845 m3u8_url = urlh.geturl()
9cdffeeb 1846
cb252080
S
1847 return self._parse_m3u8_formats(
1848 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
f983b875 1849 preference=preference, quality=quality, m3u8_id=m3u8_id, live=live)
cb252080
S
1850
1851 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
f983b875 1852 entry_protocol='m3u8', preference=None, quality=None,
cb252080 1853 m3u8_id=None, live=False):
08a00eef
RA
1854 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1855 return []
1856
ea229584
RA
1857 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1858 return []
1859
ff99fe52 1860 formats = []
0def7587
RA
1861
1862 format_url = lambda u: (
1863 u
1864 if re.match(r'^https?://', u)
1865 else compat_urlparse.urljoin(m3u8_url, u))
1866
cb252080
S
1867 # References:
1868 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
067aa17e
S
1869 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1870 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
cb252080
S
1871
1872 # We should try extracting formats only from master playlists [1, 4.3.4],
1873 # i.e. playlists that describe available qualities. On the other hand,
1874 # media playlists [1, 4.3.3] should be returned as is since they contain
1875 # just the media without quality renditions.
9cdffeeb 1876 # Fortunately, a master playlist can easily be distinguished from a media
cb252080 1877 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
a0566bbf 1878 # master playlist tags MUST NOT appear in a media playlist and vice versa.
cb252080
S
1879 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
1880 # media playlist and MUST NOT appear in a master playlist, thus we can
1881 # reliably detect a media playlist with this criterion.
1882
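# Illustrative sketch (not part of the original source): the tags that tell the
# two playlist kinds apart.
#   master playlist:  #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   media playlist:   #EXT-X-TARGETDURATION:10  (followed by #EXTINF segments)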
9cdffeeb 1883 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
7f32e5dc 1884 return [{
1885 'url': m3u8_url,
1886 'format_id': m3u8_id,
1887 'ext': ext,
1888 'protocol': entry_protocol,
1889 'preference': preference,
f983b875 1890 'quality': quality,
7f32e5dc 1891 }]
cb252080
S
1892
1893 groups = {}
1894 last_stream_inf = {}
1895
1896 def extract_media(x_media_line):
1897 media = parse_m3u8_attributes(x_media_line)
1898 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1899 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1900 if not (media_type and group_id and name):
1901 return
1902 groups.setdefault(group_id, []).append(media)
1903 if media_type not in ('VIDEO', 'AUDIO'):
1904 return
1905 media_url = media.get('URI')
1906 if media_url:
1907 format_id = []
9211e331 1908 for v in (m3u8_id, group_id, name):
cb252080
S
1909 if v:
1910 format_id.append(v)
1911 f = {
1912 'format_id': '-'.join(format_id),
1913 'url': format_url(media_url),
c89b49f7 1914 'manifest_url': m3u8_url,
cb252080
S
1915 'language': media.get('LANGUAGE'),
1916 'ext': ext,
1917 'protocol': entry_protocol,
1918 'preference': preference,
f983b875 1919 'quality': quality,
cb252080
S
1920 }
1921 if media_type == 'AUDIO':
1922 f['vcodec'] = 'none'
1923 formats.append(f)
1924
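# Illustrative sketch (not part of the original source): an audio rendition as
# consumed by extract_media(); group id, name and URI are made up.
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",LANGUAGE="en",URI="a/en.m3u8"
# yields a format with format_id like 'hls-aud-English' (assuming m3u8_id='hls'),
# language 'en' and vcodec 'none'.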
1925 def build_stream_name():
1926 # Although the specification does not mention the NAME attribute for the
3019cb0c
S
1927 # EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
1928 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 1929 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
1930 stream_name = last_stream_inf.get('NAME')
1931 if stream_name:
1932 return stream_name
1933 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1934 # from corresponding rendition group
1935 stream_group_id = last_stream_inf.get('VIDEO')
1936 if not stream_group_id:
1937 return
1938 stream_group = groups.get(stream_group_id)
1939 if not stream_group:
1940 return stream_group_id
1941 rendition = stream_group[0]
1942 return rendition.get('NAME') or stream_group_id
1943
379306ef 1944 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2bfc1d9d
RA
1945 # chance to detect video-only formats when EXT-X-STREAM-INF tags
1946 # precede EXT-X-MEDIA tags in an HLS manifest such as [3].
1947 for line in m3u8_doc.splitlines():
1948 if line.startswith('#EXT-X-MEDIA:'):
1949 extract_media(line)
1950
704df56d
PH
1951 for line in m3u8_doc.splitlines():
1952 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 1953 last_stream_inf = parse_m3u8_attributes(line)
704df56d
PH
1954 elif line.startswith('#') or not line.strip():
1955 continue
1956 else:
9c99bef7 1957 tbr = float_or_none(
3089bc74
S
1958 last_stream_inf.get('AVERAGE-BANDWIDTH')
1959 or last_stream_inf.get('BANDWIDTH'), scale=1000)
8dc9d361
S
1960 format_id = []
1961 if m3u8_id:
1962 format_id.append(m3u8_id)
cb252080 1963 stream_name = build_stream_name()
b24d6336
KH
1964 # Bandwidth of live streams may differ over time thus making
1965 # format_id unpredictable. So it's better to keep provided
1966 # format_id intact.
e9c6cdf4 1967 if not live:
ed56f260 1968 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
30d0b549 1969 manifest_url = format_url(line.strip())
704df56d 1970 f = {
8dc9d361 1971 'format_id': '-'.join(format_id),
30d0b549 1972 'url': manifest_url,
ff99fe52 1973 'manifest_url': m3u8_url,
704df56d
PH
1974 'tbr': tbr,
1975 'ext': ext,
cb252080 1976 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
f0b5d6af
PH
1977 'protocol': entry_protocol,
1978 'preference': preference,
f983b875 1979 'quality': quality,
704df56d 1980 }
cb252080 1981 resolution = last_stream_inf.get('RESOLUTION')
704df56d 1982 if resolution:
c4c9b844
S
1983 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1984 if mobj:
1985 f['width'] = int(mobj.group('width'))
1986 f['height'] = int(mobj.group('height'))
00f4764c
RA
1987 # Unified Streaming Platform
1988 mobj = re.search(
1989 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1990 if mobj:
1991 abr, vbr = mobj.groups()
1992 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
fbb6edd2 1993 f.update({
00f4764c
RA
1994 'vbr': vbr,
1995 'abr': abr,
fbb6edd2 1996 })
cb252080
S
1997 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1998 f.update(codecs)
1999 audio_group_id = last_stream_inf.get('AUDIO')
2000 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2001 # references a rendition group MUST have a CODECS attribute.
2002 # However, this is not always respected, for example, [2]
2003 # contains EXT-X-STREAM-INF tag which references AUDIO
2004 # rendition group but does not have CODECS and despite
0e7b8d3e
S
2005 # referencing an audio group it represents a complete
2006 # (with audio and video) format. So, for such cases we will
2007 # ignore references to rendition groups and treat them
cb252080
S
2008 # as complete formats.
2009 if audio_group_id and codecs and f.get('vcodec') != 'none':
2010 audio_group = groups.get(audio_group_id)
2011 if audio_group and audio_group[0].get('URI'):
2012 # TODO: update acodec for audio only formats with
2013 # the same GROUP-ID
2014 f['acodec'] = 'none'
704df56d 2015 formats.append(f)
5ef62fc4
RA
2016
2017 # for DailyMotion
2018 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2019 if progressive_uri:
2020 http_f = f.copy()
2021 del http_f['manifest_url']
2022 http_f.update({
2023 'format_id': f['format_id'].replace('hls-', 'http-'),
2024 'protocol': 'http',
2025 'url': progressive_uri,
2026 })
2027 formats.append(http_f)
2028
cb252080 2029 last_stream_inf = {}
704df56d
PH
2030 return formats
2031
a107193e
S
2032 @staticmethod
2033 def _xpath_ns(path, namespace=None):
2034 if not namespace:
2035 return path
2036 out = []
2037 for c in path.split('/'):
2038 if not c or c == '.':
2039 out.append(c)
2040 else:
2041 out.append('{%s}%s' % (namespace, c))
2042 return '/'.join(out)
2043
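# Illustrative sketch (not part of the original source):
#   _xpath_ns('./head/meta', 'http://www.w3.org/2001/SMIL20/Language')
#   -> './{http://www.w3.org/2001/SMIL20/Language}head/{http://www.w3.org/2001/SMIL20/Language}meta'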
09f572fb 2044 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2045 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 2046
995029a1
PH
2047 if smil is False:
2048 assert not fatal
2049 return []
e89a2aab 2050
17712eeb 2051 namespace = self._parse_smil_namespace(smil)
a107193e
S
2052
2053 return self._parse_smil_formats(
2054 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2055
2056 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2057 smil = self._download_smil(smil_url, video_id, fatal=fatal)
2058 if smil is False:
2059 return {}
2060 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2061
09f572fb 2062 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
2063 return self._download_xml(
2064 smil_url, video_id, 'Downloading SMIL file',
09f572fb 2065 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
2066
2067 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 2068 namespace = self._parse_smil_namespace(smil)
a107193e
S
2069
2070 formats = self._parse_smil_formats(
2071 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2072 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2073
2074 video_id = os.path.splitext(url_basename(smil_url))[0]
2075 title = None
2076 description = None
647eab45 2077 upload_date = None
a107193e
S
2078 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2079 name = meta.attrib.get('name')
2080 content = meta.attrib.get('content')
2081 if not name or not content:
2082 continue
2083 if not title and name == 'title':
2084 title = content
2085 elif not description and name in ('description', 'abstract'):
2086 description = content
647eab45
S
2087 elif not upload_date and name == 'date':
2088 upload_date = unified_strdate(content)
a107193e 2089
1e5bcdec
S
2090 thumbnails = [{
2091 'id': image.get('type'),
2092 'url': image.get('src'),
2093 'width': int_or_none(image.get('width')),
2094 'height': int_or_none(image.get('height')),
2095 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2096
a107193e
S
2097 return {
2098 'id': video_id,
2099 'title': title or video_id,
2100 'description': description,
647eab45 2101 'upload_date': upload_date,
1e5bcdec 2102 'thumbnails': thumbnails,
a107193e
S
2103 'formats': formats,
2104 'subtitles': subtitles,
2105 }
2106
17712eeb
S
2107 def _parse_smil_namespace(self, smil):
2108 return self._search_regex(
2109 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2110
f877c6ae 2111 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
2112 base = smil_url
2113 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2114 b = meta.get('base') or meta.get('httpBase')
2115 if b:
2116 base = b
2117 break
e89a2aab
S
2118
2119 formats = []
2120 rtmp_count = 0
a107193e 2121 http_count = 0
7f32e5dc 2122 m3u8_count = 0
a107193e 2123
81e1c4e2 2124 srcs = []
ad96b4c8
YCH
2125 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2126 for medium in media:
2127 src = medium.get('src')
81e1c4e2 2128 if not src or src in srcs:
a107193e 2129 continue
81e1c4e2 2130 srcs.append(src)
a107193e 2131
ad96b4c8
YCH
2132 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2133 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2134 width = int_or_none(medium.get('width'))
2135 height = int_or_none(medium.get('height'))
2136 proto = medium.get('proto')
2137 ext = medium.get('ext')
a107193e 2138 src_ext = determine_ext(src)
ad96b4c8 2139 streamer = medium.get('streamer') or base
a107193e
S
2140
2141 if proto == 'rtmp' or streamer.startswith('rtmp'):
2142 rtmp_count += 1
2143 formats.append({
2144 'url': streamer,
2145 'play_path': src,
2146 'ext': 'flv',
2147 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2148 'tbr': bitrate,
2149 'filesize': filesize,
2150 'width': width,
2151 'height': height,
2152 })
f877c6ae
YCH
2153 if transform_rtmp_url:
2154 streamer, src = transform_rtmp_url(streamer, src)
2155 formats[-1].update({
2156 'url': streamer,
2157 'play_path': src,
2158 })
a107193e
S
2159 continue
2160
2161 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 2162 src_url = src_url.strip()
a107193e
S
2163
2164 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 2165 m3u8_formats = self._extract_m3u8_formats(
2166 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2167 if len(m3u8_formats) == 1:
2168 m3u8_count += 1
2169 m3u8_formats[0].update({
2170 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2171 'tbr': bitrate,
2172 'width': width,
2173 'height': height,
2174 })
2175 formats.extend(m3u8_formats)
bd21ead2 2176 elif src_ext == 'f4m':
a107193e
S
2177 f4m_url = src_url
2178 if not f4m_params:
2179 f4m_params = {
2180 'hdcore': '3.2.0',
2181 'plugin': 'flowplayer-3.2.0.1',
2182 }
2183 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 2184 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 2185 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
bd21ead2
RA
2186 elif src_ext == 'mpd':
2187 formats.extend(self._extract_mpd_formats(
2188 src_url, video_id, mpd_id='dash', fatal=False))
2189 elif re.search(r'\.ism/[Mm]anifest', src_url):
2190 formats.extend(self._extract_ism_formats(
2191 src_url, video_id, ism_id='mss', fatal=False))
2192 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
2193 http_count += 1
2194 formats.append({
2195 'url': src_url,
2196 'ext': ext or src_ext or 'flv',
2197 'format_id': 'http-%d' % (bitrate or http_count),
2198 'tbr': bitrate,
2199 'filesize': filesize,
2200 'width': width,
2201 'height': height,
2202 })
63757032 2203
e89a2aab
S
2204 return formats
2205
ce00af87 2206 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 2207 urls = []
a107193e
S
2208 subtitles = {}
2209 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2210 src = textstream.get('src')
d413095f 2211 if not src or src in urls:
a107193e 2212 continue
d413095f 2213 urls.append(src)
df634be2 2214 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 2215 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
2216 subtitles.setdefault(lang, []).append({
2217 'url': src,
2218 'ext': ext,
2219 })
2220 return subtitles
63757032 2221
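# Illustrative sketch (not part of the original source): a SMIL <textstream>
# entry and the subtitle record it becomes; the URL is made up.
#   <textstream src="https://example.com/subs/en.srt" systemLanguage="en"/>
#   -> subtitles['en'] = [{'url': 'https://example.com/subs/en.srt', 'ext': 'srt'}]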
47a5cb77 2222 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 2223 xspf = self._download_xml(
47a5cb77 2224 xspf_url, playlist_id, 'Downloading xspf playlist',
942acef5
S
2225 'Unable to download xspf manifest', fatal=fatal)
2226 if xspf is False:
2227 return []
47a5cb77
S
2228 return self._parse_xspf(
2229 xspf, playlist_id, xspf_url=xspf_url,
2230 xspf_base_url=base_url(xspf_url))
8d6765cf 2231
47a5cb77 2232 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
2233 NS_MAP = {
2234 'xspf': 'http://xspf.org/ns/0/',
2235 's1': 'http://static.streamone.nl/player/ns/0',
2236 }
2237
2238 entries = []
47a5cb77 2239 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 2240 title = xpath_text(
98044462 2241 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
2242 description = xpath_text(
2243 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2244 thumbnail = xpath_text(
2245 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2246 duration = float_or_none(
2247 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2248
47a5cb77
S
2249 formats = []
2250 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2251 format_url = urljoin(xspf_base_url, location.text)
2252 if not format_url:
2253 continue
2254 formats.append({
2255 'url': format_url,
2256 'manifest_url': xspf_url,
2257 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2258 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2259 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2260 })
8d6765cf
S
2261 self._sort_formats(formats)
2262
2263 entries.append({
2264 'id': playlist_id,
2265 'title': title,
2266 'description': description,
2267 'thumbnail': thumbnail,
2268 'duration': duration,
2269 'formats': formats,
2270 })
2271 return entries
2272
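# Illustrative sketch (not part of the original source): a typical call; the
# playlist URL is hypothetical.
#   entries = self._extract_xspf_playlist(
#       'https://example.com/player/playlist.xspf', playlist_id, fatal=False)
#   return self.playlist_result(entries, playlist_id)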
545cc85d 2273 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2274 res = self._download_xml_handle(
1bac3455 2275 mpd_url, video_id,
2276 note=note or 'Downloading MPD manifest',
2277 errnote=errnote or 'Failed to download MPD manifest',
7360c06f 2278 fatal=fatal, data=data, headers=headers, query=query)
1bac3455 2279 if res is False:
2d2fa82d 2280 return []
47a5cb77 2281 mpd_doc, urlh = res
c25720ef
RA
2282 if mpd_doc is None:
2283 return []
02dc0a36 2284 mpd_base_url = base_url(urlh.geturl())
1bac3455 2285
91cb6b50 2286 return self._parse_mpd_formats(
545cc85d 2287 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2d2fa82d 2288
545cc85d 2289 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
f0948348
S
2290 """
2291 Parse formats from MPD manifest.
2292 References:
2293 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2294 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2295 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2296 """
78895bd3
U
2297 if not self._downloader.params.get('dynamic_mpd'):
2298 if mpd_doc.get('type') == 'dynamic':
2299 return []
2d2fa82d 2300
91cb6b50 2301 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 2302
2303 def _add_ns(path):
2304 return self._xpath_ns(path, namespace)
2305
675d0016 2306 def is_drm_protected(element):
2307 return element.find(_add_ns('ContentProtection')) is not None
2308
1bac3455 2309 def extract_multisegment_info(element, ms_parent_info):
2310 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2311
2312 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2313 # common attributes and elements. We will only extract relevant
2314 # for us.
2315 def extract_common(source):
2316 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2317 if segment_timeline is not None:
2318 s_e = segment_timeline.findall(_add_ns('S'))
2319 if s_e:
2320 ms_info['total_number'] = 0
2321 ms_info['s'] = []
2322 for s in s_e:
2323 r = int(s.get('r', 0))
2324 ms_info['total_number'] += 1 + r
2325 ms_info['s'].append({
2326 't': int(s.get('t', 0)),
2327 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2328 'd': int(s.attrib['d']),
2329 'r': r,
2330 })
2331 start_number = source.get('startNumber')
2332 if start_number:
2333 ms_info['start_number'] = int(start_number)
2334 timescale = source.get('timescale')
2335 if timescale:
2336 ms_info['timescale'] = int(timescale)
2337 segment_duration = source.get('duration')
2338 if segment_duration:
48504785 2339 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2340
2341 def extract_Initialization(source):
2342 initialization = source.find(_add_ns('Initialization'))
2343 if initialization is not None:
2344 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2345
f14be228 2346 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2347 if segment_list is not None:
b4c1d6e8
S
2348 extract_common(segment_list)
2349 extract_Initialization(segment_list)
f14be228 2350 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2351 if segment_urls_e:
2352 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2353 else:
f14be228 2354 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2355 if segment_template is not None:
b4c1d6e8 2356 extract_common(segment_template)
e228616c
S
2357 media = segment_template.get('media')
2358 if media:
2359 ms_info['media'] = media
1bac3455 2360 initialization = segment_template.get('initialization')
2361 if initialization:
e228616c 2362 ms_info['initialization'] = initialization
1bac3455 2363 else:
b4c1d6e8 2364 extract_Initialization(segment_template)
1bac3455 2365 return ms_info
b323e170 2366
06869367 2367 skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')
63ad4d43 2368
1bac3455 2369 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 2370 formats = []
f14be228 2371 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2372 period_duration = parse_duration(period.get('duration')) or mpd_duration
2373 period_ms_info = extract_multisegment_info(period, {
2374 'start_number': 1,
2375 'timescale': 1,
2376 })
f14be228 2377 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
06869367 2378 if skip_unplayable and is_drm_protected(adaptation_set):
675d0016 2379 continue
1bac3455 2380 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2381 for representation in adaptation_set.findall(_add_ns('Representation')):
06869367 2382 if skip_unplayable and is_drm_protected(representation):
675d0016 2383 continue
1bac3455 2384 representation_attrib = adaptation_set.attrib.copy()
2385 representation_attrib.update(representation.attrib)
f0948348 2386 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759
YCH
2387 mime_type = representation_attrib['mimeType']
2388 content_type = mime_type.split('/')[0]
1bac3455 2389 if content_type == 'text':
2390 # TODO implement WebVTT downloading
2391 pass
40fcba5e 2392 elif content_type in ('video', 'audio'):
1bac3455 2393 base_url = ''
2394 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 2395 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 2396 if base_url_e is not None:
2397 base_url = base_url_e.text + base_url
2398 if re.match(r'^https?://', base_url):
2399 break
bb20526b
S
2400 if mpd_base_url and not re.match(r'^https?://', base_url):
2401 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2402 mpd_base_url += '/'
1bac3455 2403 base_url = mpd_base_url + base_url
2404 representation_id = representation_attrib.get('id')
d577c796 2405 lang = representation_attrib.get('lang')
51e9094f 2406 url_el = representation.find(_add_ns('BaseURL'))
2407 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2408 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1bac3455 2409 f = {
154c209e 2410 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
86f4d14f 2411 'manifest_url': mpd_url,
a6c8b759 2412 'ext': mimetype2ext(mime_type),
1bac3455 2413 'width': int_or_none(representation_attrib.get('width')),
2414 'height': int_or_none(representation_attrib.get('height')),
9c99bef7 2415 'tbr': float_or_none(bandwidth, 1000),
1bac3455 2416 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2417 'fps': int_or_none(representation_attrib.get('frameRate')),
d577c796 2418 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1bac3455 2419 'format_note': 'DASH %s' % content_type,
51e9094f 2420 'filesize': filesize,
126f225b 2421 'container': mimetype2ext(mime_type) + '_dash',
1bac3455 2422 }
7fe15920 2423 f.update(parse_codecs(representation_attrib.get('codecs')))
1bac3455 2424 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2425
e228616c 2426 def prepare_template(template_name, identifiers):
eca1f0d1
S
2427 tmpl = representation_ms_info[template_name]
2428 # First of all, % characters outside $...$ templates
2429 # must be escaped by doubling for proper processing
2430 # by % operator string formatting used further (see
067aa17e 2431 # https://github.com/ytdl-org/youtube-dl/issues/16867).
eca1f0d1
S
2432 t = ''
2433 in_template = False
2434 for c in tmpl:
2435 t += c
2436 if c == '$':
2437 in_template = not in_template
2438 elif c == '%' and not in_template:
2439 t += c
2440 # Next, $...$ templates are translated to their
2441 # %(...) counterparts to be used with % operator
e228616c
S
2442 t = t.replace('$RepresentationID$', representation_id)
2443 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2444 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2445 t = t.replace('$$', '$')
2446 return t
2447
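# Illustrative sketch (not part of the original source): how prepare_template()
# rewrites common DASH templates, assuming representation_id == 'video_1' and
# identifiers == ('Number', 'Bandwidth', 'Time'):
#   'seg-$RepresentationID$-$Number%05d$.m4s' -> 'seg-video_1-%(Number)05d.m4s'
#   'chunk_$Bandwidth$_$Time$.m4v'            -> 'chunk_%(Bandwidth)d_%(Time)d.m4v'
# The result is later filled in with the % operator, once per fragment.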
2448 # @initialization is a regular template like @media one
2449 # so it should be handled just the same way (see
067aa17e 2450 # https://github.com/ytdl-org/youtube-dl/issues/11605)
e228616c
S
2451 if 'initialization' in representation_ms_info:
2452 initialization_template = prepare_template(
2453 'initialization',
2454 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2455 # $Time$ shall not be included for @initialization thus
2456 # only $Bandwidth$ remains
2457 ('Bandwidth', ))
2458 representation_ms_info['initialization_url'] = initialization_template % {
2459 'Bandwidth': bandwidth,
2460 }
2461
1141e910
S
2462 def location_key(location):
2463 return 'url' if re.match(r'^https?://', location) else 'path'
2464
e228616c
S
2465 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2466
2467 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2468 media_location_key = location_key(media_template)
f0948348
S
2469
2470 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2471 # can't be used at the same time
b4c1d6e8
S
2472 if '%(Number' in media_template and 's' not in representation_ms_info:
2473 segment_duration = None
c110944f 2474 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2475 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2476 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2477 representation_ms_info['fragments'] = [{
1141e910 2478 media_location_key: media_template % {
b4c1d6e8 2479 'Number': segment_number,
e228616c 2480 'Bandwidth': bandwidth,
b4c1d6e8
S
2481 },
2482 'duration': segment_duration,
2483 } for segment_number in range(
2484 representation_ms_info['start_number'],
2485 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2486 else:
b4c1d6e8
S
2487 # $Number*$ or $Time$ in media template with S list available
2488 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2489 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
b4c1d6e8 2490 representation_ms_info['fragments'] = []
f0948348 2491 segment_time = 0
b4c1d6e8
S
2492 segment_d = None
2493 segment_number = representation_ms_info['start_number']
f0948348
S
2494
2495 def add_segment_url():
b4c1d6e8
S
2496 segment_url = media_template % {
2497 'Time': segment_time,
e228616c 2498 'Bandwidth': bandwidth,
b4c1d6e8
S
2499 'Number': segment_number,
2500 }
b4c1d6e8 2501 representation_ms_info['fragments'].append({
1141e910 2502 media_location_key: segment_url,
b4c1d6e8
S
2503 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2504 })
f0948348
S
2505
2506 for num, s in enumerate(representation_ms_info['s']):
2507 segment_time = s.get('t') or segment_time
b4c1d6e8 2508 segment_d = s['d']
f0948348 2509 add_segment_url()
b4c1d6e8 2510 segment_number += 1
f0948348 2511 for r in range(s.get('r', 0)):
b4c1d6e8 2512 segment_time += segment_d
f0948348 2513 add_segment_url()
b4c1d6e8
S
2514 segment_number += 1
2515 segment_time += segment_d
2516 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2517 # No media template
2518 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2519 # or any YouTube dashsegments video
2520 fragments = []
d04621da
S
2521 segment_index = 0
2522 timescale = representation_ms_info['timescale']
2523 for s in representation_ms_info['s']:
2524 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2525 for r in range(s.get('r', 0) + 1):
1141e910 2526 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2527 fragments.append({
1141e910 2528 location_key(segment_uri): segment_uri,
d04621da 2529 'duration': duration,
b4c1d6e8 2530 })
d04621da 2531 segment_index += 1
b4c1d6e8 2532 representation_ms_info['fragments'] = fragments
41bf647e
PN
2533 elif 'segment_urls' in representation_ms_info:
2534 # Segment URLs with no SegmentTimeline
2535 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
067aa17e 2536 # https://github.com/ytdl-org/youtube-dl/pull/14844
41bf647e 2537 fragments = []
603fc4e0
S
2538 segment_duration = float_or_none(
2539 representation_ms_info['segment_duration'],
2540 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2541 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2542 fragment = {
41bf647e 2543 location_key(segment_url): segment_url,
603fc4e0
S
2544 }
2545 if segment_duration:
2546 fragment['duration'] = segment_duration
2547 fragments.append(fragment)
41bf647e 2548 representation_ms_info['fragments'] = fragments
79d2077e
S
2549 # If there is a fragments key available then we correctly recognized fragmented media.
2550 # Otherwise we will assume unfragmented media with direct access. Technically, such
2551 # assumption is not necessarily correct since we may simply have no support for
2552 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
86f4d14f 2553 if 'fragments' in representation_ms_info:
1bac3455 2554 f.update({
79d2077e
S
2555 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2556 'url': mpd_url or base_url,
1141e910 2557 'fragment_base_url': base_url,
b4c1d6e8 2558 'fragments': [],
1bac3455 2559 'protocol': 'http_dash_segments',
df374b52 2560 })
1bac3455 2561 if 'initialization_url' in representation_ms_info:
e228616c 2562 initialization_url = representation_ms_info['initialization_url']
1bac3455 2563 if not f.get('url'):
2564 f['url'] = initialization_url
1141e910 2565 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2566 f['fragments'].extend(representation_ms_info['fragments'])
79d2077e
S
2567 else:
2568 # Assuming direct URL to unfragmented media.
2569 f['url'] = base_url
545cc85d 2570 formats.append(f)
17b598d3 2571 else:
1bac3455 2572 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
17b598d3
YCH
2573 return formats
2574
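# Illustrative sketch (not part of the original source): a typical DASH call;
# the manifest URL is hypothetical.
#   formats.extend(self._extract_mpd_formats(
#       'https://example.com/stream.mpd', video_id, mpd_id='dash', fatal=False))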
7360c06f 2575 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
47a5cb77 2576 res = self._download_xml_handle(
b2758123
RA
2577 ism_url, video_id,
2578 note=note or 'Downloading ISM manifest',
2579 errnote=errnote or 'Failed to download ISM manifest',
7360c06f 2580 fatal=fatal, data=data, headers=headers, query=query)
b2758123
RA
2581 if res is False:
2582 return []
47a5cb77 2583 ism_doc, urlh = res
13b08034
S
2584 if ism_doc is None:
2585 return []
b2758123 2586
7947a1f7 2587 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
b2758123
RA
2588
2589 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2590 """
2591 Parse formats from ISM manifest.
2592 References:
2593 1. [MS-SSTR]: Smooth Streaming Protocol,
2594 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2595 """
06869367 2596 if ism_doc.get('IsLive') == 'TRUE':
2597 return []
2598 if (not self._downloader.params.get('allow_unplayable_formats')
2599 and ism_doc.find('Protection') is not None):
b2758123
RA
2600 return []
2601
b2758123
RA
2602 duration = int(ism_doc.attrib['Duration'])
2603 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2604
2605 formats = []
2606 for stream in ism_doc.findall('StreamIndex'):
2607 stream_type = stream.get('Type')
2608 if stream_type not in ('video', 'audio'):
2609 continue
2610 url_pattern = stream.attrib['Url']
2611 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2612 stream_name = stream.get('Name')
2613 for track in stream.findall('QualityLevel'):
2501d41e 2614 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
b2758123
RA
2615 # TODO: add support for WVC1 and WMAP
2616 if fourcc not in ('H264', 'AVC1', 'AACL'):
2617 self.report_warning('%s is not a supported codec' % fourcc)
2618 continue
2619 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2620 # [1] does not mention Width and Height attributes. However,
2621 # they're often present while MaxWidth and MaxHeight are
2622 # missing, so should be used as fallbacks
2623 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2624 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2625 sampling_rate = int_or_none(track.get('SamplingRate'))
2626
2627 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2628 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2629
2630 fragments = []
2631 fragment_ctx = {
2632 'time': 0,
2633 }
2634 stream_fragments = stream.findall('c')
2635 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2636 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2637 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2638 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2639 if not fragment_ctx['duration']:
2640 try:
2641 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2642 except IndexError:
2643 next_fragment_time = duration
1616f9b4 2644 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2645 for _ in range(fragment_repeat):
2646 fragments.append({
1616f9b4 2647 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2648 'duration': fragment_ctx['duration'] / stream_timescale,
2649 })
2650 fragment_ctx['time'] += fragment_ctx['duration']
2651
2652 format_id = []
2653 if ism_id:
2654 format_id.append(ism_id)
2655 if stream_name:
2656 format_id.append(stream_name)
2657 format_id.append(compat_str(tbr))
2658
2659 formats.append({
2660 'format_id': '-'.join(format_id),
2661 'url': ism_url,
2662 'manifest_url': ism_url,
2663 'ext': 'ismv' if stream_type == 'video' else 'isma',
2664 'width': width,
2665 'height': height,
2666 'tbr': tbr,
2667 'asr': sampling_rate,
2668 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2669 'acodec': 'none' if stream_type == 'video' else fourcc,
2670 'protocol': 'ism',
2671 'fragments': fragments,
2672 '_download_params': {
2673 'duration': duration,
2674 'timescale': stream_timescale,
2675 'width': width or 0,
2676 'height': height or 0,
2677 'fourcc': fourcc,
2678 'codec_private_data': track.get('CodecPrivateData'),
2679 'sampling_rate': sampling_rate,
2680 'channels': int_or_none(track.get('Channels', 2)),
2681 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2682 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2683 },
2684 })
2685 return formats
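# Illustrative usage sketch (an assumption, not part of the original source):
# a typical caller inside an extractor's _real_extract might use the ISM
# helper roughly like this, with `ism_url` pointing at a Smooth Streaming
# manifest it has already discovered:
#
#     formats = self._extract_ism_formats(
#         ism_url, video_id, ism_id='mss', fatal=False)
#     self._sort_formats(formats)
#
# Each resulting dict carries 'protocol': 'ism' plus the fragment list and
# the '_download_params' consumed by the ISM downloader.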
2686
f983b875 2687 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
6780154e
S
2688 def absolute_url(item_url):
2689 return urljoin(base_url, item_url)
59bbe491 2690
2691 def parse_content_type(content_type):
2692 if not content_type:
2693 return {}
2694 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2695 if ctr:
2696 mimetype, codecs = ctr.groups()
2697 f = parse_codecs(codecs)
2698 f['ext'] = mimetype2ext(mimetype)
2699 return f
2700 return {}
2701
868f79db 2702 def _media_formats(src, cur_media_type, type_info={}):
520251c0 2703 full_url = absolute_url(src)
82889d4a 2704 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 2705 if ext == 'm3u8':
520251c0
YCH
2706 is_plain_url = False
2707 formats = self._extract_m3u8_formats(
ad120ae1 2708 full_url, video_id, ext='mp4',
eeb0a956 2709 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
f983b875 2710 preference=preference, quality=quality, fatal=False)
87a449c1
S
2711 elif ext == 'mpd':
2712 is_plain_url = False
2713 formats = self._extract_mpd_formats(
b359e977 2714 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
2715 else:
2716 is_plain_url = True
2717 formats = [{
2718 'url': full_url,
2719 'vcodec': 'none' if cur_media_type == 'audio' else None,
2720 }]
2721 return is_plain_url, formats
2722
59bbe491 2723 entries = []
4328ddf8
S
2724 # amp-video and amp-audio are very similar to their HTML5 counterparts
2725 # so we will include them right here (see
2726 # https://www.ampproject.org/docs/reference/components/amp-video)
29f7c58a 2727 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2728 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2729 media_tags = [(media_tag, media_tag_name, media_type, '')
2730 for media_tag, media_tag_name, media_type
2731 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2aec7256
S
2732 media_tags.extend(re.findall(
2733 # We only allow video|audio followed by whitespace or '>'.
2734 # Allowing more characters may result in a significant slowdown (see
067aa17e 2735 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2aec7256 2736 # http://www.porntrex.com/maps/videositemap.xml).
29f7c58a 2737 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2738 for media_tag, _, media_type, media_content in media_tags:
59bbe491 2739 media_info = {
2740 'formats': [],
2741 'subtitles': {},
2742 }
2743 media_attributes = extract_attributes(media_tag)
f856816b 2744 src = strip_or_none(media_attributes.get('src'))
59bbe491 2745 if src:
dedb1770 2746 _, formats = _media_formats(src, media_type)
520251c0 2747 media_info['formats'].extend(formats)
6780154e 2748 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 2749 if media_content:
2750 for source_tag in re.findall(r'<source[^>]+>', media_content):
d493f15c
S
2751 s_attr = extract_attributes(source_tag)
2752 # data-video-src and data-src are non-standard but seen
2753 # several times in the wild
f856816b 2754 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
59bbe491 2755 if not src:
2756 continue
d493f15c 2757 f = parse_content_type(s_attr.get('type'))
868f79db 2758 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 2759 if is_plain_url:
d493f15c
S
2760 # width, height, res, label and title attributes are
2761 # all non-standard but seen several times in the wild
2762 labels = [
2763 s_attr.get(lbl)
2764 for lbl in ('label', 'title')
2765 if str_or_none(s_attr.get(lbl))
2766 ]
2767 width = int_or_none(s_attr.get('width'))
3089bc74
S
2768 height = (int_or_none(s_attr.get('height'))
2769 or int_or_none(s_attr.get('res')))
d493f15c
S
2770 if not width or not height:
2771 for lbl in labels:
2772 resolution = parse_resolution(lbl)
2773 if not resolution:
2774 continue
2775 width = width or resolution.get('width')
2776 height = height or resolution.get('height')
2777 for lbl in labels:
2778 tbr = parse_bitrate(lbl)
2779 if tbr:
2780 break
2781 else:
2782 tbr = None
1ed45499 2783 f.update({
d493f15c
S
2784 'width': width,
2785 'height': height,
2786 'tbr': tbr,
2787 'format_id': s_attr.get('label') or s_attr.get('title'),
1ed45499 2788 })
520251c0
YCH
2789 f.update(formats[0])
2790 media_info['formats'].append(f)
2791 else:
2792 media_info['formats'].extend(formats)
59bbe491 2793 for track_tag in re.findall(r'<track[^>]+>', media_content):
2794 track_attributes = extract_attributes(track_tag)
2795 kind = track_attributes.get('kind')
5968d7d2 2796 if not kind or kind in ('subtitles', 'captions'):
f856816b 2797 src = strip_or_none(track_attributes.get('src'))
59bbe491 2798 if not src:
2799 continue
2800 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2801 media_info['subtitles'].setdefault(lang, []).append({
2802 'url': absolute_url(src),
2803 })
5e8e2fa5
S
2804 for f in media_info['formats']:
2805 f.setdefault('http_headers', {})['Referer'] = base_url
5968d7d2 2806 if media_info['formats'] or media_info['subtitles']:
59bbe491 2807 entries.append(media_info)
2808 return entries
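# Illustrative usage sketch (assumed names, not from the original file): a
# site extractor whose pages embed plain HTML5 <video>/<audio> tags could
# delegate to this helper from _real_extract, e.g.:
#
#     entries = self._parse_html5_media_entries(
#         url, webpage, video_id, m3u8_id='hls', mpd_id='dash')
#     if entries:
#         info = entries[0]
#         self._sort_formats(info['formats'])
#         info.update({'id': video_id, 'title': title})
#         return info
#
# where `url`, `webpage`, `video_id` and `title` are values the extractor
# already obtained; the helper only fills in formats, subtitles and thumbnail.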
2809
c4251b9a 2810 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
29f7c58a 2811 signed = 'hdnea=' in manifest_url
2812 if not signed:
2813 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2814 manifest_url = re.sub(
2815 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2816 '', manifest_url).strip('?')
2817
c7c43a93 2818 formats = []
70c5802b 2819
e71a4509 2820 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 2821 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
c4251b9a
RA
2822 hds_host = hosts.get('hds')
2823 if hds_host:
2824 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
2825 if 'hdcore=' not in f4m_url:
2826 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2827 f4m_formats = self._extract_f4m_formats(
2828 f4m_url, video_id, f4m_id='hds', fatal=False)
2829 for entry in f4m_formats:
2830 entry.update({'extra_param_to_segment_url': hdcore_sign})
2831 formats.extend(f4m_formats)
70c5802b 2832
c4251b9a
RA
2833 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2834 hls_host = hosts.get('hls')
2835 if hls_host:
2836 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
29f7c58a 2837 m3u8_formats = self._extract_m3u8_formats(
c7c43a93 2838 m3u8_url, video_id, 'mp4', 'm3u8_native',
29f7c58a 2839 m3u8_id='hls', fatal=False)
2840 formats.extend(m3u8_formats)
70c5802b 2841
2842 http_host = hosts.get('http')
29f7c58a 2843 if http_host and m3u8_formats and not signed:
2844 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
70c5802b 2845 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2846 qualities_length = len(qualities)
29f7c58a 2847 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
70c5802b 2848 i = 0
29f7c58a 2849 for f in m3u8_formats:
2850 if f['vcodec'] != 'none':
70c5802b 2851 for protocol in ('http', 'https'):
2852 http_f = f.copy()
2853 del http_f['manifest_url']
2854 http_url = re.sub(
29f7c58a 2855 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
70c5802b 2856 http_f.update({
2857 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2858 'url': http_url,
2859 'protocol': protocol,
2860 })
29f7c58a 2861 formats.append(http_f)
70c5802b 2862 i += 1
70c5802b 2863
c7c43a93
RA
2864 return formats
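# Hedged usage sketch (hostnames are made up): Akamai-hosted streams are
# usually handled by passing a single HLS/HDS manifest URL plus optional
# per-protocol host overrides, e.g.
#
#     formats = self._extract_akamai_formats(
#         manifest_url, video_id, hosts={'http': 'vod.example.akamaized.net'})
#
# The helper derives HDS, HLS and, when the manifest is not signed
# (no 'hdnea=' parameter), plain progressive HTTP variants from that one URL.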
2865
6ad02195 2866 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 2867 query = compat_urlparse.urlparse(url).query
6ad02195 2868 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
240f2622
S
2869 mobj = re.search(
2870 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2871 url_base = mobj.group('url')
2872 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 2873 formats = []
044eeb14
S
2874
2875 def manifest_url(manifest):
2876 m_url = '%s/%s' % (http_base_url, manifest)
2877 if query:
2878 m_url += '?%s' % query
2879 return m_url
2880
6ad02195
RA
2881 if 'm3u8' not in skip_protocols:
2882 formats.extend(self._extract_m3u8_formats(
044eeb14 2883 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
2884 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2885 if 'f4m' not in skip_protocols:
2886 formats.extend(self._extract_f4m_formats(
044eeb14 2887 manifest_url('manifest.f4m'),
6ad02195 2888 video_id, f4m_id='hds', fatal=False))
0384932e
RA
2889 if 'dash' not in skip_protocols:
2890 formats.extend(self._extract_mpd_formats(
044eeb14 2891 manifest_url('manifest.mpd'),
0384932e 2892 video_id, mpd_id='dash', fatal=False))
6ad02195 2893 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
2894 if 'smil' not in skip_protocols:
2895 rtmp_formats = self._extract_smil_formats(
044eeb14 2896 manifest_url('jwplayer.smil'),
6ad02195
RA
2897 video_id, fatal=False)
2898 for rtmp_format in rtmp_formats:
2899 rtsp_format = rtmp_format.copy()
2900 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2901 del rtsp_format['play_path']
2902 del rtsp_format['ext']
2903 rtsp_format.update({
2904 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2905 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2906 'protocol': 'rtsp',
2907 })
2908 formats.extend([rtmp_format, rtsp_format])
2909 else:
2910 for protocol in ('rtmp', 'rtsp'):
2911 if protocol not in skip_protocols:
2912 formats.append({
f2e2f0c7 2913 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
2914 'format_id': protocol,
2915 'protocol': protocol,
2916 })
2917 return formats
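# Illustrative sketch (assumed URL shape): for a Wowza Streaming Engine URL
# such as https://media.example.com/vod/mp4:clip.mp4/playlist.m3u8 an
# extractor could call
#
#     formats = self._extract_wowza_formats(
#         wowza_url, video_id, skip_protocols=['dash'])
#     self._sort_formats(formats)
#
# to collect the HLS, HDS, DASH, SMIL and RTMP/RTSP variants Wowza exposes,
# minus whatever protocols are listed in skip_protocols.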
2918
c73e330e 2919 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 2920 mobj = re.search(
ac9c69ac 2921 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
2922 webpage)
2923 if mobj:
c73e330e
RU
2924 try:
2925 jwplayer_data = self._parse_json(mobj.group('options'),
2926 video_id=video_id,
2927 transform_source=transform_source)
2928 except ExtractorError:
2929 pass
2930 else:
2931 if isinstance(jwplayer_data, dict):
2932 return jwplayer_data
a4a554a7
YCH
2933
2934 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
2935 jwplayer_data = self._find_jwplayer_data(
2936 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
2937 return self._parse_jwplayer_data(
2938 jwplayer_data, video_id, *args, **kwargs)
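# Illustrative sketch (not from the original file): a page that sets up
# JWPlayer via jwplayer("player").setup({...}) can typically be handled with
#
#     info = self._extract_jwplayer_data(
#         webpage, video_id, require_title=False, base_url=url)
#
# which locates the setup options, parses them (via js_to_json) and returns
# either a single info dict or a playlist result, as built by
# _parse_jwplayer_data below.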
2939
2940 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2941 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2942 # JWPlayer backward compatibility: flattened playlists
2943 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2944 if 'playlist' not in jwplayer_data:
2945 jwplayer_data = {'playlist': [jwplayer_data]}
2946
2947 entries = []
2948
2949 # JWPlayer backward compatibility: single playlist item
2950 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2951 if not isinstance(jwplayer_data['playlist'], list):
2952 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2953
2954 for video_data in jwplayer_data['playlist']:
2955 # JWPlayer backward compatibility: flattened sources
2956 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2957 if 'sources' not in video_data:
2958 video_data['sources'] = [video_data]
2959
2960 this_video_id = video_id or video_data['mediaid']
2961
1a2192cb
S
2962 formats = self._parse_jwplayer_formats(
2963 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2964 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
2965
2966 subtitles = {}
2967 tracks = video_data.get('tracks')
2968 if tracks and isinstance(tracks, list):
2969 for track in tracks:
96a2daa1
S
2970 if not isinstance(track, dict):
2971 continue
f4b74272
S
2972 track_kind = track.get('kind')
2973 if not track_kind or not isinstance(track_kind, compat_str):
2974 continue
2975 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
2976 continue
2977 track_url = urljoin(base_url, track.get('file'))
2978 if not track_url:
2979 continue
2980 subtitles.setdefault(track.get('label') or 'en', []).append({
2981 'url': self._proto_relative_url(track_url)
2982 })
2983
50d808f5 2984 entry = {
a4a554a7 2985 'id': this_video_id,
50d808f5 2986 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
f81dd65b 2987 'description': clean_html(video_data.get('description')),
6945b9e7 2988 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
a4a554a7
YCH
2989 'timestamp': int_or_none(video_data.get('pubdate')),
2990 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2991 'subtitles': subtitles,
50d808f5
RA
2992 }
2993 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2994 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2995 entry.update({
2996 '_type': 'url_transparent',
2997 'url': formats[0]['url'],
2998 })
2999 else:
3000 self._sort_formats(formats)
3001 entry['formats'] = formats
3002 entries.append(entry)
a4a554a7
YCH
3003 if len(entries) == 1:
3004 return entries[0]
3005 else:
3006 return self.playlist_result(entries)
3007
ed0cf9b3
S
3008 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3009 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
bf1b87cd 3010 urls = []
ed0cf9b3 3011 formats = []
1a2192cb 3012 for source in jwplayer_sources_data:
0a268c6e
S
3013 if not isinstance(source, dict):
3014 continue
6945b9e7
RA
3015 source_url = urljoin(
3016 base_url, self._proto_relative_url(source.get('file')))
3017 if not source_url or source_url in urls:
bf1b87cd
RA
3018 continue
3019 urls.append(source_url)
ed0cf9b3
S
3020 source_type = source.get('type') or ''
3021 ext = mimetype2ext(source_type) or determine_ext(source_url)
3022 if source_type == 'hls' or ext == 'm3u8':
3023 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
3024 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3025 m3u8_id=m3u8_id, fatal=False))
0d9c48de 3026 elif source_type == 'dash' or ext == 'mpd':
ed0cf9b3
S
3027 formats.extend(self._extract_mpd_formats(
3028 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
3029 elif ext == 'smil':
3030 formats.extend(self._extract_smil_formats(
3031 source_url, video_id, fatal=False))
ed0cf9b3 3032 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
3033 elif source_type.startswith('audio') or ext in (
3034 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
3035 formats.append({
3036 'url': source_url,
3037 'vcodec': 'none',
3038 'ext': ext,
3039 })
3040 else:
3041 height = int_or_none(source.get('height'))
3042 if height is None:
3043 # Often no height is provided but there is a label in
0236cd0d 3044 # a format like "1080p", "720p SD", or 1080.
ed0cf9b3 3045 height = int_or_none(self._search_regex(
0236cd0d 3046 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
ed0cf9b3
S
3047 'height', default=None))
3048 a_format = {
3049 'url': source_url,
3050 'width': int_or_none(source.get('width')),
3051 'height': height,
0236cd0d 3052 'tbr': int_or_none(source.get('bitrate')),
ed0cf9b3
S
3053 'ext': ext,
3054 }
3055 if source_url.startswith('rtmp'):
3056 a_format['ext'] = 'flv'
ed0cf9b3
S
3057 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3058 # of jwplayer.flash.swf
3059 rtmp_url_parts = re.split(
3060 r'((?:mp4|mp3|flv):)', source_url, 1)
3061 if len(rtmp_url_parts) == 3:
3062 rtmp_url, prefix, play_path = rtmp_url_parts
3063 a_format.update({
3064 'url': rtmp_url,
3065 'play_path': prefix + play_path,
3066 })
3067 if rtmp_params:
3068 a_format.update(rtmp_params)
3069 formats.append(a_format)
3070 return formats
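# Data-shape sketch (hypothetical values, for illustration only): the sources
# list accepted above commonly looks like
#
#     [{'file': 'https://cdn.example.com/master.m3u8', 'type': 'hls'},
#      {'file': 'https://cdn.example.com/clip-720.mp4', 'label': '720p',
#       'width': 1280, 'height': 720}]
#
# and would yield the HLS formats from the first source plus one progressive
# MP4 format from the second.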
3071
f4b1c7ad
PH
3072 def _live_title(self, name):
3073 """ Generate the title for a live video """
3074 now = datetime.datetime.now()
611c1dd9 3075 now_str = now.strftime('%Y-%m-%d %H:%M')
f4b1c7ad
PH
3076 return name + ' ' + now_str
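# Example (illustrative): _live_title('Channel stream') returns something
# like 'Channel stream 2021-01-01 12:00', i.e. the given name suffixed with
# the current local date and time.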
3077
b14f3a4c
PH
3078 def _int(self, v, name, fatal=False, **kwargs):
3079 res = int_or_none(v, **kwargs)
3080 if 'get_attr' in kwargs:
3081 print(getattr(v, kwargs['get_attr']))
3082 if res is None:
3083 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3084 if fatal:
3085 raise ExtractorError(msg)
3086 else:
3087 self._downloader.report_warning(msg)
3088 return res
3089
3090 def _float(self, v, name, fatal=False, **kwargs):
3091 res = float_or_none(v, **kwargs)
3092 if res is None:
3093 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
3094 if fatal:
3095 raise ExtractorError(msg)
3096 else:
3097 self._downloader.report_warning(msg)
3098 return res
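# Illustrative sketch (`meta` and `attrs` are hypothetical dicts): these
# wrappers are meant for values that are normally expected to parse, e.g.
#
#     duration = self._float(meta.get('duration'), 'duration')
#     width = self._int(attrs.get('width'), 'width', fatal=True)
#
# A failed parse is reported as a warning (and None is returned), or raised
# as an ExtractorError when fatal=True.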
3099
40e41780
TF
3100 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3101 path='/', secure=False, discard=False, rest={}, **kwargs):
6c22cee6 3102 cookie = compat_cookiejar_Cookie(
4ed2d7b7 3103 0, name, value, port, port is not None, domain, True,
40e41780
TF
3104 domain.startswith('.'), path, True, secure, expire_time,
3105 discard, None, None, rest)
42939b61
JMF
3106 self._downloader.cookiejar.set_cookie(cookie)
3107
799207e8 3108 def _get_cookies(self, url):
3109 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
5c2266df 3110 req = sanitized_Request(url)
799207e8 3111 self._downloader.cookiejar.add_cookie_header(req)
3112 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
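# Illustrative usage ('sessionid' is a made-up cookie name): reading a cookie
# value previously stored in the shared cookie jar:
#
#     cookies = self._get_cookies('https://www.example.com/')
#     morsel = cookies.get('sessionid')
#     session_id = morsel.value if morsel else None
#
# _set_cookie() above is the write-side counterpart operating on the same jar.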
3113
e3c1266f 3114 def _apply_first_set_cookie_header(self, url_handle, cookie):
ce2fe4c0
S
3115 """
3116 Apply first Set-Cookie header instead of the last. Experimental.
3117
3118 Some sites (e.g. [1-3]) may serve two cookies under the same name
3119 in the Set-Cookie header and expect the first (old) one to be set rather
3120 than the second (new) one. However, per RFC 6265 the newer cookie
3121 should be set into the cookie store, which is what actually happens.
3122 We work around this issue by resetting the cookie to
3123 the first one manually.
3124 1. https://new.vk.com/
3125 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3126 3. https://learning.oreilly.com/
3127 """
e3c1266f
S
3128 for header, cookies in url_handle.headers.items():
3129 if header.lower() != 'set-cookie':
3130 continue
3131 if sys.version_info[0] >= 3:
3132 cookies = cookies.encode('iso-8859-1')
3133 cookies = cookies.decode('utf-8')
3134 cookie_value = re.search(
3135 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3136 if cookie_value:
3137 value, domain = cookie_value.groups()
3138 self._set_cookie(domain, cookie, value)
3139 break
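# Illustrative sketch (assumed names): an extractor hitting such a site would
# apply this right after the initial request, e.g.
#
#     urlh = self._request_webpage(url, video_id)
#     self._apply_first_set_cookie_header(urlh, 'sessionid')
#
# where 'sessionid' stands for whatever cookie name the site duplicates.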
3140
05900629
PH
3141 def get_testcases(self, include_onlymatching=False):
3142 t = getattr(self, '_TEST', None)
3143 if t:
3144 assert not hasattr(self, '_TESTS'), \
3145 '%s has _TEST and _TESTS' % type(self).__name__
3146 tests = [t]
3147 else:
3148 tests = getattr(self, '_TESTS', [])
3149 for t in tests:
3150 if not include_onlymatching and t.get('only_matching', False):
3151 continue
3152 t['name'] = type(self).__name__[:-len('IE')]
3153 yield t
3154
3155 def is_suitable(self, age_limit):
3156 """ Test whether the extractor is generally suitable for the given
3157 age limit (i.e. pornographic sites are not, all others usually are) """
3158
3159 any_restricted = False
3160 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 3161 if tc.get('playlist', []):
05900629
PH
3162 tc = tc['playlist'][0]
3163 is_restricted = age_restricted(
3164 tc.get('info_dict', {}).get('age_limit'), age_limit)
3165 if not is_restricted:
3166 return True
3167 any_restricted = any_restricted or is_restricted
3168 return not any_restricted
3169
a504ced0 3170 def extract_subtitles(self, *args, **kwargs):
3089bc74
S
3171 if (self._downloader.params.get('writesubtitles', False)
3172 or self._downloader.params.get('listsubtitles')):
9868ea49
JMF
3173 return self._get_subtitles(*args, **kwargs)
3174 return {}
a504ced0
JMF
3175
3176 def _get_subtitles(self, *args, **kwargs):
611c1dd9 3177 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 3178
912e0b7e
YCH
3179 @staticmethod
3180 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3181 """ Merge subtitle items for one language. Items with duplicated URLs
3182 will be dropped. """
3183 list1_urls = set([item['url'] for item in subtitle_list1])
3184 ret = list(subtitle_list1)
3185 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3186 return ret
3187
3188 @classmethod
8c97f819 3189 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 3190 """ Merge two subtitle dictionaries, language by language. """
912e0b7e
YCH
3191 ret = dict(subtitle_dict1)
3192 for lang in subtitle_dict2:
8c97f819 3193 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
912e0b7e
YCH
3194 return ret
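# Worked example (illustrative): merging {'en': [{'url': 'a.vtt'}]} with
# {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]}
# yields {'en': [{'url': 'a.vtt'}, {'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]},
# since items with duplicated URLs within a language are dropped.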
3195
360e1ca5 3196 def extract_automatic_captions(self, *args, **kwargs):
3089bc74
S
3197 if (self._downloader.params.get('writeautomaticsub', False)
3198 or self._downloader.params.get('listsubtitles')):
9868ea49
JMF
3199 return self._get_automatic_captions(*args, **kwargs)
3200 return {}
360e1ca5
JMF
3201
3202 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 3203 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 3204
d77ab8e2 3205 def mark_watched(self, *args, **kwargs):
3089bc74
S
3206 if (self._downloader.params.get('mark_watched', False)
3207 and (self._get_login_info()[0] is not None
3208 or self._downloader.params.get('cookiefile') is not None)):
d77ab8e2
S
3209 self._mark_watched(*args, **kwargs)
3210
3211 def _mark_watched(self, *args, **kwargs):
3212 raise NotImplementedError('This method must be implemented by subclasses')
3213
38cce791
YCH
3214 def geo_verification_headers(self):
3215 headers = {}
3216 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3217 if geo_verification_proxy:
3218 headers['Ytdl-request-proxy'] = geo_verification_proxy
3219 return headers
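# Illustrative usage (an assumption, not from the original file): the returned
# headers are meant to be passed along with requests that are subject to geo
# checks, e.g.
#
#     webpage = self._download_webpage(
#         url, video_id, headers=self.geo_verification_headers())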
3220
98763ee3
YCH
3221 def _generic_id(self, url):
3222 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3223
3224 def _generic_title(self, url):
3225 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
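# Example (illustrative): for 'https://example.com/media/My%20Clip.mp4' both
# helpers strip the extension and the URL escaping from the last path
# component, yielding 'My Clip' as a fallback id/title.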
3226
8dbe9899 3227
d6983cb4
PH
3228class SearchInfoExtractor(InfoExtractor):
3229 """
3230 Base class for paged search queries extractors.
10952eb2 3231 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
d6983cb4
PH
3232 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3233 """
3234
3235 @classmethod
3236 def _make_valid_url(cls):
3237 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3238
3239 @classmethod
3240 def suitable(cls, url):
3241 return re.match(cls._make_valid_url(), url) is not None
3242
3243 def _real_extract(self, query):
3244 mobj = re.match(self._make_valid_url(), query)
3245 if mobj is None:
f1a9d64e 3246 raise ExtractorError('Invalid search query "%s"' % query)
d6983cb4
PH
3247
3248 prefix = mobj.group('prefix')
3249 query = mobj.group('query')
3250 if prefix == '':
3251 return self._get_n_results(query, 1)
3252 elif prefix == 'all':
3253 return self._get_n_results(query, self._MAX_RESULTS)
3254 else:
3255 n = int(prefix)
3256 if n <= 0:
f1a9d64e 3257 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 3258 elif n > self._MAX_RESULTS:
f1a9d64e 3259 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
d6983cb4
PH
3260 n = self._MAX_RESULTS
3261 return self._get_n_results(query, n)
3262
3263 def _get_n_results(self, query, n):
3264 """Get a specified number of results for a query"""
611c1dd9 3265 raise NotImplementedError('This method must be implemented by subclasses')
0f818663
PH
3266
3267 @property
3268 def SEARCH_KEY(self):
3269 return self._SEARCH_KEY
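# Minimal subclass sketch (hypothetical, for illustration only):
#
#     class ExampleSearchIE(SearchInfoExtractor):
#         _SEARCH_KEY = 'exsearch'
#         _MAX_RESULTS = 50
#
#         def _get_n_results(self, query, n):
#             entries = ...  # look up at most n results for `query`
#             return self.playlist_result(entries, query)
#
# With this in place, 'exsearch5:some terms' requests five results and
# 'exsearchall:some terms' requests up to _MAX_RESULTS.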