# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import sys
import time
import math

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media (DASH, hls, hds)
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
                                 "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists that appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", "description", "uploader",
    "uploader_id", "uploader_url" attributes with the same semantics as videos
    (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    This will not, however, disable explicit geo restriction bypass based on
    the country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by the
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, provided the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
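
    # A minimal, purely illustrative sketch (not part of this module) of a
    # subclass following the contract described in the docstring above; the
    # site, URL pattern and field choices are hypothetical.
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #         def _real_extract(self, url):
    #             video_id = self._match_id(url)
    #             webpage = self._download_webpage(url, video_id)
    #             return {
    #                 'id': video_id,
    #                 'title': self._og_search_title(webpage),
    #                 'description': self._og_search_description(webpage),
    #                 'url': self._og_search_video_url(webpage),
    #             }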
372
373 def __init__(self, downloader=None):
374 """Constructor. Receives an optional downloader."""
375 self._ready = False
773f291d 376 self._x_forwarded_for_ip = None
d6983cb4
PH
377 self.set_downloader(downloader)
378
379 @classmethod
380 def suitable(cls, url):
381 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
382
383 # This does not use has/getattr intentionally - we want to know whether
384 # we have cached the regexp for *this* class, whereas getattr would also
385 # match the superclass
386 if '_VALID_URL_RE' not in cls.__dict__:
387 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
388 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 389
ed9266db
PH
390 @classmethod
391 def _match_id(cls, url):
392 if '_VALID_URL_RE' not in cls.__dict__:
393 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
394 m = cls._VALID_URL_RE.match(url)
395 assert m
1afd0b0d 396 return compat_str(m.group('id'))
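
    # For illustration: _match_id() assumes _VALID_URL defines a named group
    # called "id" (the class and URL below are hypothetical).
    #
    #     _VALID_URL = r'https?://(?:www\.)?example\.com/video/(?P<id>[0-9a-z]+)'
    #     ExampleIE._match_id('https://example.com/video/abc123')  # -> 'abc123'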
ed9266db 397
d6983cb4
PH
398 @classmethod
399 def working(cls):
400 """Getter method for _WORKING."""
401 return cls._WORKING
402
403 def initialize(self):
404 """Initializes an instance (authentication, etc)."""
5f95927a
S
405 self._initialize_geo_bypass({
406 'countries': self._GEO_COUNTRIES,
407 'ip_blocks': self._GEO_IP_BLOCKS,
408 })
4248dad9
S
409 if not self._ready:
410 self._real_initialize()
411 self._ready = True
412
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from the provided country
        list is selected and a random IP belonging to this country is generated.
        This IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
773f291d 439 if not self._x_forwarded_for_ip:
5f95927a
S
440
441 # Geo bypass mechanism is explicitly disabled by user
442 if not self._downloader.params.get('geo_bypass', True):
443 return
444
445 if not geo_bypass_context:
446 geo_bypass_context = {}
447
448 # Backward compatibility: previously _initialize_geo_bypass
449 # expected a list of countries, some 3rd party code may still use
450 # it this way
451 if isinstance(geo_bypass_context, (list, tuple)):
452 geo_bypass_context = {
453 'countries': geo_bypass_context,
454 }
455
456 # The whole point of geo bypass mechanism is to fake IP
457 # as X-Forwarded-For HTTP header based on some IP block or
458 # country code.
459
460 # Path 1: bypassing based on IP block in CIDR notation
461
462 # Explicit IP block specified by user, use it right away
463 # regardless of whether extractor is geo bypassable or not
464 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
465
466 # Otherwise use random IP block from geo bypass context but only
467 # if extractor is known as geo bypassable
468 if not ip_block:
469 ip_blocks = geo_bypass_context.get('ip_blocks')
470 if self._GEO_BYPASS and ip_blocks:
471 ip_block = random.choice(ip_blocks)
472
473 if ip_block:
474 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
475 if self._downloader.params.get('verbose', False):
476 self._downloader.to_screen(
477 '[debug] Using fake IP %s as X-Forwarded-For.'
478 % self._x_forwarded_for_ip)
479 return
480
481 # Path 2: bypassing based on country code
482
483 # Explicit country code specified by user, use it right away
484 # regardless of whether extractor is geo bypassable or not
485 country = self._downloader.params.get('geo_bypass_country', None)
486
487 # Otherwise use random country code from geo bypass context but
488 # only if extractor is known as geo bypassable
489 if not country:
490 countries = geo_bypass_context.get('countries')
491 if self._GEO_BYPASS and countries:
492 country = random.choice(countries)
493
494 if country:
495 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
4248dad9 496 if self._downloader.params.get('verbose', False):
6a9cb295 497 self._downloader.to_screen(
eea0716c 498 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
5f95927a 499 % (self._x_forwarded_for_ip, country.upper()))
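
    # For illustration: an extractor that only learns the unrestricted
    # countries or IP blocks during extraction may call the mechanism
    # manually; the values below are placeholders.
    #
    #     self._initialize_geo_bypass({
    #         'countries': ['DE', 'FR'],
    #         'ip_blocks': ['203.0.113.0/24'],
    #     })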
d6983cb4
PH
500
501 def extract(self, url):
502 """Extracts URL information and returns it in list of dicts."""
3a5bcd03 503 try:
773f291d
S
504 for _ in range(2):
505 try:
506 self.initialize()
0016b84e
S
507 ie_result = self._real_extract(url)
508 if self._x_forwarded_for_ip:
509 ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
510 return ie_result
773f291d 511 except GeoRestrictedError as e:
4248dad9
S
512 if self.__maybe_fake_ip_and_retry(e.countries):
513 continue
773f291d 514 raise
3a5bcd03
PH
515 except ExtractorError:
516 raise
517 except compat_http_client.IncompleteRead as e:
dfb1b146 518 raise ExtractorError('A network error has occurred.', cause=e, expected=True)
9650885b 519 except (KeyError, StopIteration) as e:
dfb1b146 520 raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4 521
4248dad9
S
522 def __maybe_fake_ip_and_retry(self, countries):
523 if (not self._downloader.params.get('geo_bypass_country', None) and
524 self._GEO_BYPASS and
525 self._downloader.params.get('geo_bypass', True) and
526 not self._x_forwarded_for_ip and
527 countries):
eea0716c
S
528 country_code = random.choice(countries)
529 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
4248dad9
S
530 if self._x_forwarded_for_ip:
531 self.report_warning(
eea0716c
S
532 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
533 % (self._x_forwarded_for_ip, country_code.upper()))
4248dad9
S
534 return True
535 return False
536
d6983cb4
PH
537 def set_downloader(self, downloader):
538 """Sets the downloader for this IE."""
539 self._downloader = downloader
540
541 def _real_initialize(self):
542 """Real initialization process. Redefine in subclasses."""
543 pass
544
545 def _real_extract(self, url):
546 """Real extraction process. Redefine in subclasses."""
547 pass
548
56c73665
JMF
549 @classmethod
550 def ie_key(cls):
551 """A string for getting the InfoExtractor with get_info_extractor"""
dc519b54 552 return compat_str(cls.__name__[:-2])
56c73665 553
d6983cb4
PH
554 @property
555 def IE_NAME(self):
dc519b54 556 return compat_str(type(self).__name__[:-2])
d6983cb4 557
d391b7e2
S
558 @staticmethod
559 def __can_accept_status_code(err, expected_status):
560 assert isinstance(err, compat_urllib_error.HTTPError)
561 if expected_status is None:
562 return False
563 if isinstance(expected_status, compat_integer_types):
564 return err.code == expected_status
565 elif isinstance(expected_status, (list, tuple)):
566 return err.code in expected_status
567 elif callable(expected_status):
568 return expected_status(err.code) is True
569 else:
570 assert False
571
572 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
573 """
574 Return the response handle.
575
576 See _download_webpage docstring for arguments specification.
577 """
d6983cb4
PH
578 if note is None:
579 self.report_download_webpage(video_id)
580 elif note is not False:
7cc3570e 581 if video_id is None:
f1a9d64e 582 self.to_screen('%s' % (note,))
7cc3570e 583 else:
f1a9d64e 584 self.to_screen('%s: %s' % (video_id, note))
2132edaa
S
585
586 # Some sites check X-Forwarded-For HTTP header in order to figure out
587 # the origin of the client behind proxy. This allows bypassing geo
588 # restriction by faking this header's value to IP that belongs to some
589 # geo unrestricted country. We will do so once we encounter any
590 # geo restriction error.
591 if self._x_forwarded_for_ip:
592 if 'X-Forwarded-For' not in headers:
593 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
594
41d06b04
S
595 if isinstance(url_or_request, compat_urllib_request.Request):
596 url_or_request = update_Request(
597 url_or_request, data=data, headers=headers, query=query)
598 else:
cdfee168 599 if query:
600 url_or_request = update_url_query(url_or_request, query)
2c0d9c62 601 if data is not None or headers:
41d06b04 602 url_or_request = sanitized_Request(url_or_request, data, headers)
d6983cb4 603 try:
dca08720 604 return self._downloader.urlopen(url_or_request)
d6983cb4 605 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
d391b7e2
S
606 if isinstance(err, compat_urllib_error.HTTPError):
607 if self.__can_accept_status_code(err, expected_status):
608 return err.fp
609
aa94a6d3
PH
610 if errnote is False:
611 return False
d6983cb4 612 if errnote is None:
f1a9d64e 613 errnote = 'Unable to download webpage'
7f8b2714 614
9b9c5355 615 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
616 if fatal:
617 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
618 else:
619 self._downloader.report_warning(errmsg)
620 return False
d6983cb4 621
d391b7e2
S
622 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
623 """
624 Return a tuple (page content as string, URL handle).
625
626 See _download_webpage docstring for arguments specification.
627 """
b9d3e163
PH
628 # Strip hashes from the URL (#1038)
629 if isinstance(url_or_request, (compat_str, str)):
630 url_or_request = url_or_request.partition('#')[0]
631
d391b7e2 632 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
7cc3570e
PH
633 if urlh is False:
634 assert not fatal
635 return False
c9a77969 636 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
637 return (content, urlh)
638
c9a77969
YCH
639 @staticmethod
640 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
641 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
642 if m:
643 encoding = m.group(1)
644 else:
0d75ae2c 645 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
646 webpage_bytes[:1024])
647 if m:
648 encoding = m.group(1).decode('ascii')
b60016e8
PH
649 elif webpage_bytes.startswith(b'\xff\xfe'):
650 encoding = 'utf-16'
f143d86a
PH
651 else:
652 encoding = 'utf-8'
c9a77969
YCH
653
654 return encoding
655
4457823d
S
656 def __check_blocked(self, content):
657 first_block = content[:512]
658 if ('<title>Access to this site is blocked</title>' in content and
659 'Websense' in first_block):
660 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
661 blocked_iframe = self._html_search_regex(
662 r'<iframe src="([^"]+)"', content,
663 'Websense information URL', default=None)
664 if blocked_iframe:
665 msg += ' Visit %s for more details' % blocked_iframe
666 raise ExtractorError(msg, expected=True)
667 if '<title>The URL you requested has been blocked</title>' in first_block:
668 msg = (
669 'Access to this webpage has been blocked by Indian censorship. '
670 'Use a VPN or proxy server (with --proxy) to route around it.')
671 block_msg = self._html_search_regex(
672 r'</h1><p>(.*?)</p>',
673 content, 'block message', default=None)
674 if block_msg:
675 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
676 raise ExtractorError(msg, expected=True)
677 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
678 'blocklist.rkn.gov.ru' in content):
679 raise ExtractorError(
680 'Access to this webpage has been blocked by decision of the Russian government. '
681 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
682 expected=True)
683
c9a77969
YCH
684 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
685 content_type = urlh.headers.get('Content-Type', '')
686 webpage_bytes = urlh.read()
687 if prefix is not None:
688 webpage_bytes = prefix + webpage_bytes
689 if not encoding:
690 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4 691 if self._downloader.params.get('dump_intermediate_pages', False):
f610dbb0 692 self.to_screen('Dumping request to ' + urlh.geturl())
d6983cb4
PH
693 dump = base64.b64encode(webpage_bytes).decode('ascii')
694 self._downloader.to_screen(dump)
d41e6efc 695 if self._downloader.params.get('write_pages', False):
f610dbb0 696 basen = '%s_%s' % (video_id, urlh.geturl())
c1bce22f 697 if len(basen) > 240:
f1a9d64e 698 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
699 basen = basen[:240 - len(h)] + h
700 raw_filename = basen + '.dump'
d41e6efc 701 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 702 self.to_screen('Saving request to ' + filename)
5f58165d
S
703 # Working around MAX_PATH limitation on Windows (see
704 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
e9c0cdd3 705 if compat_os_name == 'nt':
5f58165d
S
706 absfilepath = os.path.abspath(filename)
707 if len(absfilepath) > 259:
708 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
709 with open(filename, 'wb') as outf:
710 outf.write(webpage_bytes)
711
ec0fafbb
AA
712 try:
713 content = webpage_bytes.decode(encoding, 'replace')
714 except LookupError:
715 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 716
4457823d 717 self.__check_blocked(content)
2410c43d 718
23be51d8 719 return content
d6983cb4 720
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non-2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """
758
995ad69c
TF
759 success = False
760 try_count = 0
761 while success is False:
762 try:
d391b7e2
S
763 res = self._download_webpage_handle(
764 url_or_request, video_id, note, errnote, fatal,
765 encoding=encoding, data=data, headers=headers, query=query,
766 expected_status=expected_status)
995ad69c
TF
767 success = True
768 except compat_http_client.IncompleteRead as e:
769 try_count += 1
770 if try_count >= tries:
771 raise e
772 self._sleep(timeout, video_id)
7cc3570e
PH
773 if res is False:
774 return res
775 else:
776 content, _ = res
777 return content
d6983cb4 778
e0d198c1
S
779 def _download_xml_handle(
780 self, url_or_request, video_id, note='Downloading XML',
781 errnote='Unable to download XML', transform_source=None,
d391b7e2
S
782 fatal=True, encoding=None, data=None, headers={}, query={},
783 expected_status=None):
784 """
785 Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
786
787 See _download_webpage docstring for arguments specification.
788 """
e0d198c1
S
789 res = self._download_webpage_handle(
790 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
791 encoding=encoding, data=data, headers=headers, query=query,
792 expected_status=expected_status)
e0d198c1
S
793 if res is False:
794 return res
795 xml_string, urlh = res
796 return self._parse_xml(
797 xml_string, video_id, transform_source=transform_source,
798 fatal=fatal), urlh
799
d391b7e2
S
800 def _download_xml(
801 self, url_or_request, video_id,
802 note='Downloading XML', errnote='Unable to download XML',
803 transform_source=None, fatal=True, encoding=None,
804 data=None, headers={}, query={}, expected_status=None):
805 """
806 Return the xml as an xml.etree.ElementTree.Element.
807
808 See _download_webpage docstring for arguments specification.
809 """
e0d198c1
S
810 res = self._download_xml_handle(
811 url_or_request, video_id, note=note, errnote=errnote,
812 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
813 data=data, headers=headers, query=query,
814 expected_status=expected_status)
e0d198c1 815 return res if res is False else res[0]
e01c3d2e
S
816
817 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
e2b38da9
PH
818 if transform_source:
819 xml_string = transform_source(xml_string)
e01c3d2e
S
820 try:
821 return compat_etree_fromstring(xml_string.encode('utf-8'))
822 except compat_xml_parse_error as ve:
823 errmsg = '%s: Failed to parse XML ' % video_id
824 if fatal:
825 raise ExtractorError(errmsg, cause=ve)
826 else:
827 self.report_warning(errmsg + str(ve))
267ed0c5 828
0fe7783e
S
829 def _download_json_handle(
830 self, url_or_request, video_id, note='Downloading JSON metadata',
831 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
832 fatal=True, encoding=None, data=None, headers={}, query={},
833 expected_status=None):
834 """
835 Return a tuple (JSON object, URL handle).
836
837 See _download_webpage docstring for arguments specification.
838 """
0fe7783e 839 res = self._download_webpage_handle(
c9a77969 840 url_or_request, video_id, note, errnote, fatal=fatal,
d391b7e2
S
841 encoding=encoding, data=data, headers=headers, query=query,
842 expected_status=expected_status)
0fe7783e
S
843 if res is False:
844 return res
845 json_string, urlh = res
ebb64199 846 return self._parse_json(
0fe7783e
S
847 json_string, video_id, transform_source=transform_source,
848 fatal=fatal), urlh
849
850 def _download_json(
851 self, url_or_request, video_id, note='Downloading JSON metadata',
852 errnote='Unable to download JSON metadata', transform_source=None,
d391b7e2
S
853 fatal=True, encoding=None, data=None, headers={}, query={},
854 expected_status=None):
855 """
856 Return the JSON object as a dict.
857
858 See _download_webpage docstring for arguments specification.
859 """
0fe7783e
S
860 res = self._download_json_handle(
861 url_or_request, video_id, note=note, errnote=errnote,
862 transform_source=transform_source, fatal=fatal, encoding=encoding,
d391b7e2
S
863 data=data, headers=headers, query=query,
864 expected_status=expected_status)
0fe7783e 865 return res if res is False else res[0]
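
    # For illustration (hypothetical endpoint): a typical metadata fetch,
    # tolerating failure via fatal=False so extraction can continue.
    #
    #     meta = self._download_json(
    #         'https://example.com/api/videos/%s' % video_id, video_id,
    #         note='Downloading video metadata', fatal=False,
    #         query={'format': 'json'}, headers={'Referer': url})
    #     title = meta.get('title') if meta else None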
ebb64199
TF
866
867 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
868 if transform_source:
869 json_string = transform_source(json_string)
3d3538e4
PH
870 try:
871 return json.loads(json_string)
872 except ValueError as ve:
e7b6d122
PH
873 errmsg = '%s: Failed to parse JSON ' % video_id
874 if fatal:
875 raise ExtractorError(errmsg, cause=ve)
876 else:
877 self.report_warning(errmsg + str(ve))
3d3538e4 878
f45f96f8 879 def report_warning(self, msg, video_id=None):
f1a9d64e 880 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 881 self._downloader.report_warning(
f1a9d64e 882 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 883
d6983cb4
PH
884 def to_screen(self, msg):
885 """Print msg to screen, prefixing it with '[ie_name]'"""
f1a9d64e 886 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
887
888 def report_extraction(self, id_or_name):
889 """Report information extraction."""
f1a9d64e 890 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
891
892 def report_download_webpage(self, video_id):
893 """Report webpage download."""
f1a9d64e 894 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
895
896 def report_age_confirmation(self):
897 """Report attempt to confirm age."""
f1a9d64e 898 self.to_screen('Confirming age')
d6983cb4 899
fc79158d
JMF
900 def report_login(self):
901 """Report attempt to log in."""
f1a9d64e 902 self.to_screen('Logging in')
fc79158d 903
43e7d3c9
S
904 @staticmethod
905 def raise_login_required(msg='This video is only available for registered users'):
906 raise ExtractorError(
907 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
908 expected=True)
909
c430802e 910 @staticmethod
773f291d
S
911 def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
912 raise GeoRestrictedError(msg, countries=countries)
c430802e 913
5f6a1245 914 # Methods for following #608
c0d0b01f 915 @staticmethod
830d53bf 916 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 917 """Returns a URL that points to a page that should be processed"""
5f6a1245 918 # TODO: ie should be the class used for getting the info
d6983cb4
PH
919 video_info = {'_type': 'url',
920 'url': url,
921 'ie_key': ie}
7012b23c
PH
922 if video_id is not None:
923 video_info['id'] = video_id
830d53bf
S
924 if video_title is not None:
925 video_info['title'] = video_title
d6983cb4 926 return video_info
5f6a1245 927
749ca5ec
S
928 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
929 urls = orderedSet(
46b18f23
JH
930 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
931 for m in matches)
932 return self.playlist_result(
749ca5ec 933 urls, playlist_id=playlist_id, playlist_title=playlist_title)
46b18f23 934
c0d0b01f 935 @staticmethod
acf5cbfe 936 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
d6983cb4
PH
937 """Returns a playlist"""
938 video_info = {'_type': 'playlist',
939 'entries': entries}
940 if playlist_id:
941 video_info['id'] = playlist_id
942 if playlist_title:
943 video_info['title'] = playlist_title
acf5cbfe
S
944 if playlist_description:
945 video_info['description'] = playlist_description
d6983cb4
PH
946 return video_info
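
    # For illustration: deferring a video to another extractor, or wrapping
    # several such references in a playlist (identifiers are placeholders).
    #
    #     return self.url_result(
    #         'https://www.youtube.com/watch?v=%s' % youtube_id, ie='Youtube')
    #
    #     return self.playlist_result(
    #         [self.url_result(u) for u in entry_urls],
    #         playlist_id=playlist_id, playlist_title=title)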
947
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning or raise
        a RegexNotFoundError, depending on fatal and default, specifying the
        field name.
        """
955 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
956 mobj = re.search(pattern, string, flags)
957 else:
958 for p in pattern:
959 mobj = re.search(p, string, flags)
c3415d1b
PH
960 if mobj:
961 break
d6983cb4 962
e9c0cdd3 963 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
f1a9d64e 964 _name = '\033[0;34m%s\033[0m' % name
d6983cb4
PH
965 else:
966 _name = name
967
968 if mobj:
711ede6e
PH
969 if group is None:
970 # return the first matching group
971 return next(g for g in mobj.groups() if g is not None)
972 else:
973 return mobj.group(group)
c342041f 974 elif default is not NO_DEFAULT:
d6983cb4
PH
975 return default
976 elif fatal:
f1a9d64e 977 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 978 else:
08f2a92c 979 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
980 return None
981
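
    # For illustration (pattern and field name are hypothetical):
    #
    #     title = self._search_regex(
    #         r'<h1[^>]+class="title"[^>]*>(?P<title>[^<]+)',
    #         webpage, 'title', group='title', default=None)
    #
    # With the default left at NO_DEFAULT, a failed match raises
    # RegexNotFoundError when fatal is True.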
c342041f 982 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
983 """
984 Like _search_regex, but strips HTML tags and unescapes entities.
985 """
711ede6e 986 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
987 if res:
988 return clean_html(res).strip()
989 else:
990 return res
991
2118fdd1
RA
992 def _get_netrc_login_info(self, netrc_machine=None):
993 username = None
994 password = None
995 netrc_machine = netrc_machine or self._NETRC_MACHINE
996
997 if self._downloader.params.get('usenetrc', False):
998 try:
999 info = netrc.netrc().authenticators(netrc_machine)
1000 if info is not None:
1001 username = info[0]
1002 password = info[2]
1003 else:
dcce092e
S
1004 raise netrc.NetrcParseError(
1005 'No authenticators for %s' % netrc_machine)
2118fdd1 1006 except (IOError, netrc.NetrcParseError) as err:
dcce092e
S
1007 self._downloader.report_warning(
1008 'parsing .netrc: %s' % error_to_compat_str(err))
2118fdd1 1009
dcce092e 1010 return username, password
2118fdd1 1011
    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
1021 if self._downloader is None:
1022 return (None, None)
1023
fc79158d
JMF
1024 downloader_params = self._downloader.params
1025
1026 # Attempt to use provided username and password or .netrc data
1b6712ab
RA
1027 if downloader_params.get(username_option) is not None:
1028 username = downloader_params[username_option]
1029 password = downloader_params[password_option]
2118fdd1 1030 else:
1b6712ab 1031 username, password = self._get_netrc_login_info(netrc_machine)
5f6a1245 1032
2133565c 1033 return username, password
fc79158d 1034
e64b7569 1035 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 1036 """
1037 Get the two-factor authentication info
1038 TODO - asking the user will be required for sms/phone verify
1039 currently just uses the command line option
1040 If there's no info available, return None
1041 """
1042 if self._downloader is None:
83317f69 1043 return None
1044 downloader_params = self._downloader.params
1045
d800609c 1046 if downloader_params.get('twofactor') is not None:
83317f69 1047 return downloader_params['twofactor']
1048
e64b7569 1049 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 1050
46720279
JMF
1051 # Helper functions for extracting OpenGraph info
1052 @staticmethod
ab2d5247 1053 def _og_regexes(prop):
448ef1f3 1054 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
7a6d76a6
S
1055 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
1056 % {'prop': re.escape(prop)})
78fb87b2 1057 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 1058 return [
78fb87b2
JMF
1059 template % (property_re, content_re),
1060 template % (content_re, property_re),
ab2d5247 1061 ]
46720279 1062
864f24bd
S
1063 @staticmethod
1064 def _meta_regex(prop):
1065 return r'''(?isx)<meta
8b9848ac 1066 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
1067 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1068
3c4e6d83 1069 def _og_search_property(self, prop, html, name=None, **kargs):
b070564e
S
1070 if not isinstance(prop, (list, tuple)):
1071 prop = [prop]
46720279 1072 if name is None:
b070564e
S
1073 name = 'OpenGraph %s' % prop[0]
1074 og_regexes = []
1075 for p in prop:
1076 og_regexes.extend(self._og_regexes(p))
1077 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
1078 if escaped is None:
1079 return None
1080 return unescapeHTML(escaped)
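
    # For illustration: the wrappers below build on _og_search_property,
    # which also accepts a list of property names, e.g.
    #
    #     title = self._og_search_property(('title', 'headline'), webpage)
    #     thumbnail = self._og_search_thumbnail(webpage, default=None)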
46720279
JMF
1081
1082 def _og_search_thumbnail(self, html, **kargs):
10952eb2 1083 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
1084
1085 def _og_search_description(self, html, **kargs):
1086 return self._og_search_property('description', html, fatal=False, **kargs)
1087
1088 def _og_search_title(self, html, **kargs):
1089 return self._og_search_property('title', html, **kargs)
1090
8ffa13e0 1091 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
1092 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1093 if secure:
1094 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 1095 return self._html_search_regex(regexes, html, name, **kargs)
46720279 1096
78338f71
JMF
1097 def _og_search_url(self, html, **kargs):
1098 return self._og_search_property('url', html, **kargs)
1099
40c696e5 1100 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
88d9f6c0
S
1101 if not isinstance(name, (list, tuple)):
1102 name = [name]
59040888 1103 if display_name is None:
88d9f6c0 1104 display_name = name[0]
59040888 1105 return self._html_search_regex(
88d9f6c0 1106 [self._meta_regex(n) for n in name],
711ede6e 1107 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
1108
1109 def _dc_search_uploader(self, html):
1110 return self._html_search_meta('dc.creator', html, 'uploader')
1111
8dbe9899
PH
1112 def _rta_search(self, html):
1113 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1114 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1115 r' content="RTA-5042-1996-1400-1577-RTA"',
1116 html):
1117 return 18
1118 return 0
1119
59040888
PH
1120 def _media_rating_search(self, html):
1121 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1122 rating = self._html_search_meta('rating', html)
1123
1124 if not rating:
1125 return None
1126
1127 RATING_TABLE = {
1128 'safe for kids': 0,
1129 'general': 8,
1130 '14 years': 14,
1131 'mature': 17,
1132 'restricted': 19,
1133 }
d800609c 1134 return RATING_TABLE.get(rating.lower())
59040888 1135
69319969 1136 def _family_friendly_search(self, html):
6ca7732d 1137 # See http://schema.org/VideoObject
ac8491fc
S
1138 family_friendly = self._html_search_meta(
1139 'isFamilyFriendly', html, default=None)
69319969
NJ
1140
1141 if not family_friendly:
1142 return None
1143
1144 RATING_TABLE = {
1145 '1': 0,
1146 'true': 0,
1147 '0': 18,
1148 'false': 18,
1149 }
d800609c 1150 return RATING_TABLE.get(family_friendly.lower())
69319969 1151
0c708f11
JMF
1152 def _twitter_search_player(self, html):
1153 return self._html_search_meta('twitter:player', html,
9e1a5b84 1154 'twitter card player')
0c708f11 1155
95b31e26 1156 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
4ca2a3cf 1157 json_ld = self._search_regex(
0685d972 1158 JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
321b5e08 1159 default = kwargs.get('default', NO_DEFAULT)
4ca2a3cf 1160 if not json_ld:
321b5e08
S
1161 return default if default is not NO_DEFAULT else {}
1162 # JSON-LD may be malformed and thus `fatal` should be respected.
1163 # At the same time `default` may be passed that assumes `fatal=False`
1164 # for _search_regex. Let's simulate the same behavior here as well.
1165 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1166 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
4ca2a3cf 1167
95b31e26 1168 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
4ca2a3cf
S
1169 if isinstance(json_ld, compat_str):
1170 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1171 if not json_ld:
1172 return {}
1173 info = {}
46933a15
S
1174 if not isinstance(json_ld, (list, tuple, dict)):
1175 return info
1176 if isinstance(json_ld, dict):
1177 json_ld = [json_ld]
bae14048 1178
e7e4a6e0
S
1179 INTERACTION_TYPE_MAP = {
1180 'CommentAction': 'comment',
1181 'AgreeAction': 'like',
1182 'DisagreeAction': 'dislike',
1183 'LikeAction': 'like',
1184 'DislikeAction': 'dislike',
1185 'ListenAction': 'view',
1186 'WatchAction': 'view',
1187 'ViewAction': 'view',
1188 }
1189
1190 def extract_interaction_statistic(e):
1191 interaction_statistic = e.get('interactionStatistic')
1192 if not isinstance(interaction_statistic, list):
1193 return
1194 for is_e in interaction_statistic:
1195 if not isinstance(is_e, dict):
1196 continue
1197 if is_e.get('@type') != 'InteractionCounter':
1198 continue
1199 interaction_type = is_e.get('interactionType')
1200 if not isinstance(interaction_type, compat_str):
1201 continue
1202 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1203 if interaction_count is None:
1204 continue
1205 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1206 if not count_kind:
1207 continue
1208 count_key = '%s_count' % count_kind
1209 if info.get(count_key) is not None:
1210 continue
1211 info[count_key] = interaction_count
1212
bae14048
S
1213 def extract_video_object(e):
1214 assert e['@type'] == 'VideoObject'
1215 info.update({
1216 'url': e.get('contentUrl'),
1217 'title': unescapeHTML(e.get('name')),
1218 'description': unescapeHTML(e.get('description')),
1219 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1220 'duration': parse_duration(e.get('duration')),
1221 'timestamp': unified_timestamp(e.get('uploadDate')),
1222 'filesize': float_or_none(e.get('contentSize')),
1223 'tbr': int_or_none(e.get('bitrate')),
1224 'width': int_or_none(e.get('width')),
1225 'height': int_or_none(e.get('height')),
33a81c2c 1226 'view_count': int_or_none(e.get('interactionCount')),
bae14048 1227 })
e7e4a6e0 1228 extract_interaction_statistic(e)
bae14048 1229
46933a15 1230 for e in json_ld:
66b68672 1231 if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
46933a15
S
1232 item_type = e.get('@type')
1233 if expected_type is not None and expected_type != item_type:
1234 return info
c69701c6 1235 if item_type in ('TVEpisode', 'Episode'):
46933a15
S
1236 info.update({
1237 'episode': unescapeHTML(e.get('name')),
1238 'episode_number': int_or_none(e.get('episodeNumber')),
1239 'description': unescapeHTML(e.get('description')),
1240 })
1241 part_of_season = e.get('partOfSeason')
c69701c6 1242 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
46933a15 1243 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
d16b3c66 1244 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
c69701c6 1245 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
46933a15 1246 info['series'] = unescapeHTML(part_of_series.get('name'))
3931b845 1247 elif item_type in ('Article', 'NewsArticle'):
46933a15
S
1248 info.update({
1249 'timestamp': parse_iso8601(e.get('datePublished')),
1250 'title': unescapeHTML(e.get('headline')),
1251 'description': unescapeHTML(e.get('articleBody')),
1252 })
1253 elif item_type == 'VideoObject':
bae14048 1254 extract_video_object(e)
c69701c6
S
1255 continue
1256 video = e.get('video')
1257 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1258 extract_video_object(video)
46933a15 1259 break
4ca2a3cf
S
1260 return dict((k, v) for k, v in info.items() if v is not None)
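
    # For illustration: a call site might merge JSON-LD data into the final
    # result like this (field names are hypothetical).
    #
    #     info = self._search_json_ld(webpage, video_id, default={})
    #     info.update({'id': video_id, 'formats': formats})
    #     return info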
1261
27713812 1262 @staticmethod
f8da79f8 1263 def _hidden_inputs(html):
586f1cc5 1264 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 1265 hidden_inputs = {}
c8498368
S
1266 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1267 attrs = extract_attributes(input)
1268 if not input:
201ea3ee 1269 continue
c8498368 1270 if attrs.get('type') not in ('hidden', 'submit'):
201ea3ee 1271 continue
c8498368
S
1272 name = attrs.get('name') or attrs.get('id')
1273 value = attrs.get('value')
1274 if name and value is not None:
1275 hidden_inputs[name] = value
201ea3ee 1276 return hidden_inputs
27713812 1277
cf61d96d
S
1278 def _form_hidden_inputs(self, form_id, html):
1279 form = self._search_regex(
73eb13df 1280 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
1281 html, '%s form' % form_id, group='form')
1282 return self._hidden_inputs(form)
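
    # For illustration (form id and field names are hypothetical): a login
    # helper would typically seed its POST data from the hidden inputs, e.g.
    #
    #     login_form = self._form_hidden_inputs('login-form', login_page)
    #     login_form.update({'username': username, 'password': password})
    #
    # urlencode_postdata (from ..utils) would then serialize login_form for
    # the data= argument of _download_webpage.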
1283
3ded7bac 1284 def _sort_formats(self, formats, field_preference=None):
7e8caf30 1285 if not formats:
f1a9d64e 1286 raise ExtractorError('No video formats found')
7e8caf30 1287
b0d21ded
S
1288 for f in formats:
1289 # Automatically determine tbr when missing based on abr and vbr (improves
1290 # formats sorting in some cases)
350cf045 1291 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
b0d21ded
S
1292 f['tbr'] = f['abr'] + f['vbr']
1293
4bcc7bd1 1294 def _formats_key(f):
e6812ac9
PH
1295 # TODO remove the following workaround
1296 from ..utils import determine_ext
1297 if not f.get('ext') and 'url' in f:
1298 f['ext'] = determine_ext(f['url'])
1299
3ded7bac 1300 if isinstance(field_preference, (list, tuple)):
bf8dd790
S
1301 return tuple(
1302 f.get(field)
1303 if f.get(field) is not None
1304 else ('' if field == 'format_id' else -1)
1305 for field in field_preference)
3ded7bac 1306
4bcc7bd1
PH
1307 preference = f.get('preference')
1308 if preference is None:
d497a201 1309 preference = 0
4bcc7bd1
PH
1310 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1311 preference -= 0.5
1312
8b408545
RA
1313 protocol = f.get('protocol') or determine_protocol(f)
1314 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
d497a201 1315
4bcc7bd1 1316 if f.get('vcodec') == 'none': # audio only
dd867805 1317 preference -= 50
4bcc7bd1 1318 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 1319 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 1320 else:
f1a9d64e 1321 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
1322 ext_preference = 0
1323 try:
1324 audio_ext_preference = ORDER.index(f['ext'])
1325 except ValueError:
1326 audio_ext_preference = -1
1327 else:
dd867805 1328 if f.get('acodec') == 'none': # video only
1329 preference -= 40
4bcc7bd1 1330 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 1331 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 1332 else:
f1a9d64e 1333 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
1334 try:
1335 ext_preference = ORDER.index(f['ext'])
1336 except ValueError:
1337 ext_preference = -1
1338 audio_ext_preference = 0
1339
1340 return (
1341 preference,
aff2f4f4 1342 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 1343 f.get('quality') if f.get('quality') is not None else -1,
9933b574 1344 f.get('tbr') if f.get('tbr') is not None else -1,
03cd72b0 1345 f.get('filesize') if f.get('filesize') is not None else -1,
4bcc7bd1 1346 f.get('vbr') if f.get('vbr') is not None else -1,
1a6373ef
PH
1347 f.get('height') if f.get('height') is not None else -1,
1348 f.get('width') if f.get('width') is not None else -1,
d497a201 1349 proto_preference,
1e1896f2 1350 ext_preference,
4bcc7bd1
PH
1351 f.get('abr') if f.get('abr') is not None else -1,
1352 audio_ext_preference,
2c8e03d9 1353 f.get('fps') if f.get('fps') is not None else -1,
9732d77e 1354 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 1355 f.get('source_preference') if f.get('source_preference') is not None else -1,
74f72824 1356 f.get('format_id') if f.get('format_id') is not None else '',
4bcc7bd1
PH
1357 )
1358 formats.sort(key=_formats_key)
59040888 1359
96a53167
S
1360 def _check_formats(self, formats, video_id):
1361 if formats:
1362 formats[:] = filter(
1363 lambda f: self._is_valid_url(
1364 f['url'], video_id,
1365 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1366 formats)
1367
f5bdb444
S
1368 @staticmethod
1369 def _remove_duplicate_formats(formats):
1370 format_urls = set()
1371 unique_formats = []
1372 for f in formats:
1373 if f['url'] not in format_urls:
1374 format_urls.add(f['url'])
1375 unique_formats.append(f)
1376 formats[:] = unique_formats
1377
45024183 1378 def _is_valid_url(self, url, video_id, item='video', headers={}):
2f0f6578
S
1379 url = self._proto_relative_url(url, scheme='http:')
1380 # For now assume non HTTP(S) URLs always valid
1381 if not (url.startswith('http://') or url.startswith('https://')):
1382 return True
96a53167 1383 try:
45024183 1384 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
96a53167
S
1385 return True
1386 except ExtractorError as e:
943a1e24 1387 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
1388 self.to_screen(
1389 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
1390 return False
1391 raise
1392
20991253 1393 def http_scheme(self):
1ede5b24 1394 """ Either "http:" or "https:", depending on the user's preferences """
20991253
PH
1395 return (
1396 'http:'
1397 if self._downloader.params.get('prefer_insecure', False)
1398 else 'https:')
1399
57c7411f
PH
1400 def _proto_relative_url(self, url, scheme=None):
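# Illustrative example (URL made up): '//cdn.example.com/v.mp4' becomes
# 'https://cdn.example.com/v.mp4', or 'http://...' when --prefer-insecure
# is set; absolute URLs are returned unchanged.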
1401 if url is None:
1402 return url
1403 if url.startswith('//'):
1404 if scheme is None:
1405 scheme = self.http_scheme()
1406 return scheme + url
1407 else:
1408 return url
1409
4094b6e3
PH
1410 def _sleep(self, timeout, video_id, msg_template=None):
1411 if msg_template is None:
f1a9d64e 1412 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
1413 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1414 self.to_screen(msg)
1415 time.sleep(timeout)
1416
a38436e8 1417 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
4de61310 1418 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1419 fatal=True, m3u8_id=None):
f036a632
JMF
1420 manifest = self._download_xml(
1421 manifest_url, video_id, 'Downloading f4m manifest',
97f4aecf
S
1422 'Unable to download f4m manifest',
1423 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1424 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
4de61310
S
1425 transform_source=transform_source,
1426 fatal=fatal)
1427
1428 if manifest is False:
8d29e47f 1429 return []
31bb8d3f 1430
0fdbb332
S
1431 return self._parse_f4m_formats(
1432 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
448bb5f3 1433 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
0fdbb332
S
1434
1435 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1436 transform_source=lambda s: fix_xml_ampersands(s).strip(),
448bb5f3 1437 fatal=True, m3u8_id=None):
fb72ec58 1438 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1439 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1440 if akamai_pv is not None and ';' in akamai_pv.text:
1441 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1442 if playerVerificationChallenge.strip() != '':
1443 return []
1444
31bb8d3f 1445 formats = []
7a47d07c 1446 manifest_version = '1.0'
b2527359 1447 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
34e48bed 1448 if not media_nodes:
7a47d07c 1449 manifest_version = '2.0'
34e48bed 1450 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
b22ca762
S
 1451 # Remove unsupported DRM-protected media renditions from the final
 1452 # formats (see https://github.com/rg3/youtube-dl/issues/8573).
1453 media_nodes = remove_encrypted_media(media_nodes)
1454 if not media_nodes:
1455 return formats
48107c19
S
1456
1457 manifest_base_url = get_base_url(manifest)
0a5685b2 1458
a6571f10 1459 bootstrap_info = xpath_element(
0a5685b2
YCH
1460 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1461 'bootstrap info', default=None)
1462
edd6074c
RA
1463 vcodec = None
1464 mime_type = xpath_text(
1465 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1466 'base URL', default=None)
1467 if mime_type and mime_type.startswith('audio/'):
1468 vcodec = 'none'
1469
b2527359 1470 for i, media_el in enumerate(media_nodes):
77b8b4e6
S
1471 tbr = int_or_none(media_el.attrib.get('bitrate'))
1472 width = int_or_none(media_el.attrib.get('width'))
1473 height = int_or_none(media_el.attrib.get('height'))
1474 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
448bb5f3
YCH
1475 # If <bootstrapInfo> is present, the specified f4m is a
1476 # stream-level manifest, and only set-level manifests may refer to
1477 # external resources. See section 11.4 and section 4 of F4M spec
1478 if bootstrap_info is None:
1479 media_url = None
1480 # @href is introduced in 2.0, see section 11.6 of F4M spec
1481 if manifest_version == '2.0':
1482 media_url = media_el.attrib.get('href')
1483 if media_url is None:
1484 media_url = media_el.attrib.get('url')
31c746e5
S
1485 if not media_url:
1486 continue
cc357c4d
S
1487 manifest_url = (
1488 media_url if media_url.startswith('http://') or media_url.startswith('https://')
48107c19 1489 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
70f0f5a8
S
 1490 # If media_url is itself an f4m manifest, do the recursive extraction,
 1491 # since bitrates in the parent manifest (this one) and the media_url manifest
 1492 # may differ, leading to an inability to resolve the format by the requested
 1493 # bitrate in the f4m downloader
240b6045
YCH
1494 ext = determine_ext(manifest_url)
1495 if ext == 'f4m':
77b8b4e6 1496 f4m_formats = self._extract_f4m_formats(
0fdbb332 1497 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
77b8b4e6
S
1498 transform_source=transform_source, fatal=fatal)
1499 # Sometimes stream-level manifest contains single media entry that
1500 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1501 # At the same time parent's media entry in set-level manifest may
1502 # contain it. We will copy it from parent in such cases.
1503 if len(f4m_formats) == 1:
1504 f = f4m_formats[0]
1505 f.update({
1506 'tbr': f.get('tbr') or tbr,
1507 'width': f.get('width') or width,
1508 'height': f.get('height') or height,
1509 'format_id': f.get('format_id') if not tbr else format_id,
edd6074c 1510 'vcodec': vcodec,
77b8b4e6
S
1511 })
1512 formats.extend(f4m_formats)
70f0f5a8 1513 continue
240b6045
YCH
1514 elif ext == 'm3u8':
1515 formats.extend(self._extract_m3u8_formats(
1516 manifest_url, video_id, 'mp4', preference=preference,
fac2af3c 1517 m3u8_id=m3u8_id, fatal=fatal))
240b6045 1518 continue
31bb8d3f 1519 formats.append({
77b8b4e6 1520 'format_id': format_id,
31bb8d3f 1521 'url': manifest_url,
30d0b549 1522 'manifest_url': manifest_url,
a6571f10 1523 'ext': 'flv' if bootstrap_info is not None else None,
187ee66c 1524 'protocol': 'f4m',
b2527359 1525 'tbr': tbr,
77b8b4e6
S
1526 'width': width,
1527 'height': height,
edd6074c 1528 'vcodec': vcodec,
60ca389c 1529 'preference': preference,
31bb8d3f 1530 })
31bb8d3f
JMF
1531 return formats
1532
16da9bbc
YCH
1533 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1534 return {
f207019c 1535 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
704df56d
PH
1536 'url': m3u8_url,
1537 'ext': ext,
1538 'protocol': 'm3u8',
37768f92 1539 'preference': preference - 100 if preference else -100,
704df56d
PH
1540 'resolution': 'multiple',
1541 'format_note': 'Quality selection URL',
16da9bbc
YCH
1542 }
1543
1544 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1545 entry_protocol='m3u8', preference=None,
1546 m3u8_id=None, note=None, errnote=None,
1547 fatal=True, live=False):
dbd82a1d 1548 res = self._download_webpage_handle(
81515ad9 1549 m3u8_url, video_id,
621ed9f5 1550 note=note or 'Downloading m3u8 information',
13af92fd
YCH
1551 errnote=errnote or 'Failed to download m3u8 information',
1552 fatal=fatal)
cb252080 1553
dbd82a1d 1554 if res is False:
8d29e47f 1555 return []
cb252080 1556
dbd82a1d 1557 m3u8_doc, urlh = res
37113045 1558 m3u8_url = urlh.geturl()
9cdffeeb 1559
cb252080
S
1560 return self._parse_m3u8_formats(
1561 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1562 preference=preference, m3u8_id=m3u8_id, live=live)
1563
1564 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1565 entry_protocol='m3u8', preference=None,
1566 m3u8_id=None, live=False):
08a00eef
RA
1567 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1568 return []
1569
ea229584
RA
1570 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1571 return []
1572
ff99fe52 1573 formats = []
0def7587
RA
1574
1575 format_url = lambda u: (
1576 u
1577 if re.match(r'^https?://', u)
1578 else compat_urlparse.urljoin(m3u8_url, u))
1579
cb252080
S
1580 # References:
1581 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1582 # 2. https://github.com/rg3/youtube-dl/issues/12211
1583
1584 # We should try extracting formats only from master playlists [1, 4.3.4],
1585 # i.e. playlists that describe available qualities. On the other hand
1586 # media playlists [1, 4.3.3] should be returned as is since they contain
 1587 # just the media without quality renditions.
9cdffeeb 1588 # Fortunately, a master playlist can easily be distinguished from a media
cb252080
S
 1589 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
 1590 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1591 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1592 # media playlist and MUST NOT appear in master playlist thus we can
1593 # clearly detect media playlist with this criterion.
1594
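# A minimal sketch of the two cases (illustrative, URIs made up):
#   media playlist:  #EXT-X-TARGETDURATION:10 and #EXTINF:9.009, followed by
#                    segment URIs such as seg0.ts
#   master playlist: #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#                    followed by variant playlist URIs such as low/index.m3u8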
9cdffeeb 1595 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
7f32e5dc 1596 return [{
1597 'url': m3u8_url,
1598 'format_id': m3u8_id,
1599 'ext': ext,
1600 'protocol': entry_protocol,
1601 'preference': preference,
1602 }]
cb252080
S
1603
1604 groups = {}
1605 last_stream_inf = {}
1606
1607 def extract_media(x_media_line):
1608 media = parse_m3u8_attributes(x_media_line)
1609 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
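# A typical rendition line looks like (illustrative, values made up):
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud1",NAME="English",URI="audio/en.m3u8"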
1610 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1611 if not (media_type and group_id and name):
1612 return
1613 groups.setdefault(group_id, []).append(media)
1614 if media_type not in ('VIDEO', 'AUDIO'):
1615 return
1616 media_url = media.get('URI')
1617 if media_url:
1618 format_id = []
9211e331 1619 for v in (m3u8_id, group_id, name):
cb252080
S
1620 if v:
1621 format_id.append(v)
1622 f = {
1623 'format_id': '-'.join(format_id),
1624 'url': format_url(media_url),
c89b49f7 1625 'manifest_url': m3u8_url,
cb252080
S
1626 'language': media.get('LANGUAGE'),
1627 'ext': ext,
1628 'protocol': entry_protocol,
1629 'preference': preference,
1630 }
1631 if media_type == 'AUDIO':
1632 f['vcodec'] = 'none'
1633 formats.append(f)
1634
1635 def build_stream_name():
 1636 # Despite the specification not mentioning the NAME attribute for the
3019cb0c
S
 1637 # EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
 1638 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
ddd258f9 1639 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
cb252080
S
1640 stream_name = last_stream_inf.get('NAME')
1641 if stream_name:
1642 return stream_name
1643 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1644 # from corresponding rendition group
1645 stream_group_id = last_stream_inf.get('VIDEO')
1646 if not stream_group_id:
1647 return
1648 stream_group = groups.get(stream_group_id)
1649 if not stream_group:
1650 return stream_group_id
1651 rendition = stream_group[0]
1652 return rendition.get('NAME') or stream_group_id
1653
704df56d
PH
1654 for line in m3u8_doc.splitlines():
1655 if line.startswith('#EXT-X-STREAM-INF:'):
cb252080 1656 last_stream_inf = parse_m3u8_attributes(line)
4cd95bcb 1657 elif line.startswith('#EXT-X-MEDIA:'):
cb252080 1658 extract_media(line)
704df56d
PH
1659 elif line.startswith('#') or not line.strip():
1660 continue
1661 else:
9c99bef7
S
1662 tbr = float_or_none(
1663 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1664 last_stream_inf.get('BANDWIDTH'), scale=1000)
8dc9d361
S
1665 format_id = []
1666 if m3u8_id:
1667 format_id.append(m3u8_id)
cb252080 1668 stream_name = build_stream_name()
b24d6336
KH
 1669 # The bandwidth of live streams may differ over time, thus making
 1670 # format_id unpredictable. So it's better to keep the provided
 1671 # format_id intact.
e9c6cdf4 1672 if not live:
ed56f260 1673 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
30d0b549 1674 manifest_url = format_url(line.strip())
704df56d 1675 f = {
8dc9d361 1676 'format_id': '-'.join(format_id),
30d0b549 1677 'url': manifest_url,
ff99fe52 1678 'manifest_url': m3u8_url,
704df56d
PH
1679 'tbr': tbr,
1680 'ext': ext,
cb252080 1681 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
f0b5d6af
PH
1682 'protocol': entry_protocol,
1683 'preference': preference,
704df56d 1684 }
cb252080 1685 resolution = last_stream_inf.get('RESOLUTION')
704df56d 1686 if resolution:
c4c9b844
S
1687 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1688 if mobj:
1689 f['width'] = int(mobj.group('width'))
1690 f['height'] = int(mobj.group('height'))
00f4764c
RA
1691 # Unified Streaming Platform
1692 mobj = re.search(
1693 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1694 if mobj:
1695 abr, vbr = mobj.groups()
1696 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
fbb6edd2 1697 f.update({
00f4764c
RA
1698 'vbr': vbr,
1699 'abr': abr,
fbb6edd2 1700 })
cb252080
S
1701 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1702 f.update(codecs)
1703 audio_group_id = last_stream_inf.get('AUDIO')
1704 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1705 # references a rendition group MUST have a CODECS attribute.
1706 # However, this is not always respected, for example, [2]
1707 # contains EXT-X-STREAM-INF tag which references AUDIO
1708 # rendition group but does not have CODECS and despite
 1709 # referencing an audio group, it represents
1710 # a complete (with audio and video) format. So, for such cases
1711 # we will ignore references to rendition groups and treat them
1712 # as complete formats.
1713 if audio_group_id and codecs and f.get('vcodec') != 'none':
1714 audio_group = groups.get(audio_group_id)
1715 if audio_group and audio_group[0].get('URI'):
1716 # TODO: update acodec for audio only formats with
1717 # the same GROUP-ID
1718 f['acodec'] = 'none'
704df56d 1719 formats.append(f)
cb252080 1720 last_stream_inf = {}
704df56d
PH
1721 return formats
1722
a107193e
S
1723 @staticmethod
1724 def _xpath_ns(path, namespace=None):
1725 if not namespace:
1726 return path
1727 out = []
1728 for c in path.split('/'):
1729 if not c or c == '.':
1730 out.append(c)
1731 else:
1732 out.append('{%s}%s' % (namespace, c))
1733 return '/'.join(out)
1734
09f572fb 1735 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1736 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
a107193e 1737
995029a1
PH
1738 if smil is False:
1739 assert not fatal
1740 return []
e89a2aab 1741
17712eeb 1742 namespace = self._parse_smil_namespace(smil)
a107193e
S
1743
1744 return self._parse_smil_formats(
1745 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1746
1747 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1748 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1749 if smil is False:
1750 return {}
1751 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1752
09f572fb 1753 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
a107193e
S
1754 return self._download_xml(
1755 smil_url, video_id, 'Downloading SMIL file',
09f572fb 1756 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
a107193e
S
1757
1758 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
17712eeb 1759 namespace = self._parse_smil_namespace(smil)
a107193e
S
1760
1761 formats = self._parse_smil_formats(
1762 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1763 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1764
1765 video_id = os.path.splitext(url_basename(smil_url))[0]
1766 title = None
1767 description = None
647eab45 1768 upload_date = None
a107193e
S
1769 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1770 name = meta.attrib.get('name')
1771 content = meta.attrib.get('content')
1772 if not name or not content:
1773 continue
1774 if not title and name == 'title':
1775 title = content
1776 elif not description and name in ('description', 'abstract'):
1777 description = content
647eab45
S
1778 elif not upload_date and name == 'date':
1779 upload_date = unified_strdate(content)
a107193e 1780
1e5bcdec
S
1781 thumbnails = [{
1782 'id': image.get('type'),
1783 'url': image.get('src'),
1784 'width': int_or_none(image.get('width')),
1785 'height': int_or_none(image.get('height')),
1786 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1787
a107193e
S
1788 return {
1789 'id': video_id,
1790 'title': title or video_id,
1791 'description': description,
647eab45 1792 'upload_date': upload_date,
1e5bcdec 1793 'thumbnails': thumbnails,
a107193e
S
1794 'formats': formats,
1795 'subtitles': subtitles,
1796 }
1797
17712eeb
S
1798 def _parse_smil_namespace(self, smil):
1799 return self._search_regex(
1800 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1801
f877c6ae 1802 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
a107193e
S
1803 base = smil_url
1804 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1805 b = meta.get('base') or meta.get('httpBase')
1806 if b:
1807 base = b
1808 break
e89a2aab
S
1809
1810 formats = []
1811 rtmp_count = 0
a107193e 1812 http_count = 0
7f32e5dc 1813 m3u8_count = 0
a107193e 1814
81e1c4e2 1815 srcs = []
ad96b4c8
YCH
1816 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1817 for medium in media:
1818 src = medium.get('src')
81e1c4e2 1819 if not src or src in srcs:
a107193e 1820 continue
81e1c4e2 1821 srcs.append(src)
a107193e 1822
ad96b4c8
YCH
1823 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1824 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1825 width = int_or_none(medium.get('width'))
1826 height = int_or_none(medium.get('height'))
1827 proto = medium.get('proto')
1828 ext = medium.get('ext')
a107193e 1829 src_ext = determine_ext(src)
ad96b4c8 1830 streamer = medium.get('streamer') or base
a107193e
S
1831
1832 if proto == 'rtmp' or streamer.startswith('rtmp'):
1833 rtmp_count += 1
1834 formats.append({
1835 'url': streamer,
1836 'play_path': src,
1837 'ext': 'flv',
1838 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1839 'tbr': bitrate,
1840 'filesize': filesize,
1841 'width': width,
1842 'height': height,
1843 })
f877c6ae
YCH
1844 if transform_rtmp_url:
1845 streamer, src = transform_rtmp_url(streamer, src)
1846 formats[-1].update({
1847 'url': streamer,
1848 'play_path': src,
1849 })
a107193e
S
1850 continue
1851
1852 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
c349456e 1853 src_url = src_url.strip()
a107193e
S
1854
1855 if proto == 'm3u8' or src_ext == 'm3u8':
7f32e5dc 1856 m3u8_formats = self._extract_m3u8_formats(
1857 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1858 if len(m3u8_formats) == 1:
1859 m3u8_count += 1
1860 m3u8_formats[0].update({
1861 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1862 'tbr': bitrate,
1863 'width': width,
1864 'height': height,
1865 })
1866 formats.extend(m3u8_formats)
bd21ead2 1867 elif src_ext == 'f4m':
a107193e
S
1868 f4m_url = src_url
1869 if not f4m_params:
1870 f4m_params = {
1871 'hdcore': '3.2.0',
1872 'plugin': 'flowplayer-3.2.0.1',
1873 }
1874 f4m_url += '&' if '?' in f4m_url else '?'
15707c7e 1875 f4m_url += compat_urllib_parse_urlencode(f4m_params)
7e5edcfd 1876 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
bd21ead2
RA
1877 elif src_ext == 'mpd':
1878 formats.extend(self._extract_mpd_formats(
1879 src_url, video_id, mpd_id='dash', fatal=False))
1880 elif re.search(r'\.ism/[Mm]anifest', src_url):
1881 formats.extend(self._extract_ism_formats(
1882 src_url, video_id, ism_id='mss', fatal=False))
1883 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
a107193e
S
1884 http_count += 1
1885 formats.append({
1886 'url': src_url,
1887 'ext': ext or src_ext or 'flv',
1888 'format_id': 'http-%d' % (bitrate or http_count),
1889 'tbr': bitrate,
1890 'filesize': filesize,
1891 'width': width,
1892 'height': height,
1893 })
63757032 1894
e89a2aab
S
1895 return formats
1896
ce00af87 1897 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
d413095f 1898 urls = []
a107193e
S
1899 subtitles = {}
1900 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1901 src = textstream.get('src')
d413095f 1902 if not src or src in urls:
a107193e 1903 continue
d413095f 1904 urls.append(src)
df634be2 1905 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
03bc7237 1906 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
a107193e
S
1907 subtitles.setdefault(lang, []).append({
1908 'url': src,
1909 'ext': ext,
1910 })
1911 return subtitles
63757032 1912
47a5cb77 1913 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
942acef5 1914 xspf = self._download_xml(
47a5cb77 1915 xspf_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
1916 'Unable to download xspf manifest', fatal=fatal)
1917 if xspf is False:
1918 return []
47a5cb77
S
1919 return self._parse_xspf(
1920 xspf, playlist_id, xspf_url=xspf_url,
1921 xspf_base_url=base_url(xspf_url))
8d6765cf 1922
47a5cb77 1923 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
8d6765cf
S
1924 NS_MAP = {
1925 'xspf': 'http://xspf.org/ns/0/',
1926 's1': 'http://static.streamone.nl/player/ns/0',
1927 }
1928
1929 entries = []
47a5cb77 1930 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
8d6765cf 1931 title = xpath_text(
98044462 1932 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
8d6765cf
S
1933 description = xpath_text(
1934 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1935 thumbnail = xpath_text(
1936 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1937 duration = float_or_none(
1938 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1939
47a5cb77
S
1940 formats = []
1941 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1942 format_url = urljoin(xspf_base_url, location.text)
1943 if not format_url:
1944 continue
1945 formats.append({
1946 'url': format_url,
1947 'manifest_url': xspf_url,
1948 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1949 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1950 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1951 })
8d6765cf
S
1952 self._sort_formats(formats)
1953
1954 entries.append({
1955 'id': playlist_id,
1956 'title': title,
1957 'description': description,
1958 'thumbnail': thumbnail,
1959 'duration': duration,
1960 'formats': formats,
1961 })
1962 return entries
1963
1bac3455 1964 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
47a5cb77 1965 res = self._download_xml_handle(
1bac3455 1966 mpd_url, video_id,
1967 note=note or 'Downloading MPD manifest',
1968 errnote=errnote or 'Failed to download MPD manifest',
2d2fa82d 1969 fatal=fatal)
1bac3455 1970 if res is False:
2d2fa82d 1971 return []
47a5cb77 1972 mpd_doc, urlh = res
02dc0a36 1973 mpd_base_url = base_url(urlh.geturl())
1bac3455 1974
91cb6b50 1975 return self._parse_mpd_formats(
47a5cb77 1976 mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
86f4d14f 1977 formats_dict=formats_dict, mpd_url=mpd_url)
2d2fa82d 1978
86f4d14f 1979 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
f0948348
S
1980 """
1981 Parse formats from MPD manifest.
1982 References:
1983 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1984 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1985 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1986 """
1bac3455 1987 if mpd_doc.get('type') == 'dynamic':
1988 return []
2d2fa82d 1989
91cb6b50 1990 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
f14be228 1991
1992 def _add_ns(path):
1993 return self._xpath_ns(path, namespace)
1994
675d0016 1995 def is_drm_protected(element):
1996 return element.find(_add_ns('ContentProtection')) is not None
1997
1bac3455 1998 def extract_multisegment_info(element, ms_parent_info):
1999 ms_info = ms_parent_info.copy()
b4c1d6e8
S
2000
2001 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
 2002 # common attributes and elements. We will only extract those
 2003 # relevant to us.
2004 def extract_common(source):
2005 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2006 if segment_timeline is not None:
2007 s_e = segment_timeline.findall(_add_ns('S'))
2008 if s_e:
2009 ms_info['total_number'] = 0
2010 ms_info['s'] = []
2011 for s in s_e:
2012 r = int(s.get('r', 0))
2013 ms_info['total_number'] += 1 + r
2014 ms_info['s'].append({
2015 't': int(s.get('t', 0)),
2016 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2017 'd': int(s.attrib['d']),
2018 'r': r,
2019 })
2020 start_number = source.get('startNumber')
2021 if start_number:
2022 ms_info['start_number'] = int(start_number)
2023 timescale = source.get('timescale')
2024 if timescale:
2025 ms_info['timescale'] = int(timescale)
2026 segment_duration = source.get('duration')
2027 if segment_duration:
48504785 2028 ms_info['segment_duration'] = float(segment_duration)
b4c1d6e8
S
2029
2030 def extract_Initialization(source):
2031 initialization = source.find(_add_ns('Initialization'))
2032 if initialization is not None:
2033 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2034
f14be228 2035 segment_list = element.find(_add_ns('SegmentList'))
1bac3455 2036 if segment_list is not None:
b4c1d6e8
S
2037 extract_common(segment_list)
2038 extract_Initialization(segment_list)
f14be228 2039 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1bac3455 2040 if segment_urls_e:
2041 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1bac3455 2042 else:
f14be228 2043 segment_template = element.find(_add_ns('SegmentTemplate'))
1bac3455 2044 if segment_template is not None:
b4c1d6e8 2045 extract_common(segment_template)
e228616c
S
2046 media = segment_template.get('media')
2047 if media:
2048 ms_info['media'] = media
1bac3455 2049 initialization = segment_template.get('initialization')
2050 if initialization:
e228616c 2051 ms_info['initialization'] = initialization
1bac3455 2052 else:
b4c1d6e8 2053 extract_Initialization(segment_template)
1bac3455 2054 return ms_info
b323e170 2055
1bac3455 2056 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
17b598d3 2057 formats = []
f14be228 2058 for period in mpd_doc.findall(_add_ns('Period')):
1bac3455 2059 period_duration = parse_duration(period.get('duration')) or mpd_duration
2060 period_ms_info = extract_multisegment_info(period, {
2061 'start_number': 1,
2062 'timescale': 1,
2063 })
f14be228 2064 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
675d0016 2065 if is_drm_protected(adaptation_set):
2066 continue
1bac3455 2067 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
f14be228 2068 for representation in adaptation_set.findall(_add_ns('Representation')):
675d0016 2069 if is_drm_protected(representation):
2070 continue
1bac3455 2071 representation_attrib = adaptation_set.attrib.copy()
2072 representation_attrib.update(representation.attrib)
f0948348 2073 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
a6c8b759
YCH
2074 mime_type = representation_attrib['mimeType']
2075 content_type = mime_type.split('/')[0]
1bac3455 2076 if content_type == 'text':
2077 # TODO implement WebVTT downloading
2078 pass
40fcba5e 2079 elif content_type in ('video', 'audio'):
1bac3455 2080 base_url = ''
2081 for element in (representation, adaptation_set, period, mpd_doc):
f14be228 2082 base_url_e = element.find(_add_ns('BaseURL'))
1bac3455 2083 if base_url_e is not None:
2084 base_url = base_url_e.text + base_url
2085 if re.match(r'^https?://', base_url):
2086 break
bb20526b
S
2087 if mpd_base_url and not re.match(r'^https?://', base_url):
2088 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2089 mpd_base_url += '/'
1bac3455 2090 base_url = mpd_base_url + base_url
2091 representation_id = representation_attrib.get('id')
d577c796 2092 lang = representation_attrib.get('lang')
51e9094f 2093 url_el = representation.find(_add_ns('BaseURL'))
2094 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
e228616c 2095 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1bac3455 2096 f = {
154c209e 2097 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1bac3455 2098 'url': base_url,
86f4d14f 2099 'manifest_url': mpd_url,
a6c8b759 2100 'ext': mimetype2ext(mime_type),
1bac3455 2101 'width': int_or_none(representation_attrib.get('width')),
2102 'height': int_or_none(representation_attrib.get('height')),
9c99bef7 2103 'tbr': float_or_none(bandwidth, 1000),
1bac3455 2104 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2105 'fps': int_or_none(representation_attrib.get('frameRate')),
d577c796 2106 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1bac3455 2107 'format_note': 'DASH %s' % content_type,
51e9094f 2108 'filesize': filesize,
126f225b 2109 'container': mimetype2ext(mime_type) + '_dash',
1bac3455 2110 }
7fe15920 2111 f.update(parse_codecs(representation_attrib.get('codecs')))
1bac3455 2112 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
b4c1d6e8 2113
e228616c 2114 def prepare_template(template_name, identifiers):
eca1f0d1
S
2115 tmpl = representation_ms_info[template_name]
 2116 # First off, % characters outside $...$ templates
 2117 # must be escaped by doubling for proper processing
 2118 # by the % operator string formatting used further down (see
2119 # https://github.com/rg3/youtube-dl/issues/16867).
2120 t = ''
2121 in_template = False
2122 for c in tmpl:
2123 t += c
2124 if c == '$':
2125 in_template = not in_template
2126 elif c == '%' and not in_template:
2127 t += c
2128 # Next, $...$ templates are translated to their
2129 # %(...) counterparts to be used with % operator
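# For instance (illustrative): '$Number%05d$.mp4' becomes '%(Number)05d.mp4'
# and 'seg-$Time$.m4s' becomes 'seg-%(Time)d.m4s', so fragment URLs can later
# be produced with plain % string formatting.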
e228616c
S
2130 t = t.replace('$RepresentationID$', representation_id)
2131 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2132 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
 2133 t = t.replace('$$', '$')
2134 return t
2135
2136 # @initialization is a regular template like @media one
2137 # so it should be handled just the same way (see
2138 # https://github.com/rg3/youtube-dl/issues/11605)
2139 if 'initialization' in representation_ms_info:
2140 initialization_template = prepare_template(
2141 'initialization',
2142 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2143 # $Time$ shall not be included for @initialization thus
2144 # only $Bandwidth$ remains
2145 ('Bandwidth', ))
2146 representation_ms_info['initialization_url'] = initialization_template % {
2147 'Bandwidth': bandwidth,
2148 }
2149
1141e910
S
2150 def location_key(location):
2151 return 'url' if re.match(r'^https?://', location) else 'path'
2152
e228616c
S
2153 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2154
2155 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1141e910 2156 media_location_key = location_key(media_template)
f0948348
S
2157
2158 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2159 # can't be used at the same time
b4c1d6e8
S
2160 if '%(Number' in media_template and 's' not in representation_ms_info:
2161 segment_duration = None
c110944f 2162 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
b4c1d6e8
S
2163 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2164 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
b4c1d6e8 2165 representation_ms_info['fragments'] = [{
1141e910 2166 media_location_key: media_template % {
b4c1d6e8 2167 'Number': segment_number,
e228616c 2168 'Bandwidth': bandwidth,
b4c1d6e8
S
2169 },
2170 'duration': segment_duration,
2171 } for segment_number in range(
2172 representation_ms_info['start_number'],
2173 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
f0948348 2174 else:
b4c1d6e8
S
2175 # $Number*$ or $Time$ in media template with S list available
2176 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2177 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
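# Sketch of the expansion below (illustrative values): <S t="0" d="5000" r="2"/>
# with timescale 1000 yields three 5-second fragments starting at times
# 0, 5000 and 10000.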
b4c1d6e8 2178 representation_ms_info['fragments'] = []
f0948348 2179 segment_time = 0
b4c1d6e8
S
2180 segment_d = None
2181 segment_number = representation_ms_info['start_number']
f0948348
S
2182
2183 def add_segment_url():
b4c1d6e8
S
2184 segment_url = media_template % {
2185 'Time': segment_time,
e228616c 2186 'Bandwidth': bandwidth,
b4c1d6e8
S
2187 'Number': segment_number,
2188 }
b4c1d6e8 2189 representation_ms_info['fragments'].append({
1141e910 2190 media_location_key: segment_url,
b4c1d6e8
S
2191 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2192 })
f0948348
S
2193
2194 for num, s in enumerate(representation_ms_info['s']):
2195 segment_time = s.get('t') or segment_time
b4c1d6e8 2196 segment_d = s['d']
f0948348 2197 add_segment_url()
b4c1d6e8 2198 segment_number += 1
f0948348 2199 for r in range(s.get('r', 0)):
b4c1d6e8 2200 segment_time += segment_d
f0948348 2201 add_segment_url()
b4c1d6e8
S
2202 segment_number += 1
2203 segment_time += segment_d
2204 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2205 # No media template
2206 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2207 # or any YouTube dashsegments video
2208 fragments = []
d04621da
S
2209 segment_index = 0
2210 timescale = representation_ms_info['timescale']
2211 for s in representation_ms_info['s']:
2212 duration = float_or_none(s['d'], timescale)
b4c1d6e8 2213 for r in range(s.get('r', 0) + 1):
1141e910 2214 segment_uri = representation_ms_info['segment_urls'][segment_index]
b4c1d6e8 2215 fragments.append({
1141e910 2216 location_key(segment_uri): segment_uri,
d04621da 2217 'duration': duration,
b4c1d6e8 2218 })
d04621da 2219 segment_index += 1
b4c1d6e8 2220 representation_ms_info['fragments'] = fragments
41bf647e
PN
2221 elif 'segment_urls' in representation_ms_info:
2222 # Segment URLs with no SegmentTimeline
2223 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
78593e29 2224 # https://github.com/rg3/youtube-dl/pull/14844
41bf647e 2225 fragments = []
603fc4e0
S
2226 segment_duration = float_or_none(
2227 representation_ms_info['segment_duration'],
2228 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
41bf647e 2229 for segment_url in representation_ms_info['segment_urls']:
603fc4e0 2230 fragment = {
41bf647e 2231 location_key(segment_url): segment_url,
603fc4e0
S
2232 }
2233 if segment_duration:
2234 fragment['duration'] = segment_duration
2235 fragments.append(fragment)
41bf647e 2236 representation_ms_info['fragments'] = fragments
86f4d14f
S
2237 # NB: MPD manifest may contain direct URLs to unfragmented media.
2238 # No fragments key is present in this case.
2239 if 'fragments' in representation_ms_info:
1bac3455 2240 f.update({
1141e910 2241 'fragment_base_url': base_url,
b4c1d6e8 2242 'fragments': [],
1bac3455 2243 'protocol': 'http_dash_segments',
df374b52 2244 })
1bac3455 2245 if 'initialization_url' in representation_ms_info:
e228616c 2246 initialization_url = representation_ms_info['initialization_url']
1bac3455 2247 if not f.get('url'):
2248 f['url'] = initialization_url
1141e910 2249 f['fragments'].append({location_key(initialization_url): initialization_url})
b4c1d6e8 2250 f['fragments'].extend(representation_ms_info['fragments'])
9d6ac71c
S
2251 # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2252 # is not necessarily unique within a Period thus formats with
2253 # the same `format_id` are quite possible. There are numerous examples
2254 # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2255 # https://github.com/rg3/youtube-dl/issues/13919)
2256 full_info = formats_dict.get(representation_id, {}).copy()
2257 full_info.update(f)
2258 formats.append(full_info)
17b598d3 2259 else:
1bac3455 2260 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
17b598d3
YCH
2261 return formats
2262
b2758123 2263 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
47a5cb77 2264 res = self._download_xml_handle(
b2758123
RA
2265 ism_url, video_id,
2266 note=note or 'Downloading ISM manifest',
2267 errnote=errnote or 'Failed to download ISM manifest',
2268 fatal=fatal)
2269 if res is False:
2270 return []
47a5cb77 2271 ism_doc, urlh = res
b2758123 2272
47a5cb77 2273 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
b2758123
RA
2274
2275 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
76d5a363
S
2276 """
2277 Parse formats from ISM manifest.
2278 References:
2279 1. [MS-SSTR]: Smooth Streaming Protocol,
2280 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2281 """
b2758123
RA
2282 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2283 return []
2284
b2758123
RA
2285 duration = int(ism_doc.attrib['Duration'])
2286 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2287
2288 formats = []
2289 for stream in ism_doc.findall('StreamIndex'):
2290 stream_type = stream.get('Type')
2291 if stream_type not in ('video', 'audio'):
2292 continue
2293 url_pattern = stream.attrib['Url']
2294 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2295 stream_name = stream.get('Name')
2296 for track in stream.findall('QualityLevel'):
2501d41e 2297 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
b2758123
RA
2298 # TODO: add support for WVC1 and WMAP
2299 if fourcc not in ('H264', 'AVC1', 'AACL'):
2300 self.report_warning('%s is not a supported codec' % fourcc)
2301 continue
2302 tbr = int(track.attrib['Bitrate']) // 1000
76d5a363
S
2303 # [1] does not mention Width and Height attributes. However,
2304 # they're often present while MaxWidth and MaxHeight are
2305 # missing, so should be used as fallbacks
2306 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2307 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
b2758123
RA
2308 sampling_rate = int_or_none(track.get('SamplingRate'))
2309
2310 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2311 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2312
2313 fragments = []
2314 fragment_ctx = {
2315 'time': 0,
2316 }
2317 stream_fragments = stream.findall('c')
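# Sketch of the timing expansion below (illustrative values): <c t="0" d="20000000" r="3"/>
# with a stream TimeScale of 10000000 produces three 2-second fragments
# starting at t=0, 20000000 and 40000000.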
2318 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2319 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2320 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2321 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2322 if not fragment_ctx['duration']:
2323 try:
2324 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2325 except IndexError:
2326 next_fragment_time = duration
1616f9b4 2327 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
b2758123
RA
2328 for _ in range(fragment_repeat):
2329 fragments.append({
1616f9b4 2330 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
b2758123
RA
2331 'duration': fragment_ctx['duration'] / stream_timescale,
2332 })
2333 fragment_ctx['time'] += fragment_ctx['duration']
2334
2335 format_id = []
2336 if ism_id:
2337 format_id.append(ism_id)
2338 if stream_name:
2339 format_id.append(stream_name)
2340 format_id.append(compat_str(tbr))
2341
2342 formats.append({
2343 'format_id': '-'.join(format_id),
2344 'url': ism_url,
2345 'manifest_url': ism_url,
2346 'ext': 'ismv' if stream_type == 'video' else 'isma',
2347 'width': width,
2348 'height': height,
2349 'tbr': tbr,
2350 'asr': sampling_rate,
2351 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2352 'acodec': 'none' if stream_type == 'video' else fourcc,
2353 'protocol': 'ism',
2354 'fragments': fragments,
2355 '_download_params': {
2356 'duration': duration,
2357 'timescale': stream_timescale,
2358 'width': width or 0,
2359 'height': height or 0,
2360 'fourcc': fourcc,
2361 'codec_private_data': track.get('CodecPrivateData'),
2362 'sampling_rate': sampling_rate,
2363 'channels': int_or_none(track.get('Channels', 2)),
2364 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2365 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2366 },
2367 })
2368 return formats
2369
eeb0a956 2370 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
6780154e
S
2371 def absolute_url(item_url):
2372 return urljoin(base_url, item_url)
59bbe491 2373
2374 def parse_content_type(content_type):
2375 if not content_type:
2376 return {}
2377 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2378 if ctr:
2379 mimetype, codecs = ctr.groups()
2380 f = parse_codecs(codecs)
2381 f['ext'] = mimetype2ext(mimetype)
2382 return f
2383 return {}
2384
868f79db 2385 def _media_formats(src, cur_media_type, type_info={}):
520251c0 2386 full_url = absolute_url(src)
82889d4a 2387 ext = type_info.get('ext') or determine_ext(full_url)
87a449c1 2388 if ext == 'm3u8':
520251c0
YCH
2389 is_plain_url = False
2390 formats = self._extract_m3u8_formats(
ad120ae1 2391 full_url, video_id, ext='mp4',
eeb0a956 2392 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
b359e977 2393 preference=preference, fatal=False)
87a449c1
S
2394 elif ext == 'mpd':
2395 is_plain_url = False
2396 formats = self._extract_mpd_formats(
b359e977 2397 full_url, video_id, mpd_id=mpd_id, fatal=False)
520251c0
YCH
2398 else:
2399 is_plain_url = True
2400 formats = [{
2401 'url': full_url,
2402 'vcodec': 'none' if cur_media_type == 'audio' else None,
2403 }]
2404 return is_plain_url, formats
2405
59bbe491 2406 entries = []
4328ddf8
S
2407 # amp-video and amp-audio are very similar to their HTML5 counterparts
 2408 # so we will include them right here (see
2409 # https://www.ampproject.org/docs/reference/components/amp-video)
cea364f7
YCH
2410 media_tags = [(media_tag, media_type, '')
2411 for media_tag, media_type
4328ddf8 2412 in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2aec7256
S
2413 media_tags.extend(re.findall(
2414 # We only allow video|audio followed by a whitespace or '>'.
 2415 # Allowing more characters may result in a significant slowdown (see
2416 # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2417 # http://www.porntrex.com/maps/videositemap.xml).
4328ddf8 2418 r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
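# e.g. (illustrative markup) '<video src="clip.mp4"></video>' and
# '<amp-video width="640"><source src="clip.m3u8"></amp-video>' both match
# this second pattern; self-closing tags such as '<amp-video src="clip.mp4"/>'
# are caught by the findall above.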
cea364f7 2419 for media_tag, media_type, media_content in media_tags:
59bbe491 2420 media_info = {
2421 'formats': [],
2422 'subtitles': {},
2423 }
2424 media_attributes = extract_attributes(media_tag)
2425 src = media_attributes.get('src')
2426 if src:
dedb1770 2427 _, formats = _media_formats(src, media_type)
520251c0 2428 media_info['formats'].extend(formats)
6780154e 2429 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
59bbe491 2430 if media_content:
2431 for source_tag in re.findall(r'<source[^>]+>', media_content):
2432 source_attributes = extract_attributes(source_tag)
2433 src = source_attributes.get('src')
2434 if not src:
2435 continue
82889d4a 2436 f = parse_content_type(source_attributes.get('type'))
868f79db 2437 is_plain_url, formats = _media_formats(src, media_type, f)
520251c0 2438 if is_plain_url:
dd121cc1
S
2439 # res attribute is not standard but seen several times
2440 # in the wild
1ed45499
S
2441 f.update({
2442 'height': int_or_none(source_attributes.get('res')),
2443 'format_id': source_attributes.get('label'),
2444 })
520251c0
YCH
2445 f.update(formats[0])
2446 media_info['formats'].append(f)
2447 else:
2448 media_info['formats'].extend(formats)
59bbe491 2449 for track_tag in re.findall(r'<track[^>]+>', media_content):
2450 track_attributes = extract_attributes(track_tag)
2451 kind = track_attributes.get('kind')
5968d7d2 2452 if not kind or kind in ('subtitles', 'captions'):
59bbe491 2453 src = track_attributes.get('src')
2454 if not src:
2455 continue
2456 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2457 media_info['subtitles'].setdefault(lang, []).append({
2458 'url': absolute_url(src),
2459 })
5e8e2fa5
S
2460 for f in media_info['formats']:
2461 f.setdefault('http_headers', {})['Referer'] = base_url
5968d7d2 2462 if media_info['formats'] or media_info['subtitles']:
59bbe491 2463 entries.append(media_info)
2464 return entries
2465
c4251b9a 2466 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
c7c43a93 2467 formats = []
e71a4509 2468 hdcore_sign = 'hdcore=3.7.0'
ff6f9a67 2469 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
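# The HDS manifest URL is derived from the HLS one by swapping the path
# markers, e.g. (illustrative, host made up):
#   'http://ex.akamaihd.net/i/foo/,500,.mp4.csmil/master.m3u8'
#   -> 'http://ex.akamaihd.net/z/foo/,500,.mp4.csmil/manifest.f4m'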
c4251b9a
RA
2470 hds_host = hosts.get('hds')
2471 if hds_host:
2472 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
e71a4509
RA
2473 if 'hdcore=' not in f4m_url:
2474 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2475 f4m_formats = self._extract_f4m_formats(
2476 f4m_url, video_id, f4m_id='hds', fatal=False)
2477 for entry in f4m_formats:
2478 entry.update({'extra_param_to_segment_url': hdcore_sign})
2479 formats.extend(f4m_formats)
c4251b9a
RA
2480 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2481 hls_host = hosts.get('hls')
2482 if hls_host:
2483 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
c7c43a93
RA
2484 formats.extend(self._extract_m3u8_formats(
2485 m3u8_url, video_id, 'mp4', 'm3u8_native',
2486 m3u8_id='hls', fatal=False))
2487 return formats
2488
6ad02195 2489 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
044eeb14 2490 query = compat_urlparse.urlparse(url).query
6ad02195 2491 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
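# e.g. (illustrative) 'http://example.com/vod/mp4:clip.mp4/playlist.m3u8'
# is reduced to 'http://example.com/vod/mp4:clip.mp4', from which the
# protocol-specific manifest URLs are rebuilt below.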
240f2622
S
2492 mobj = re.search(
2493 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2494 url_base = mobj.group('url')
2495 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
6ad02195 2496 formats = []
044eeb14
S
2497
2498 def manifest_url(manifest):
2499 m_url = '%s/%s' % (http_base_url, manifest)
2500 if query:
2501 m_url += '?%s' % query
2502 return m_url
2503
6ad02195
RA
2504 if 'm3u8' not in skip_protocols:
2505 formats.extend(self._extract_m3u8_formats(
044eeb14 2506 manifest_url('playlist.m3u8'), video_id, 'mp4',
6ad02195
RA
2507 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2508 if 'f4m' not in skip_protocols:
2509 formats.extend(self._extract_f4m_formats(
044eeb14 2510 manifest_url('manifest.f4m'),
6ad02195 2511 video_id, f4m_id='hds', fatal=False))
0384932e
RA
2512 if 'dash' not in skip_protocols:
2513 formats.extend(self._extract_mpd_formats(
044eeb14 2514 manifest_url('manifest.mpd'),
0384932e 2515 video_id, mpd_id='dash', fatal=False))
6ad02195 2516 if re.search(r'(?:/smil:|\.smil)', url_base):
6ad02195
RA
2517 if 'smil' not in skip_protocols:
2518 rtmp_formats = self._extract_smil_formats(
044eeb14 2519 manifest_url('jwplayer.smil'),
6ad02195
RA
2520 video_id, fatal=False)
2521 for rtmp_format in rtmp_formats:
2522 rtsp_format = rtmp_format.copy()
2523 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2524 del rtsp_format['play_path']
2525 del rtsp_format['ext']
2526 rtsp_format.update({
2527 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2528 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2529 'protocol': 'rtsp',
2530 })
2531 formats.extend([rtmp_format, rtsp_format])
2532 else:
2533 for protocol in ('rtmp', 'rtsp'):
2534 if protocol not in skip_protocols:
2535 formats.append({
f2e2f0c7 2536 'url': '%s:%s' % (protocol, url_base),
6ad02195
RA
2537 'format_id': protocol,
2538 'protocol': protocol,
2539 })
2540 return formats
2541
c73e330e 2542 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
a4a554a7 2543 mobj = re.search(
ac9c69ac 2544 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
a4a554a7
YCH
2545 webpage)
2546 if mobj:
c73e330e
RU
2547 try:
2548 jwplayer_data = self._parse_json(mobj.group('options'),
2549 video_id=video_id,
2550 transform_source=transform_source)
2551 except ExtractorError:
2552 pass
2553 else:
2554 if isinstance(jwplayer_data, dict):
2555 return jwplayer_data
a4a554a7
YCH
2556
2557 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
c73e330e
RU
2558 jwplayer_data = self._find_jwplayer_data(
2559 webpage, video_id, transform_source=js_to_json)
a4a554a7
YCH
2560 return self._parse_jwplayer_data(
2561 jwplayer_data, video_id, *args, **kwargs)
2562
2563 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2564 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2565 # JWPlayer backward compatibility: flattened playlists
2566 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2567 if 'playlist' not in jwplayer_data:
2568 jwplayer_data = {'playlist': [jwplayer_data]}
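# i.e. (illustrative) a config like {'file': 'x.mp4', 'title': 'clip'} is
# normalized to {'playlist': [{'file': 'x.mp4', 'title': 'clip'}]} so the
# loop below can treat every config shape uniformly.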
2569
2570 entries = []
2571
2572 # JWPlayer backward compatibility: single playlist item
2573 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2574 if not isinstance(jwplayer_data['playlist'], list):
2575 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2576
2577 for video_data in jwplayer_data['playlist']:
2578 # JWPlayer backward compatibility: flattened sources
2579 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2580 if 'sources' not in video_data:
2581 video_data['sources'] = [video_data]
2582
2583 this_video_id = video_id or video_data['mediaid']
2584
1a2192cb
S
2585 formats = self._parse_jwplayer_formats(
2586 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2587 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
a4a554a7
YCH
2588
2589 subtitles = {}
2590 tracks = video_data.get('tracks')
2591 if tracks and isinstance(tracks, list):
2592 for track in tracks:
96a2daa1
S
2593 if not isinstance(track, dict):
2594 continue
f4b74272
S
2595 track_kind = track.get('kind')
2596 if not track_kind or not isinstance(track_kind, compat_str):
2597 continue
2598 if track_kind.lower() not in ('captions', 'subtitles'):
a4a554a7
YCH
2599 continue
2600 track_url = urljoin(base_url, track.get('file'))
2601 if not track_url:
2602 continue
2603 subtitles.setdefault(track.get('label') or 'en', []).append({
2604 'url': self._proto_relative_url(track_url)
2605 })
2606
50d808f5 2607 entry = {
a4a554a7 2608 'id': this_video_id,
50d808f5 2609 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
a4a554a7
YCH
2610 'description': video_data.get('description'),
2611 'thumbnail': self._proto_relative_url(video_data.get('image')),
2612 'timestamp': int_or_none(video_data.get('pubdate')),
2613 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2614 'subtitles': subtitles,
50d808f5
RA
2615 }
2616 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2617 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2618 entry.update({
2619 '_type': 'url_transparent',
2620 'url': formats[0]['url'],
2621 })
2622 else:
2623 self._sort_formats(formats)
2624 entry['formats'] = formats
2625 entries.append(entry)
a4a554a7
YCH
2626 if len(entries) == 1:
2627 return entries[0]
2628 else:
2629 return self.playlist_result(entries)
2630
ed0cf9b3
S
2631 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2632 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
bf1b87cd 2633 urls = []
ed0cf9b3 2634 formats = []
1a2192cb 2635 for source in jwplayer_sources_data:
0a268c6e
S
2636 if not isinstance(source, dict):
2637 continue
bf1b87cd
RA
2638 source_url = self._proto_relative_url(source.get('file'))
2639 if not source_url:
2640 continue
ed0cf9b3
S
2641 if base_url:
2642 source_url = compat_urlparse.urljoin(base_url, source_url)
bf1b87cd
RA
2643 if source_url in urls:
2644 continue
2645 urls.append(source_url)
ed0cf9b3
S
2646 source_type = source.get('type') or ''
2647 ext = mimetype2ext(source_type) or determine_ext(source_url)
2648 if source_type == 'hls' or ext == 'm3u8':
2649 formats.extend(self._extract_m3u8_formats(
0236cd0d
S
2650 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2651 m3u8_id=m3u8_id, fatal=False))
0d9c48de 2652 elif source_type == 'dash' or ext == 'mpd':
ed0cf9b3
S
2653 formats.extend(self._extract_mpd_formats(
2654 source_url, video_id, mpd_id=mpd_id, fatal=False))
b51dc9db
S
2655 elif ext == 'smil':
2656 formats.extend(self._extract_smil_formats(
2657 source_url, video_id, fatal=False))
ed0cf9b3 2658 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
0236cd0d
S
2659 elif source_type.startswith('audio') or ext in (
2660 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
ed0cf9b3
S
2661 formats.append({
2662 'url': source_url,
2663 'vcodec': 'none',
2664 'ext': ext,
2665 })
2666 else:
2667 height = int_or_none(source.get('height'))
2668 if height is None:
2669 # Often no height is provided but there is a label in
0236cd0d 2670 # a format like "1080p", "720p SD", or 1080.
ed0cf9b3 2671 height = int_or_none(self._search_regex(
0236cd0d 2672 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
ed0cf9b3
S
2673 'height', default=None))
2674 a_format = {
2675 'url': source_url,
2676 'width': int_or_none(source.get('width')),
2677 'height': height,
0236cd0d 2678 'tbr': int_or_none(source.get('bitrate')),
2679 'ext': ext,
2680 }
2681 if source_url.startswith('rtmp'):
2682 a_format['ext'] = 'flv'
2683 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2684 # of jwplayer.flash.swf
2685 rtmp_url_parts = re.split(
2686 r'((?:mp4|mp3|flv):)', source_url, 1)
2687 if len(rtmp_url_parts) == 3:
2688 rtmp_url, prefix, play_path = rtmp_url_parts
2689 a_format.update({
2690 'url': rtmp_url,
2691 'play_path': prefix + play_path,
2692 })
2693 if rtmp_params:
2694 a_format.update(rtmp_params)
2695 formats.append(a_format)
2696 return formats
2697
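# Illustrative sketch of the input _parse_jwplayer_formats() expects: a
# jwplayer "sources" array, here with made-up URLs. HLS/DASH/SMIL entries are
# expanded via the corresponding manifest extractors; everything else becomes
# a single progressive format.
#
#   sources = [
#       {'file': '//cdn.example.com/master.m3u8', 'type': 'hls'},
#       {'file': 'https://cdn.example.com/clip-720.mp4', 'label': '720p',
#        'width': 1280, 'height': 720, 'bitrate': 2500},
#   ]
#   formats = self._parse_jwplayer_formats(
#       sources, video_id, m3u8_id='hls', base_url=url)
#   self._sort_formats(formats)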
2698 def _live_title(self, name):
2699 """ Generate the title for a live video """
2700 now = datetime.datetime.now()
611c1dd9 2701 now_str = now.strftime('%Y-%m-%d %H:%M')
2702 return name + ' ' + now_str
2703
2704 def _int(self, v, name, fatal=False, **kwargs):
2705 res = int_or_none(v, **kwargs)
2708 if res is None:
2709 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2710 if fatal:
2711 raise ExtractorError(msg)
2712 else:
2713 self._downloader.report_warning(msg)
2714 return res
2715
2716 def _float(self, v, name, fatal=False, **kwargs):
2717 res = float_or_none(v, **kwargs)
2718 if res is None:
2719 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2720 if fatal:
2721 raise ExtractorError(msg)
2722 else:
2723 self._downloader.report_warning(msg)
2724 return res
2725
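# Illustrative calls (the field names are hypothetical): _int()/_float() wrap
# int_or_none()/float_or_none() but additionally warn about values that cannot
# be parsed, or raise when fatal=True.
#
#   width = self._int(video_data.get('width'), 'width')    # None plus a warning on junk
#   duration = self._float(video_data.get('length'), 'duration', fatal=True)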
2726 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2727 path='/', secure=False, discard=False, rest={}, **kwargs):
810fb84d 2728 cookie = compat_cookiejar.Cookie(
4ed2d7b7 2729 0, name, value, port, port is not None, domain, True,
2730 domain.startswith('.'), path, True, secure, expire_time,
2731 discard, None, None, rest)
2732 self._downloader.cookiejar.set_cookie(cookie)
2733
799207e8 2734 def _get_cookies(self, url):
2735 """ Return a compat_cookies.SimpleCookie with the cookies for the url """
5c2266df 2736 req = sanitized_Request(url)
799207e8 2737 self._downloader.cookiejar.add_cookie_header(req)
2738 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2739
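# Illustrative sketch (the cookie and site names are made up): an extractor can
# plant a cookie before fetching a page, or inspect what the site has set:
#
#   self._set_cookie('example.com', 'age_verified', '1')
#   webpage = self._download_webpage(url, video_id)
#   cookies = self._get_cookies('https://example.com/')
#   session = cookies.get('session')            # a Morsel, or None if absent
#   token = session.value if session else None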
2740 def get_testcases(self, include_onlymatching=False):
2741 t = getattr(self, '_TEST', None)
2742 if t:
2743 assert not hasattr(self, '_TESTS'), \
2744 '%s has _TEST and _TESTS' % type(self).__name__
2745 tests = [t]
2746 else:
2747 tests = getattr(self, '_TESTS', [])
2748 for t in tests:
2749 if not include_onlymatching and t.get('only_matching', False):
2750 continue
2751 t['name'] = type(self).__name__[:-len('IE')]
2752 yield t
2753
2754 def is_suitable(self, age_limit):
2755 """ Test whether the extractor is generally suitable for the given
2756 age limit (i.e. pornographic sites are not, all others usually are) """
2757
2758 any_restricted = False
2759 for tc in self.get_testcases(include_onlymatching=False):
40090e8d 2760 if tc.get('playlist', []):
2761 tc = tc['playlist'][0]
2762 is_restricted = age_restricted(
2763 tc.get('info_dict', {}).get('age_limit'), age_limit)
2764 if not is_restricted:
2765 return True
2766 any_restricted = any_restricted or is_restricted
2767 return not any_restricted
2768
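# Illustrative sketch: is_suitable() inspects the age_limit declared in the
# extractor's test cases, so an adult-only site would typically carry a test
# case like the following (hypothetical values) and then be filtered out when
# extractors are listed with an --age-limit below 18:
#
#   _TEST = {
#       'url': 'https://adult.example.com/video/123',
#       'info_dict': {
#           'id': '123',
#           'ext': 'mp4',
#           'title': 'some title',
#           'age_limit': 18,
#       },
#   }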
a504ced0 2769 def extract_subtitles(self, *args, **kwargs):
2770 if (self._downloader.params.get('writesubtitles', False) or
2771 self._downloader.params.get('listsubtitles')):
2772 return self._get_subtitles(*args, **kwargs)
2773 return {}
2774
2775 def _get_subtitles(self, *args, **kwargs):
611c1dd9 2776 raise NotImplementedError('This method must be implemented by subclasses')
a504ced0 2777
2778 @staticmethod
2779 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2780 """ Merge subtitle items for one language. Items with duplicated URLs
2781 will be dropped. """
2782 list1_urls = set([item['url'] for item in subtitle_list1])
2783 ret = list(subtitle_list1)
2784 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2785 return ret
2786
2787 @classmethod
8c97f819 2788 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
912e0b7e 2789 """ Merge two subtitle dictionaries, language by language. """
2790 ret = dict(subtitle_dict1)
2791 for lang in subtitle_dict2:
8c97f819 2792 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2793 return ret
2794
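# Illustrative merge (made-up URLs):
#
#   a = {'en': [{'url': 'https://example.com/en.vtt', 'ext': 'vtt'}]}
#   b = {'en': [{'url': 'https://example.com/en.vtt', 'ext': 'vtt'},
#               {'url': 'https://example.com/en.srt', 'ext': 'srt'}],
#        'de': [{'url': 'https://example.com/de.vtt', 'ext': 'vtt'}]}
#   merged = InfoExtractor._merge_subtitles(a, b)
#   # -> 'en' keeps its .vtt item and gains the .srt one (the duplicate URL is
#   #    dropped); 'de' is carried over unchanged.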
360e1ca5 2795 def extract_automatic_captions(self, *args, **kwargs):
2796 if (self._downloader.params.get('writeautomaticsub', False) or
2797 self._downloader.params.get('listsubtitles')):
2798 return self._get_automatic_captions(*args, **kwargs)
2799 return {}
2800
2801 def _get_automatic_captions(self, *args, **kwargs):
611c1dd9 2802 raise NotImplementedError('This method must be implemented by subclasses')
360e1ca5 2803
2804 def mark_watched(self, *args, **kwargs):
2805 if (self._downloader.params.get('mark_watched', False) and
2806 (self._get_login_info()[0] is not None or
2807 self._downloader.params.get('cookiefile') is not None)):
2808 self._mark_watched(*args, **kwargs)
2809
2810 def _mark_watched(self, *args, **kwargs):
2811 raise NotImplementedError('This method must be implemented by subclasses')
2812
2813 def geo_verification_headers(self):
2814 headers = {}
2815 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2816 if geo_verification_proxy:
2817 headers['Ytdl-request-proxy'] = geo_verification_proxy
2818 return headers
2819
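# Illustrative use: extractors attach these headers to the request that
# performs the geo check, so that --geo-verification-proxy is applied to that
# request only while the actual download keeps using --proxy:
#
#   webpage = self._download_webpage(
#       url, video_id, headers=self.geo_verification_headers())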
2820 def _generic_id(self, url):
2821 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2822
2823 def _generic_title(self, url):
2824 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2825
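# Illustrative result (made-up URL):
#
#   self._generic_id('https://example.com/media/Some%20Clip.mp4')
#   # -> 'Some Clip' (percent-decoded basename without the extension); the
#   #    generic extractor uses this as a fallback id/title.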
8dbe9899 2826
2827class SearchInfoExtractor(InfoExtractor):
2828 """
 2829 Base class for paged search query extractors.
10952eb2 2830 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
2831 Instances should define _SEARCH_KEY and _MAX_RESULTS.
2832 """
2833
2834 @classmethod
2835 def _make_valid_url(cls):
2836 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2837
2838 @classmethod
2839 def suitable(cls, url):
2840 return re.match(cls._make_valid_url(), url) is not None
2841
2842 def _real_extract(self, query):
2843 mobj = re.match(self._make_valid_url(), query)
2844 if mobj is None:
f1a9d64e 2845 raise ExtractorError('Invalid search query "%s"' % query)
2846
2847 prefix = mobj.group('prefix')
2848 query = mobj.group('query')
2849 if prefix == '':
2850 return self._get_n_results(query, 1)
2851 elif prefix == 'all':
2852 return self._get_n_results(query, self._MAX_RESULTS)
2853 else:
2854 n = int(prefix)
2855 if n <= 0:
f1a9d64e 2856 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
d6983cb4 2857 elif n > self._MAX_RESULTS:
f1a9d64e 2858 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2859 n = self._MAX_RESULTS
2860 return self._get_n_results(query, n)
2861
2862 def _get_n_results(self, query, n):
2863 """Get a specified number of results for a query"""
611c1dd9 2864 raise NotImplementedError('This method must be implemented by subclasses')
2865
2866 @property
2867 def SEARCH_KEY(self):
2868 return self._SEARCH_KEY
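# A minimal sketch of a SearchInfoExtractor subclass (the site, API endpoint
# and response shape below are assumptions made for illustration only): define
# _SEARCH_KEY and _MAX_RESULTS and implement _get_n_results(), and the base
# class takes care of parsing "exsearch3:some query"-style URLs.
class _ExampleSearchIE(SearchInfoExtractor):
    IE_NAME = 'example:search'
    _SEARCH_KEY = 'exsearch'  # enables exsearchN:query, exsearchall:query, ...
    _MAX_RESULTS = 50

    def _get_n_results(self, query, n):
        # Hypothetical JSON search endpoint; only _download_json(),
        # url_result() and playlist_result() are real helpers.
        data = self._download_json(
            'https://api.example.com/search?' + compat_urllib_parse_urlencode({
                'q': query,
                'limit': n,
            }), query)
        entries = [
            self.url_result(result['url'], video_title=result.get('title'))
            for result in (data.get('results') or [])[:n]]
        return self.playlist_result(entries, playlist_title=query)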