2 from __future__
import unicode_literals
16 from ..compat
import (
17 compat_cookiejar_Cookie
,
18 compat_cookies_SimpleCookie
,
20 compat_etree_fromstring
,
27 compat_urllib_parse_unquote
,
28 compat_urllib_parse_urlencode
,
29 compat_urllib_request
,
31 compat_xml_parse_error
,
33 from ..downloader
import FileDownloader
34 from ..downloader
.f4m
import (
36 remove_encrypted_media
,
65 parse_m3u8_attributes
,
87 class InfoExtractor(object):
88 """Information Extractor class.
90 Information extractors are the classes that, given a URL, extract
91 information about the video (or videos) the URL refers to. This
92 information includes the real video URL, the video title, author and
93 others. The information is stored in a dictionary which is then
94 passed to the YoutubeDL. The YoutubeDL processes this
95 information possibly downloading the video to the file system, among
96 other possible outcomes.
98 The type field determines the type of the result.
99 By far the most common value (and the default if _type is missing) is
100 "video", which indicates a single video.
102 For a video, the dictionaries must include the following fields:
104 id: Video identifier.
105 title: Video title, unescaped.
107 Additionally, it must contain either a formats entry or a url one:
109 formats: A list of dictionaries for each format available, ordered
110 from worst to best quality.
113 * url The mandatory URL representing the media:
114 for plain file media - HTTP URL of this file,
116 for HLS - URL of the M3U8 media playlist,
117 for HDS - URL of the F4M manifest,
119 - HTTP URL to plain file media (in case of
121 - URL of the MPD manifest or base URL
122 representing the media if MPD manifest
123 is parsed from a string (in case of
125 for MSS - URL of the ISM manifest.
127 The URL of the manifest file in case of
129 for HLS - URL of the M3U8 master playlist,
130 for HDS - URL of the F4M manifest,
131 for DASH - URL of the MPD manifest,
132 for MSS - URL of the ISM manifest.
133 * ext Will be calculated from URL if missing
134 * format A human-readable description of the format
135 ("mp4 container with h264/opus").
136 Calculated from the format_id, width, height.
137 and format_note fields if missing.
138 * format_id A short description of the format
139 ("mp4_h264_opus" or "19").
140 Technically optional, but strongly recommended.
141 * format_note Additional info about the format
142 ("3D" or "DASH video")
143 * width Width of the video, if known
144 * height Height of the video, if known
145 * resolution Textual description of width and height
146 * tbr Average bitrate of audio and video in KBit/s
147 * abr Average audio bitrate in KBit/s
148 * acodec Name of the audio codec in use
149 * asr Audio sampling rate in Hertz
150 * vbr Average video bitrate in KBit/s
152 * vcodec Name of the video codec in use
153 * container Name of the container format
154 * filesize The number of bytes, if known in advance
155 * filesize_approx An estimate for the number of bytes
156 * player_url SWF Player URL (used for rtmpdump).
157 * protocol The protocol that will be used for the actual
158 download, lower-case.
159 "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
160 "m3u8", "m3u8_native" or "http_dash_segments".
162 Base URL for fragments. Each fragment's path
163 value (if present) will be relative to
165 * fragments A list of fragments of a fragmented media.
166 Each fragment entry must contain either an url
167 or a path. If an url is present it should be
168 considered by a client. Otherwise both path and
169 fragment_base_url must be present. Here is
170 the list of all potential fields:
171 * "url" - fragment's URL
172 * "path" - fragment's path relative to
174 * "duration" (optional, int or float)
175 * "filesize" (optional, int)
176 * preference Order number of this format. If this field is
177 present and not None, the formats get sorted
178 by this field, regardless of all other values.
179 -1 for default (order by other properties),
180 -2 or smaller for less than default.
181 < -1000 to hide the format (if there is
182 another one which is strictly better)
183 * language Language code, e.g. "de" or "en-US".
184 * language_preference Is this in the language mentioned in
186 10 if it's what the URL is about,
187 -1 for default (don't know),
188 -10 otherwise, other values reserved for now.
189 * quality Order number of the video quality of this
190 format, irrespective of the file format.
191 -1 for default (order by other properties),
192 -2 or smaller for less than default.
193 * source_preference Order number for this video source
194 (quality takes higher priority)
195 -1 for default (order by other properties),
196 -2 or smaller for less than default.
197 * http_headers A dictionary of additional HTTP headers
198 to add to the request.
199 * stretched_ratio If given and not 1, indicates that the
200 video's pixels are not square.
201 width : height ratio as float.
202 * no_resume The server does not support resuming the
203 (HTTP or RTMP) download. Boolean.
204 * downloader_options A dictionary of downloader options as
205 described in FileDownloader
206 RTMP formats can also have the additional fields: page_url,
207 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
208 rtmp_protocol, rtmp_real_time
210 url: Final video URL.
211 ext: Video filename extension.
212 format: The video format, defaults to ext (used for --get-format)
213 player_url: SWF Player URL (used for rtmpdump).
215 The following fields are optional:
217 alt_title: A secondary title of the video.
218 display_id An alternative identifier for the video, not necessarily
219 unique, but available before title. Typically, id is
220 something like "4234987", title "Dancing naked mole rats",
221 and display_id "dancing-naked-mole-rats"
222 thumbnails: A list of dictionaries, with the following entries:
223 * "id" (optional, string) - Thumbnail format ID
225 * "preference" (optional, int) - quality of the image
226 * "width" (optional, int)
227 * "height" (optional, int)
228 * "resolution" (optional, string "{width}x{height}",
230 * "filesize" (optional, int)
231 thumbnail: Full URL to a video thumbnail image.
232 description: Full video description.
233 uploader: Full name of the video uploader.
234 license: License name the video is licensed under.
235 creator: The creator of the video.
236 release_timestamp: UNIX timestamp of the moment the video was released.
237 release_date: The date (YYYYMMDD) when the video was released.
238 timestamp: UNIX timestamp of the moment the video was uploaded
239 upload_date: Video upload date (YYYYMMDD).
240 If not explicitly set, calculated from timestamp.
241 uploader_id: Nickname or id of the video uploader.
242 uploader_url: Full URL to a personal webpage of the video uploader.
243 channel: Full name of the channel the video is uploaded on.
244 Note that channel fields may or may not repeat uploader
245 fields. This depends on a particular extractor.
246 channel_id: Id of the channel.
247 channel_url: Full URL to a channel webpage.
248 location: Physical location where the video was filmed.
249 subtitles: The available subtitles as a dictionary in the format
250 {tag: subformats}. "tag" is usually a language code, and
251 "subformats" is a list sorted from lower to higher
252 preference, each element is a dictionary with the "ext"
254 * "data": The subtitles file contents
255 * "url": A URL pointing to the subtitles file
256 It can optionally also have:
257 * "name": Name or description of the subtitles
258 "ext" will be calculated from URL if missing
259 automatic_captions: Like 'subtitles'; contains automatically generated
260 captions instead of normal subtitles
261 duration: Length of the video in seconds, as an integer or float.
262 view_count: How many users have watched the video on the platform.
263 like_count: Number of positive ratings of the video
264 dislike_count: Number of negative ratings of the video
265 repost_count: Number of reposts of the video
266 average_rating: Average rating given by users, the scale used depends on the webpage
267 comment_count: Number of comments on the video
268 comments: A list of comments, each with one or more of the following
269 properties (all but one of text or html optional):
270 * "author" - human-readable name of the comment author
271 * "author_id" - user ID of the comment author
272 * "author_thumbnail" - The thumbnail of the comment author
274 * "html" - Comment as HTML
275 * "text" - Plain text of the comment
276 * "timestamp" - UNIX timestamp of comment
277 * "parent" - ID of the comment this one is replying to.
278 Set to "root" to indicate that this is a
279 comment to the original video.
280 * "like_count" - Number of positive ratings of the comment
281 * "dislike_count" - Number of negative ratings of the comment
282 * "is_favorited" - Whether the comment is marked as
283 favorite by the video uploader
284 * "author_is_uploader" - Whether the comment is made by
286 age_limit: Age restriction for the video, as an integer (years)
287 webpage_url: The URL to the video webpage, if given to yt-dlp it
288 should allow to get the same result again. (It will be set
289 by YoutubeDL if it's missing)
290 categories: A list of categories that the video falls in, for example
292 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
293 is_live: True, False, or None (=unknown). Whether this video is a
294 live stream that goes on instead of a fixed-length video.
295 was_live: True, False, or None (=unknown). Whether this video was
296 originally a live stream.
297 start_time: Time in seconds where the reproduction should start, as
298 specified in the URL.
299 end_time: Time in seconds where the reproduction should end, as
300 specified in the URL.
301 chapters: A list of dictionaries, with the following entries:
302 * "start_time" - The start time of the chapter in seconds
303 * "end_time" - The end time of the chapter in seconds
304 * "title" (optional, string)
305 playable_in_embed: Whether this video is allowed to play in embedded
306 players on other sites. Can be True (=always allowed),
307 False (=never allowed), None (=unknown), or a string
308 specifying the criteria for embedability (Eg: 'whitelist')
309 availability: Under what condition the video is available. One of
310 'private', 'premium_only', 'subscriber_only', 'needs_auth',
311 'unlisted' or 'public'. Use 'InfoExtractor._availability'
313 __post_extractor: A function to be called just before the metadata is
314 written to either disk, logger or console. The function
315 must return a dict which will be added to the info_dict.
316 This is useful for additional information that is
317 time-consuming to extract. Note that the fields thus
318 extracted will not be available to output template and
319 match_filter. So, only "comments" and "comment_count" are
320 currently allowed to be extracted via this method.
322 The following fields should only be used when the video belongs to some logical
325 chapter: Name or title of the chapter the video belongs to.
326 chapter_number: Number of the chapter the video belongs to, as an integer.
327 chapter_id: Id of the chapter the video belongs to, as a unicode string.
329 The following fields should only be used when the video is an episode of some
330 series, programme or podcast:
332 series: Title of the series or programme the video episode belongs to.
333 season: Title of the season the video episode belongs to.
334 season_number: Number of the season the video episode belongs to, as an integer.
335 season_id: Id of the season the video episode belongs to, as a unicode string.
336 episode: Title of the video episode. Unlike mandatory video title field,
337 this field should denote the exact title of the video episode
338 without any kind of decoration.
339 episode_number: Number of the video episode within a season, as an integer.
340 episode_id: Id of the video episode, as a unicode string.
342 The following fields should only be used when the media is a track or a part of
345 track: Title of the track.
346 track_number: Number of the track within an album or a disc, as an integer.
347 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
349 artist: Artist(s) of the track.
350 genre: Genre(s) of the track.
351 album: Title of the album the track belongs to.
352 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
353 album_artist: List of all artists appeared on the album (e.g.
354 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
356 disc_number: Number of the disc or other physical medium the track belongs to,
358 release_year: Year (YYYY) when the album was released.
360 Unless mentioned otherwise, the fields should be Unicode strings.
362 Unless mentioned otherwise, None is equivalent to absence of information.
365 _type "playlist" indicates multiple videos.
366 There must be a key "entries", which is a list, an iterable, or a PagedList
367 object, each element of which is a valid dictionary by this specification.
369 Additionally, playlists can have "id", "title", and any other relevant
370 attributes with the same semantics as videos (see above).
373 _type "multi_video" indicates that there are multiple videos that
374 form a single show, for examples multiple acts of an opera or TV episode.
375 It must have an entries key like a playlist and contain all the keys
376 required for a video at the same time.
379 _type "url" indicates that the video must be extracted from another
380 location, possibly by a different extractor. Its only required key is:
381 "url" - the next URL to extract.
382 The key "ie_key" can be set to the class name (minus the trailing "IE",
383 e.g. "Youtube") if the extractor class is known in advance.
384 Additionally, the dictionary may have any properties of the resolved entity
385 known in advance, for example "title" if the title of the referred video is
389 _type "url_transparent" entities have the same specification as "url", but
390 indicate that the given additional information is more precise than the one
391 associated with the resolved URL.
392 This is useful when a site employs a video service that hosts the video and
393 its technical metadata, but that video service does not embed a useful
394 title, description etc.
397 Subclasses of this one should re-define the _real_initialize() and
398 _real_extract() methods and define a _VALID_URL regexp.
399 Probably, they should also be added to the list of extractors.
401 _GEO_BYPASS attribute may be set to False in order to disable
402 geo restriction bypass mechanisms for a particular extractor.
403 Though it won't disable explicit geo restriction bypass based on
404 country code provided with geo_bypass_country.
406 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
407 countries for this extractor. One of these countries will be used by
408 geo restriction bypass mechanism right away in order to bypass
409 geo restriction, of course, if the mechanism is not disabled.
411 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
412 IP blocks in CIDR notation for this extractor. One of these IP blocks
413 will be used by geo restriction bypass mechanism similarly
416 Finally, the _WORKING attribute should be set to False for broken IEs
417 in order to warn the users and skip the tests.
422 _x_forwarded_for_ip
= None
424 _GEO_COUNTRIES
= None
425 _GEO_IP_BLOCKS
= None
429 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
431 'Use --cookies for the authentication. '
432 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to pass cookies'),
433 'password': 'Use --username and --password or --netrc to provide account credentials',
def __init__(self, downloader=None):
    """Create the extractor; ``downloader`` is the owning YoutubeDL, if any."""
    # No faked X-Forwarded-For address until geo bypass is initialised.
    self._x_forwarded_for_ip = None
    self.set_downloader(downloader)
443 def suitable(cls
, url
):
444 """Receives a URL and returns True if suitable for this IE."""
446 # This does not use has/getattr intentionally - we want to know whether
447 # we have cached the regexp for *this* class, whereas getattr would also
448 # match the superclass
449 if '_VALID_URL_RE' not in cls
.__dict
__:
450 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
451 return cls
._VALID
_URL
_RE
.match(url
) is not None
def _match_id(cls, url):
    """Return the 'id' group captured by _VALID_URL when matched against *url*."""
    # Same per-class caching scheme as suitable(): compile the pattern
    # once and store it on this exact class.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    mobj = cls._VALID_URL_RE.match(url)
    return compat_str(mobj.group('id'))
463 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Seed the geo-bypass machinery with this extractor's declared
    # unrestricted countries / IP blocks before real initialization.
    self._initialize_geo_bypass({
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    })
    # NOTE(review): the source for this chunk is garbled; upstream guards
    # the call below with a one-shot readiness flag so repeated calls are
    # cheap. That guard is not visible here — confirm against the
    # original before relying on idempotence.
    self._real_initialize()
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    HTTP requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    NOTE(review): reconstructed from a garbled source; the control-flow
    glue lines (early returns and the ``if not ip_block`` / ``if not
    country`` fallbacks) were missing from this view — verify against
    the original file.
    """
    if not self._x_forwarded_for_ip:
        # Geo bypass mechanism is explicitly disabled by user
        if not self.get_param('geo_bypass', True):
            return

        if not geo_bypass_context:
            geo_bypass_context = {}

        # Backward compatibility: previously _initialize_geo_bypass
        # expected a list of countries, some 3rd party code may still use
        # it this way
        if isinstance(geo_bypass_context, (list, tuple)):
            geo_bypass_context = {
                'countries': geo_bypass_context,
            }

        # The whole point of geo bypass mechanism is to fake IP
        # as X-Forwarded-For HTTP header based on some IP block or
        # country code.

        # Path 1: bypassing based on IP block in CIDR notation

        # Explicit IP block specified by user, use it right away
        # regardless of whether extractor is geo bypassable or not
        ip_block = self.get_param('geo_bypass_ip_block', None)

        # Otherwise use random IP block from geo bypass context but only
        # if extractor is known as geo bypassable
        if not ip_block:
            ip_blocks = geo_bypass_context.get('ip_blocks')
            if self._GEO_BYPASS and ip_blocks:
                ip_block = random.choice(ip_blocks)

        if ip_block:
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
            self._downloader.write_debug(
                '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
            return

        # Path 2: bypassing based on country code

        # Explicit country code specified by user, use it right away
        # regardless of whether extractor is geo bypassable or not
        country = self.get_param('geo_bypass_country', None)

        # Otherwise use random country code from geo bypass context but
        # only if extractor is known as geo bypassable
        if not country:
            countries = geo_bypass_context.get('countries')
            if self._GEO_BYPASS and countries:
                country = random.choice(countries)

        if country:
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
            self._downloader.write_debug(
                'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
def extract(self, url):
    """Extracts URL information and returns it in list of dicts.

    NOTE(review): reconstructed from a garbled source; the try/retry
    scaffolding and returns were missing from this view — verify the
    retry count and the ``self.initialize()`` call against the original.
    """
    try:
        # One retry so a geo-restriction error can be retried once with a
        # faked X-Forwarded-For IP (see __maybe_fake_ip_and_retry).
        for _ in range(2):
            try:
                self.initialize()
                self.write_debug('Extracting URL: %s' % url)
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles')
                if (subtitles and 'live_chat' in subtitles
                        and 'no-live-chat' in self.get_param('compat_opts', [])):
                    del subtitles['live_chat']
                return ie_result
            except GeoRestrictedError as e:
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
    """Fake an X-Forwarded-For IP for one of *countries* and report whether
    extraction should be retried.

    NOTE(review): reconstructed from a garbled source; some condition
    lines and the return statements were missing from this view —
    verify against the original.
    """
    if (not self.get_param('geo_bypass_country', None)
            and self._GEO_BYPASS
            and self.get_param('geo_bypass', True)
            and not self._x_forwarded_for_ip
            and countries):
        country_code = random.choice(countries)
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
        if self._x_forwarded_for_ip:
            self.report_warning(
                'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                % (self._x_forwarded_for_ip, country_code.upper()))
            return True
    return False
603 def set_downloader(self
, downloader
):
604 """Sets the downloader for this IE."""
605 self
._downloader
= downloader
607 def _real_initialize(self
):
608 """Real initialization process. Redefine in subclasses."""
611 def _real_extract(self
, url
):
612 """Real extraction process. Redefine in subclasses."""
617 """A string for getting the InfoExtractor with get_info_extractor"""
618 return compat_str(cls
.__name
__[:-2])
622 return compat_str(type(self
).__name
__[:-2])
def __can_accept_status_code(err, expected_status):
    """Return True if HTTP error *err* carries a status code the caller
    declared acceptable via *expected_status* (None, an int, a list/tuple
    of ints, or a predicate over the status code).

    NOTE(review): the ``expected_status is None`` branch's return was
    missing from this garbled view; ``return False`` matches the
    documented contract (no failed status accepted by default).
    """
    assert isinstance(err, compat_urllib_error.HTTPError)
    if expected_status is None:
        return False
    elif isinstance(expected_status, compat_integer_types):
        return err.code == expected_status
    elif isinstance(expected_status, (list, tuple)):
        return err.code in expected_status
    elif callable(expected_status):
        return expected_status(err.code) is True
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
    """
    Return the response handle.

    See _download_webpage docstring for arguments specification.

    NOTE(review): reconstructed from a garbled source; the note-dispatch
    conditionals, try/except scaffolding and the fatal/non-fatal branch
    were missing from this view — verify against the original.
    """
    # Optional politeness delay between requests (skipped for the very
    # first request of the session).
    if not self._downloader._first_webpage_request:
        sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
        if sleep_interval > 0:
            self.to_screen('Sleeping %s seconds ...' % sleep_interval)
            time.sleep(sleep_interval)
    else:
        self._downloader._first_webpage_request = False

    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        if 'X-Forwarded-For' not in headers:
            headers['X-Forwarded-For'] = self._x_forwarded_for_ip

    if isinstance(url_or_request, compat_urllib_request.Request):
        url_or_request = update_Request(
            url_or_request, data=data, headers=headers, query=query)
    else:
        if query:
            url_or_request = update_url_query(url_or_request, query)
        if data is not None or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers)
    try:
        return self._downloader.urlopen(url_or_request)
    except network_exceptions as err:
        if isinstance(err, compat_urllib_error.HTTPError):
            if self.__can_accept_status_code(err, expected_status):
                # Retain reference to error to prevent file object from
                # being closed before it can be read. Works around the
                # effects of <https://bugs.python.org/issue15002>
                # introduced in Python 3.4.1.
                err.fp._error = err
                return err.fp

        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
    """
    Return a tuple (page content as string, URL handle).

    See _download_webpage docstring for arguments specification.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
    # NOTE(review): this non-fatal short-circuit was missing from the
    # garbled view; _request_webpage returns False only when fatal is
    # falsy — confirm against the original.
    if urlh is False:
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
719 def _guess_encoding_from_content(content_type
, webpage_bytes
):
720 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
722 encoding
= m
.group(1)
724 m
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
725 webpage_bytes[:1024])
727 encoding = m.group(1).decode('ascii')
728 elif webpage_bytes.startswith(b'\xff\xfe'):
def __check_blocked(self, content):
    """Raise ExtractorError if *content* is a known censorship/filtering
    block page (Websense, Indian censorship, Russian blocklist).

    NOTE(review): a few conditional lines were missing from this garbled
    view (``if blocked_iframe:``, ``msg = (``, ``if block_msg:``) —
    verify against the original.
    """
    first_block = content[:512]
    if ('<title>Access to this site is blocked</title>' in content
            and 'Websense' in first_block):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    if '<title>The URL you requested has been blocked</title>' in first_block:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)
    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
            and 'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read *urlh*, optionally dump/save the raw bytes, decode to text,
    run the censorship-block check and return the decoded content.

    NOTE(review): reconstructed from a garbled source; the
    ``if not encoding:`` guard, the ``try/except LookupError`` around
    decoding, the basename-length check and the final return were
    missing from this view — verify against the original.
    """
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if not encoding:
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self.get_param('write_pages', False):
        basen = '%s_%s' % (video_id, urlh.geturl())
        if len(basen) > 240:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown codec name: fall back to permissive UTF-8 decoding.
        content = webpage_bytes.decode('utf-8', 'replace')

    self.__check_blocked(content)

    return content
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=5, encoding=None, data=None,
        headers={}, query={}, expected_status=None):
    """
    Return the data of the page as a string.

    Arguments:
    url_or_request -- plain text URL as a string or
        a compat_urllib_request.Request object
    video_id -- Video/playlist/item identifier (string)

    Keyword arguments:
    note -- note printed before downloading (string)
    errnote -- note printed in case of an error (string)
    fatal -- flag denoting whether error should be considered fatal,
        i.e. whether it should cause ExtractionError to be raised,
        otherwise a warning will be reported and extraction continued
    tries -- number of tries
    timeout -- sleep interval between tries
    encoding -- encoding for a page content decoding, guessed automatically
        when not explicitly specified
    data -- POST data (bytes)
    headers -- HTTP headers (dict)
    query -- URL query (dict)
    expected_status -- allows to accept failed HTTP requests (non 2xx
        status code) by explicitly specifying a set of accepted status
        codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
        Note that this argument does not affect success status codes (2xx)
        which are always accepted.

    NOTE(review): reconstructed from a garbled source; the retry-loop
    scaffolding and the final return were missing from this view —
    verify against the original.
    """
    success = False
    try_count = 0
    while success is False:
        try:
            res = self._download_webpage_handle(
                url_or_request, video_id, note, errnote, fatal,
                encoding=encoding, data=data, headers=headers, query=query,
                expected_status=expected_status)
            success = True
        except compat_http_client.IncompleteRead as e:
            try_count += 1
            if try_count >= tries:
                raise e
            self._sleep(timeout, video_id)
    if res is False:
        return res
    else:
        content, _ = res
        return content
def _download_xml_handle(
        self, url_or_request, video_id, note='Downloading XML',
        errnote='Unable to download XML', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return a tuple (xml as an compat_etree_Element, URL handle).

    See _download_webpage docstring for arguments specification.

    NOTE(review): the ``if res is False`` guard and the tail of the
    _parse_xml call were missing from this garbled view — verify
    against the original.
    """
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return res
    xml_string, urlh = res
    return self._parse_xml(
        xml_string, video_id, transform_source=transform_source,
        fatal=fatal), urlh
880 self, url_or_request, video_id,
881 note='Downloading XML
', errnote='Unable to download XML
',
882 transform_source=None, fatal=True, encoding=None,
883 data=None, headers={}, query={}, expected_status=None):
885 Return the xml as an compat_etree_Element.
887 See _download_webpage docstring for arguments specification.
889 res = self._download_xml_handle(
890 url_or_request, video_id, note=note, errnote=errnote,
891 transform_source=transform_source, fatal=fatal, encoding=encoding,
892 data=data, headers=headers, query=query,
893 expected_status=expected_status)
894 return res if res is False else res[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
    """Parse *xml_string* into an etree element.

    On parse failure: raise ExtractorError when *fatal*, otherwise warn
    and return None.

    NOTE(review): the ``if transform_source:`` / ``try:`` / ``if fatal:``
    glue lines were missing from this garbled view — verify against the
    original.
    """
    if transform_source:
        xml_string = transform_source(xml_string)
    try:
        return compat_etree_fromstring(xml_string.encode('utf-8'))
    except compat_xml_parse_error as ve:
        errmsg = '%s: Failed to parse XML' % video_id
        if fatal:
            raise ExtractorError(errmsg, cause=ve)
        else:
            self.report_warning(errmsg + str(ve))
def _download_json_handle(
        self, url_or_request, video_id, note='Downloading JSON metadata',
        errnote='Unable to download JSON metadata', transform_source=None,
        fatal=True, encoding=None, data=None, headers={}, query={},
        expected_status=None):
    """
    Return a tuple (JSON object, URL handle).

    See _download_webpage docstring for arguments specification.

    NOTE(review): the ``if res is False`` guard and the tail of the
    _parse_json call were missing from this garbled view — verify
    against the original.
    """
    res = self._download_webpage_handle(
        url_or_request, video_id, note, errnote, fatal=fatal,
        encoding=encoding, data=data, headers=headers, query=query,
        expected_status=expected_status)
    if res is False:
        return res
    json_string, urlh = res
    return self._parse_json(
        json_string, video_id, transform_source=transform_source,
        fatal=fatal), urlh
930 self, url_or_request, video_id, note='Downloading JSON metadata
',
931 errnote='Unable to download JSON metadata
', transform_source=None,
932 fatal=True, encoding=None, data=None, headers={}, query={},
933 expected_status=None):
935 Return the JSON object as a dict.
937 See _download_webpage docstring for arguments specification.
939 res = self._download_json_handle(
940 url_or_request, video_id, note=note, errnote=errnote,
941 transform_source=transform_source, fatal=fatal, encoding=encoding,
942 data=data, headers=headers, query=query,
943 expected_status=expected_status)
944 return res if res is False else res[0]
946 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
948 json_string = transform_source(json_string)
950 return json.loads(json_string)
951 except ValueError as ve:
952 errmsg = '%s: Failed to parse JSON
' % video_id
954 raise ExtractorError(errmsg, cause=ve)
956 self.report_warning(errmsg + str(ve))
958 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
959 return self._parse_json(
960 data[data.find('{'):data.rfind('}
') + 1],
961 video_id, transform_source, fatal)
963 def _download_socket_json_handle(
964 self, url_or_request, video_id, note='Polling socket
',
965 errnote='Unable to poll socket
', transform_source=None,
966 fatal=True, encoding=None, data=None, headers={}, query={},
967 expected_status=None):
969 Return a tuple (JSON object, URL handle).
971 See _download_webpage docstring for arguments specification.
973 res = self._download_webpage_handle(
974 url_or_request, video_id, note, errnote, fatal=fatal,
975 encoding=encoding, data=data, headers=headers, query=query,
976 expected_status=expected_status)
980 return self._parse_socket_response_as_json(
981 webpage, video_id, transform_source=transform_source,
984 def _download_socket_json(
985 self, url_or_request, video_id, note='Polling socket
',
986 errnote='Unable to poll socket
', transform_source=None,
987 fatal=True, encoding=None, data=None, headers={}, query={},
988 expected_status=None):
990 Return the JSON object as a dict.
992 See _download_webpage docstring for arguments specification.
994 res = self._download_socket_json_handle(
995 url_or_request, video_id, note=note, errnote=errnote,
996 transform_source=transform_source, fatal=fatal, encoding=encoding,
997 data=data, headers=headers, query=query,
998 expected_status=expected_status)
999 return res if res is False else res[0]
1001 def report_warning(self, msg, video_id=None, *args, **kwargs):
1002 idstr = '' if video_id is None else '%s: ' % video_id
1003 self._downloader.report_warning(
1004 '[%s] %s%s' % (self.IE_NAME, idstr, msg), *args, **kwargs)
1006 def to_screen(self, msg, *args, **kwargs):
1007 """Print msg to screen, prefixing it with '[ie_name
]'"""
1008 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1010 def write_debug(self, msg, *args, **kwargs):
1011 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1013 def get_param(self, name, default=None, *args, **kwargs):
1014 if self._downloader:
1015 return self._downloader.params.get(name, default, *args, **kwargs)
1018 def report_extraction(self, id_or_name):
1019 """Report information extraction."""
1020 self.to_screen('%s: Extracting information
' % id_or_name)
1022 def report_download_webpage(self, video_id):
1023 """Report webpage download."""
1024 self.to_screen('%s: Downloading webpage
' % video_id)
1026 def report_age_confirmation(self):
1027 """Report attempt to confirm age."""
1028 self.to_screen('Confirming age
')
1030 def report_login(self):
1031 """Report attempt to log in."""
1032 self.to_screen('Logging
in')
1034 def raise_login_required(
1035 self, msg='This video
is only available
for registered users
',
1036 metadata_available=False, method='any
'):
1037 if metadata_available and self.get_param('ignore_no_formats_error
'):
1038 self.report_warning(msg)
1039 raise ExtractorError('%s. %s' % (msg, self._LOGIN_HINTS[method]), expected=True)
1041 def raise_geo_restricted(
1042 self, msg='This video
is not available
from your location due to geo restriction
',
1043 countries=None, metadata_available=False):
1044 if metadata_available and self.get_param('ignore_no_formats_error
'):
1045 self.report_warning(msg)
1047 raise GeoRestrictedError(msg, countries=countries)
1049 def raise_no_formats(self, msg, expected=False, video_id=None):
1050 if expected and self.get_param('ignore_no_formats_error
'):
1051 self.report_warning(msg, video_id)
1053 raise ExtractorError(msg, expected=expected, video_id=video_id)
1055 # Methods for following #608
1057 def url_result(url, ie=None, video_id=None, video_title=None):
1058 """Returns a URL that points to a page that should be processed"""
1059 # TODO: ie should be the class used for getting the info
1060 video_info = {'_type
': 'url
',
1063 if video_id is not None:
1064 video_info['id'] = video_id
1065 if video_title is not None:
1066 video_info['title
'] = video_title
1069 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1071 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1073 return self.playlist_result(
1074 urls, playlist_id=playlist_id, playlist_title=playlist_title)
1077 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1078 """Returns a playlist"""
1079 video_info = {'_type
': 'playlist
',
1081 video_info.update(kwargs)
1083 video_info['id'] = playlist_id
1085 video_info['title
'] = playlist_title
1086 if playlist_description is not None:
1087 video_info['description
'] = playlist_description
1090 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1092 Perform a regex search on the given string, using a single or a list of
1093 patterns returning the first matching group.
1094 In case of failure return a default value or raise a WARNING or a
1095 RegexNotFoundError, depending on fatal, specifying the field name.
1097 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1098 mobj = re.search(pattern, string, flags)
1101 mobj = re.search(p, string, flags)
1105 if not self.get_param('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty():
1106 _name = '\033[0;34m
%s\033[0m
' % name
1112 # return the first matching group
1113 return next(g for g in mobj.groups() if g is not None)
1115 return mobj.group(group)
1116 elif default is not NO_DEFAULT:
1119 raise RegexNotFoundError('Unable to extract
%s' % _name)
1121 self.report_warning('unable to extract
%s' % _name + bug_reports_message())
1124 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1126 Like _search_regex, but strips HTML tags and unescapes entities.
1128 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1130 return clean_html(res).strip()
1134 def _get_netrc_login_info(self, netrc_machine=None):
1137 netrc_machine = netrc_machine or self._NETRC_MACHINE
1139 if self.get_param('usenetrc
', False):
1141 info = netrc.netrc().authenticators(netrc_machine)
1142 if info is not None:
1146 raise netrc.NetrcParseError(
1147 'No authenticators
for %s' % netrc_machine)
1148 except (IOError, netrc.NetrcParseError) as err:
1149 self.report_warning(
1150 'parsing
.netrc
: %s' % error_to_compat_str(err))
1152 return username, password
1154 def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None):
1156 Get the login info as (username, password)
1157 First look for the manually specified credentials using username_option
1158 and password_option as keys in params dictionary. If no such credentials
1159 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1161 If there's no info available
, return (None, None)
1164 # Attempt to use provided username and password or .netrc data
1165 username = self.get_param(username_option)
1166 if username is not None:
1167 password = self.get_param(password_option)
1169 username, password = self._get_netrc_login_info(netrc_machine)
1171 return username, password
1173 def _get_tfa_info(self, note='two-factor verification code'):
1175 Get the two
-factor authentication info
1176 TODO
- asking the user will be required
for sms
/phone verify
1177 currently just uses the command line option
1178 If there
's no info available, return None
1181 tfa = self.get_param('twofactor
')
1185 return compat_getpass('Type
%s and press
[Return
]: ' % note)
1187 # Helper functions for extracting OpenGraph info
1189 def _og_regexes(prop):
1190 content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))'
1191 property_re = (r'(?
:name|
property)=(?
:\'og
[:-]%(prop)s\'|
"og[:-]%(prop)s"|\s
*og
[:-]%(prop)s\b)'
1192 % {'prop': re.escape(prop)})
1193 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
1195 template % (property_re, content_re),
1196 template % (content_re, property_re),
1200 def _meta_regex(prop):
1201 return r'''(?isx)<meta
1202 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1203 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1205 def _og_search_property(self, prop, html, name=None, **kargs):
1206 if not isinstance(prop, (list, tuple)):
1209 name = 'OpenGraph
%s' % prop[0]
1212 og_regexes.extend(self._og_regexes(p))
1213 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1216 return unescapeHTML(escaped)
1218 def _og_search_thumbnail(self, html, **kargs):
1219 return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs)
1221 def _og_search_description(self, html, **kargs):
1222 return self._og_search_property('description
', html, fatal=False, **kargs)
1224 def _og_search_title(self, html, **kargs):
1225 return self._og_search_property('title
', html, **kargs)
1227 def _og_search_video_url(self, html, name='video url
', secure=True, **kargs):
1228 regexes = self._og_regexes('video
') + self._og_regexes('video
:url
')
1230 regexes = self._og_regexes('video
:secure_url
') + regexes
1231 return self._html_search_regex(regexes, html, name, **kargs)
1233 def _og_search_url(self, html, **kargs):
1234 return self._og_search_property('url
', html, **kargs)
1236 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1237 if not isinstance(name, (list, tuple)):
1239 if display_name is None:
1240 display_name = name[0]
1241 return self._html_search_regex(
1242 [self._meta_regex(n) for n in name],
1243 html, display_name, fatal=fatal, group='content
', **kwargs)
1245 def _dc_search_uploader(self, html):
1246 return self._html_search_meta('dc
.creator
', html, 'uploader
')
1248 def _rta_search(self, html):
1249 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1250 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
1251 r' content
="RTA-5042-1996-1400-1577-RTA"',
1256 def _media_rating_search(self, html):
1257 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1258 rating = self._html_search_meta('rating
', html)
1270 return RATING_TABLE.get(rating.lower())
1272 def _family_friendly_search(self, html):
1273 # See http://schema.org/VideoObject
1274 family_friendly = self._html_search_meta(
1275 'isFamilyFriendly
', html, default=None)
1277 if not family_friendly:
1286 return RATING_TABLE.get(family_friendly.lower())
1288 def _twitter_search_player(self, html):
1289 return self._html_search_meta('twitter
:player
', html,
1290 'twitter card player
')
1292 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1293 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1294 default = kwargs.get('default
', NO_DEFAULT)
1295 # JSON-LD may be malformed and thus `fatal` should be respected.
1296 # At the same time `default` may be passed that assumes `fatal=False`
1297 # for _search_regex. Let's simulate the same behavior here
as well
.
1298 fatal
= kwargs
.get('fatal', True) if default
== NO_DEFAULT
else False
1300 for mobj
in json_ld_list
:
1301 json_ld_item
= self
._parse
_json
(
1302 mobj
.group('json_ld'), video_id
, fatal
=fatal
)
1303 if not json_ld_item
:
1305 if isinstance(json_ld_item
, dict):
1306 json_ld
.append(json_ld_item
)
1307 elif isinstance(json_ld_item
, (list, tuple)):
1308 json_ld
.extend(json_ld_item
)
1310 json_ld
= self
._json
_ld
(json_ld
, video_id
, fatal
=fatal
, expected_type
=expected_type
)
1313 if default
is not NO_DEFAULT
:
1316 raise RegexNotFoundError('Unable to extract JSON-LD')
1318 self
.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1321 def _json_ld(self
, json_ld
, video_id
, fatal
=True, expected_type
=None):
1322 if isinstance(json_ld
, compat_str
):
1323 json_ld
= self
._parse
_json
(json_ld
, video_id
, fatal
=fatal
)
1327 if not isinstance(json_ld
, (list, tuple, dict)):
1329 if isinstance(json_ld
, dict):
1332 INTERACTION_TYPE_MAP
= {
1333 'CommentAction': 'comment',
1334 'AgreeAction': 'like',
1335 'DisagreeAction': 'dislike',
1336 'LikeAction': 'like',
1337 'DislikeAction': 'dislike',
1338 'ListenAction': 'view',
1339 'WatchAction': 'view',
1340 'ViewAction': 'view',
1343 def extract_interaction_type(e
):
1344 interaction_type
= e
.get('interactionType')
1345 if isinstance(interaction_type
, dict):
1346 interaction_type
= interaction_type
.get('@type')
1347 return str_or_none(interaction_type
)
1349 def extract_interaction_statistic(e
):
1350 interaction_statistic
= e
.get('interactionStatistic')
1351 if isinstance(interaction_statistic
, dict):
1352 interaction_statistic
= [interaction_statistic
]
1353 if not isinstance(interaction_statistic
, list):
1355 for is_e
in interaction_statistic
:
1356 if not isinstance(is_e
, dict):
1358 if is_e
.get('@type') != 'InteractionCounter':
1360 interaction_type
= extract_interaction_type(is_e
)
1361 if not interaction_type
:
1363 # For interaction count some sites provide string instead of
1364 # an integer (as per spec) with non digit characters (e.g. ",")
1365 # so extracting count with more relaxed str_to_int
1366 interaction_count
= str_to_int(is_e
.get('userInteractionCount'))
1367 if interaction_count
is None:
1369 count_kind
= INTERACTION_TYPE_MAP
.get(interaction_type
.split('/')[-1])
1372 count_key
= '%s_count' % count_kind
1373 if info
.get(count_key
) is not None:
1375 info
[count_key
] = interaction_count
1377 def extract_video_object(e
):
1378 assert e
['@type'] == 'VideoObject'
1379 author
= e
.get('author')
1381 'url': url_or_none(e
.get('contentUrl')),
1382 'title': unescapeHTML(e
.get('name')),
1383 'description': unescapeHTML(e
.get('description')),
1384 'thumbnail': url_or_none(e
.get('thumbnailUrl') or e
.get('thumbnailURL')),
1385 'duration': parse_duration(e
.get('duration')),
1386 'timestamp': unified_timestamp(e
.get('uploadDate')),
1387 # author can be an instance of 'Organization' or 'Person' types.
1388 # both types can have 'name' property(inherited from 'Thing' type). [1]
1389 # however some websites are using 'Text' type instead.
1390 # 1. https://schema.org/VideoObject
1391 'uploader': author
.get('name') if isinstance(author
, dict) else author
if isinstance(author
, compat_str
) else None,
1392 'filesize': float_or_none(e
.get('contentSize')),
1393 'tbr': int_or_none(e
.get('bitrate')),
1394 'width': int_or_none(e
.get('width')),
1395 'height': int_or_none(e
.get('height')),
1396 'view_count': int_or_none(e
.get('interactionCount')),
1398 extract_interaction_statistic(e
)
1402 item_type
= e
.get('@type')
1403 if expected_type
is not None and expected_type
!= item_type
:
1405 if item_type
in ('TVEpisode', 'Episode'):
1406 episode_name
= unescapeHTML(e
.get('name'))
1408 'episode': episode_name
,
1409 'episode_number': int_or_none(e
.get('episodeNumber')),
1410 'description': unescapeHTML(e
.get('description')),
1412 if not info
.get('title') and episode_name
:
1413 info
['title'] = episode_name
1414 part_of_season
= e
.get('partOfSeason')
1415 if isinstance(part_of_season
, dict) and part_of_season
.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1417 'season': unescapeHTML(part_of_season
.get('name')),
1418 'season_number': int_or_none(part_of_season
.get('seasonNumber')),
1420 part_of_series
= e
.get('partOfSeries') or e
.get('partOfTVSeries')
1421 if isinstance(part_of_series
, dict) and part_of_series
.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1422 info
['series'] = unescapeHTML(part_of_series
.get('name'))
1423 elif item_type
== 'Movie':
1425 'title': unescapeHTML(e
.get('name')),
1426 'description': unescapeHTML(e
.get('description')),
1427 'duration': parse_duration(e
.get('duration')),
1428 'timestamp': unified_timestamp(e
.get('dateCreated')),
1430 elif item_type
in ('Article', 'NewsArticle'):
1432 'timestamp': parse_iso8601(e
.get('datePublished')),
1433 'title': unescapeHTML(e
.get('headline')),
1434 'description': unescapeHTML(e
.get('articleBody')),
1436 elif item_type
== 'VideoObject':
1437 extract_video_object(e
)
1438 if expected_type
is None:
1442 video
= e
.get('video')
1443 if isinstance(video
, dict) and video
.get('@type') == 'VideoObject':
1444 extract_video_object(video
)
1445 if expected_type
is None:
1449 return dict((k
, v
) for k
, v
in info
.items() if v
is not None)
1452 def _hidden_inputs(html
):
1453 html
= re
.sub(r
'<!--(?:(?!<!--).)*-->', '', html
)
1455 for input in re
.findall(r
'(?i)(<input[^>]+>)', html
):
1456 attrs
= extract_attributes(input)
1459 if attrs
.get('type') not in ('hidden', 'submit'):
1461 name
= attrs
.get('name') or attrs
.get('id')
1462 value
= attrs
.get('value')
1463 if name
and value
is not None:
1464 hidden_inputs
[name
] = value
1465 return hidden_inputs
1467 def _form_hidden_inputs(self
, form_id
, html
):
1468 form
= self
._search
_regex
(
1469 r
'(?is)<form[^>]+?id=(["\'])%s\
1[^
>]*>(?P
<form
>.+?
)</form
>' % form_id,
1470 html, '%s form
' % form_id, group='form
')
1471 return self._hidden_inputs(form)
1474 regex = r' *((?P
<reverse
>\
+)?
(?P
<field
>[a
-zA
-Z0
-9_]+)((?P
<separator
>[~
:])(?P
<limit
>.*?
))?
)?
*$
'
1476 default = ('hidden
', 'hasvid
', 'ie_pref
', 'lang
', 'quality
',
1477 'res
', 'fps
', 'codec
:vp9
.2
', 'size
', 'br
', 'asr
',
1478 'proto
', 'ext
', 'hasaud
', 'source
', 'format_id
') # These must not be aliases
1479 ytdl_default = ('hasaud
', 'quality
', 'tbr
', 'filesize
', 'vbr
',
1480 'height
', 'width
', 'proto
', 'vext
', 'abr
', 'aext
',
1481 'fps
', 'fs_approx
', 'source
', 'format_id
')
1484 'vcodec
': {'type': 'ordered
', 'regex
': True,
1485 'order
': ['av0?
1', 'vp0?
9.2', 'vp0?
9', '[hx
]265|he?vc?
', '[hx
]264|avc
', 'vp0?
8', 'mp4v|h263
', 'theora
', '', None, 'none
']},
1486 'acodec
': {'type': 'ordered
', 'regex
': True,
1487 'order
': ['opus
', 'vorbis
', 'aac
', 'mp?
4a?
', 'mp3
', 'e?a?c
-?
3', 'dts
', '', None, 'none
']},
1488 'proto
': {'type': 'ordered
', 'regex
': True, 'field
': 'protocol
',
1489 'order
': ['(ht|f
)tps
', '(ht|f
)tp$
', 'm3u8
.+', 'm3u8
', '.*dash
', '', 'mms|rtsp
', 'none
', 'f4
']},
1490 'vext
': {'type': 'ordered
', 'field
': 'video_ext
',
1491 'order
': ('mp4
', 'webm
', 'flv
', '', 'none
'),
1492 'order_free
': ('webm
', 'mp4
', 'flv
', '', 'none
')},
1493 'aext
': {'type': 'ordered
', 'field
': 'audio_ext
',
1494 'order
': ('m4a
', 'aac
', 'mp3
', 'ogg
', 'opus
', 'webm
', '', 'none
'),
1495 'order_free
': ('opus
', 'ogg
', 'webm
', 'm4a
', 'mp3
', 'aac
', '', 'none
')},
1496 'hidden
': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1497 'ie_pref
': {'priority': True, 'type': 'extractor'},
1498 'hasvid
': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1499 'hasaud
': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1500 'lang
': {'priority': True, 'convert': 'ignore', 'field': 'language_preference'},
1501 'quality
': {'convert': 'float_none', 'default': -1},
1502 'filesize
': {'convert': 'bytes'},
1503 'fs_approx
': {'convert': 'bytes', 'field': 'filesize_approx'},
1504 'id': {'convert': 'string', 'field': 'format_id'},
1505 'height
': {'convert': 'float_none'},
1506 'width
': {'convert': 'float_none'},
1507 'fps
': {'convert': 'float_none'},
1508 'tbr
': {'convert': 'float_none'},
1509 'vbr
': {'convert': 'float_none'},
1510 'abr
': {'convert': 'float_none'},
1511 'asr
': {'convert': 'float_none'},
1512 'source
': {'convert': 'ignore', 'field': 'source_preference'},
1514 'codec
': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1515 'br
': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1516 'size
': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1517 'ext
': {'type': 'combined', 'field': ('vext', 'aext')},
1518 'res
': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1520 # Most of these exist only for compatibility reasons
1521 'dimension
': {'type': 'alias', 'field': 'res'},
1522 'resolution
': {'type': 'alias', 'field': 'res'},
1523 'extension
': {'type': 'alias', 'field': 'ext'},
1524 'bitrate
': {'type': 'alias', 'field': 'br'},
1525 'total_bitrate
': {'type': 'alias', 'field': 'tbr'},
1526 'video_bitrate
': {'type': 'alias', 'field': 'vbr'},
1527 'audio_bitrate
': {'type': 'alias', 'field': 'abr'},
1528 'framerate
': {'type': 'alias', 'field': 'fps'},
1529 'language_preference
': {'type': 'alias', 'field': 'lang'}, # not named as 'language
' because such a field exists
1530 'protocol
': {'type': 'alias', 'field': 'proto'},
1531 'source_preference
': {'type': 'alias', 'field': 'source'},
1532 'filesize_approx
': {'type': 'alias', 'field': 'fs_approx'},
1533 'filesize_estimate
': {'type': 'alias', 'field': 'size'},
1534 'samplerate
': {'type': 'alias', 'field': 'asr'},
1535 'video_ext
': {'type': 'alias', 'field': 'vext'},
1536 'audio_ext
': {'type': 'alias', 'field': 'aext'},
1537 'video_codec
': {'type': 'alias', 'field': 'vcodec'},
1538 'audio_codec
': {'type': 'alias', 'field': 'acodec'},
1539 'video
': {'type': 'alias', 'field': 'hasvid'},
1540 'has_video
': {'type': 'alias', 'field': 'hasvid'},
1541 'audio
': {'type': 'alias', 'field': 'hasaud'},
1542 'has_audio
': {'type': 'alias', 'field': 'hasaud'},
1543 'extractor
': {'type': 'alias', 'field': 'ie_pref'},
1544 'preference
': {'type': 'alias', 'field': 'ie_pref'},
1545 'extractor_preference
': {'type': 'alias', 'field': 'ie_pref'},
1546 'format_id
': {'type': 'alias', 'field': 'id'},
1551 def _get_field_setting(self, field, key):
1552 if field not in self.settings:
1553 self.settings[field] = {}
1554 propObj = self.settings[field]
1555 if key not in propObj:
1556 type = propObj.get('type')
1558 default = 'preference
' if type == 'extractor
' else (field,) if type in ('combined
', 'multiple
') else field
1559 elif key == 'convert
':
1560 default = 'order
' if type == 'ordered
' else 'float_string
' if field else 'ignore
'
1562 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1563 propObj[key] = default
1566 def _resolve_field_value(self, field, value, convertNone=False):
1571 value = value.lower()
1572 conversion = self._get_field_setting(field, 'convert
')
1573 if conversion == 'ignore
':
1575 if conversion == 'string
':
1577 elif conversion == 'float_none
':
1578 return float_or_none(value)
1579 elif conversion == 'bytes':
1580 return FileDownloader.parse_bytes(value)
1581 elif conversion == 'order
':
1582 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free
')) or self._get_field_setting(field, 'order
')
1583 use_regex = self._get_field_setting(field, 'regex
')
1584 list_length = len(order_list)
1585 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1586 if use_regex and value is not None:
1587 for i, regex in enumerate(order_list):
1588 if regex and re.match(regex, value):
1589 return list_length - i
1590 return list_length - empty_pos # not in list
1591 else: # not regex or value = None
1592 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1594 if value.isnumeric():
1597 self.settings[field]['convert
'] = 'string
'
1600 def evaluate_params(self, params, sort_extractor):
1601 self._use_free_order = params.get('prefer_free_formats
', False)
1602 self._sort_user = params.get('format_sort
', [])
1603 self._sort_extractor = sort_extractor
1605 def add_item(field, reverse, closest, limit_text):
1606 field = field.lower()
1607 if field in self._order:
1609 self._order.append(field)
1610 limit = self._resolve_field_value(field, limit_text)
1613 'closest
': False if limit is None else closest,
1614 'limit_text
': limit_text,
1616 if field in self.settings:
1617 self.settings[field].update(data)
1619 self.settings[field] = data
1622 tuple(field for field in self.default if self._get_field_setting(field, 'forced
'))
1623 + (tuple() if params.get('format_sort_force
', False)
1624 else tuple(field for field in self.default if self._get_field_setting(field, 'priority
')))
1625 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1627 for item in sort_list:
1628 match = re.match(self.regex, item)
1630 raise ExtractorError('Invalid format sort string
"%s" given by extractor
' % item)
1631 field = match.group('field
')
1634 if self._get_field_setting(field, 'type') == 'alias
':
1635 field = self._get_field_setting(field, 'field
')
1636 reverse = match.group('reverse
') is not None
1637 closest = match.group('separator
') == '~
'
1638 limit_text = match.group('limit
')
1640 has_limit = limit_text is not None
1641 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined
'
1642 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit
')
1644 fields = self._get_field_setting(field, 'field
') if has_multiple_fields else (field,)
1645 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1646 limit_count = len(limits)
1647 for (i, f) in enumerate(fields):
1648 add_item(f, reverse, closest,
1649 limits[i] if i < limit_count
1650 else limits[0] if has_limit and not has_multiple_limits
1653 def print_verbose_info(self, write_debug):
1655 write_debug('Sort order given by user
: %s' % ', '.join(self._sort_user))
1656 if self._sort_extractor:
1657 write_debug('Sort order given by extractor
: %s' % ', '.join(self._sort_extractor))
1658 write_debug('Formats
sorted by
: %s' % ', '.join(['%s%s%s' % (
1659 '+' if self._get_field_setting(field, 'reverse
') else '', field,
1660 '%s%s(%s)' % ('~
' if self._get_field_setting(field, 'closest
') else ':',
1661 self._get_field_setting(field, 'limit_text
'),
1662 self._get_field_setting(field, 'limit
'))
1663 if self._get_field_setting(field, 'limit_text
') is not None else '')
1664 for field in self._order if self._get_field_setting(field, 'visible
')]))
1666 def _calculate_field_preference_from_value(self, format, field, type, value):
1667 reverse = self._get_field_setting(field, 'reverse
')
1668 closest = self._get_field_setting(field, 'closest
')
1669 limit = self._get_field_setting(field, 'limit
')
1671 if type == 'extractor
':
1672 maximum = self._get_field_setting(field, 'max')
1673 if value is None or (maximum is not None and value >= maximum):
1675 elif type == 'boolean
':
1676 in_list = self._get_field_setting(field, 'in_list
')
1677 not_in_list = self._get_field_setting(field, 'not_in_list
')
1678 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1679 elif type == 'ordered
':
1680 value = self._resolve_field_value(field, value, True)
1682 # try to convert to number
1683 val_num = float_or_none(value, default=self._get_field_setting(field, 'default
'))
1684 is_num = self._get_field_setting(field, 'convert
') != 'string
' and val_num is not None
1688 return ((-10, 0) if value is None
1689 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1690 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1691 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1692 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1693 else (-1, value, 0))
1695 def _calculate_field_preference(self, format, field):
1696 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1697 get_value = lambda f: format.get(self._get_field_setting(f, 'field
'))
1698 if type == 'multiple
':
1699 type = 'field
' # Only 'field
' is allowed in multiple for now
1700 actual_fields = self._get_field_setting(field, 'field
')
1702 def wrapped_function(values):
1703 values = tuple(filter(lambda x: x is not None, values))
1704 return (self._get_field_setting(field, 'function
')(*values) if len(values) > 1
1705 else values[0] if values
1708 value = wrapped_function((get_value(f) for f in actual_fields))
1710 value = get_value(field)
1711 return self._calculate_field_preference_from_value(format, field, type, value)
1713 def calculate_preference(self, format):
1714 # Determine missing protocol
1715 if not format.get('protocol
'):
1716 format['protocol
'] = determine_protocol(format)
1718 # Determine missing ext
1719 if not format.get('ext
') and 'url
' in format:
1720 format['ext
'] = determine_ext(format['url
'])
1721 if format.get('vcodec
') == 'none
':
1722 format['audio_ext
'] = format['ext
']
1723 format['video_ext
'] = 'none
'
1725 format['video_ext
'] = format['ext
']
1726 format['audio_ext
'] = 'none
'
1727 # if format.get('preference
') is None and format.get('ext
') in ('f4f
', 'f4m
'): # Not supported?
1728 # format['preference
'] = -1000
1730 # Determine missing bitrates
1731 if format.get('tbr
') is None:
1732 if format.get('vbr
') is not None and format.get('abr
') is not None:
1733 format['tbr
'] = format.get('vbr
', 0) + format.get('abr
', 0)
1735 if format.get('vcodec
') != "none" and format.get('vbr
') is None:
1736 format['vbr
'] = format.get('tbr
') - format.get('abr
', 0)
1737 if format.get('acodec
') != "none" and format.get('abr
') is None:
1738 format['abr
'] = format.get('tbr
') - format.get('vbr
', 0)
1740 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1742 def _sort_formats(self, formats, field_preference=[]):
1744 if self.get_param('ignore_no_formats_error
'):
1746 raise ExtractorError('No video formats found
')
1747 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1748 format_sort.evaluate_params(self._downloader.params, field_preference)
1749 if self.get_param('verbose
', False):
1750 format_sort.print_verbose_info(self._downloader.write_debug)
1751 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1753 def _check_formats(self, formats, video_id):
1755 formats[:] = filter(
1756 lambda f: self._is_valid_url(
1758 item='%s video format
' % f.get('format_id
') if f.get('format_id
') else 'video
'),
1762 def _remove_duplicate_formats(formats):
1766 if f['url
'] not in format_urls:
1767 format_urls.add(f['url
'])
1768 unique_formats.append(f)
1769 formats[:] = unique_formats
def _is_valid_url(self, url, video_id, item='video', headers={}):
    """Return True if `url` answers a probe request, False otherwise.

    Non-HTTP(S) URLs are assumed valid without any network check.
    NOTE(review): `headers` keeps its original mutable default for
    interface fidelity; it is only read, never mutated here.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not (url.startswith('http://') or url.startswith('https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
        return True
    except ExtractorError as e:
        self.to_screen(
            '%s: %s URL is invalid, skipping: %s'
            % (video_id, item, error_to_compat_str(e.cause)))
        return False
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self.get_param('prefer_insecure', False)
        else 'https:')
1792 def _proto_relative_url(self, url, scheme=None):
1795 if url.startswith('//'):
1797 scheme = self.http_scheme()
def _sleep(self, timeout, video_id, msg_template=None):
    """Announce and perform a wait of `timeout` seconds for `video_id`.

    `msg_template` may reference %(video_id)s and %(timeout)s.
    """
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None, data=None, headers={}, query={}):
    """Download an Adobe HDS (f4m) manifest and return its formats list.

    Returns [] when the download fails non-fatally; otherwise delegates
    parsing to _parse_f4m_formats.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal, data=data, headers=headers, query=query)

    if manifest is False:
        return []

    return self._parse_f4m_formats(
        manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
        transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1827 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1828 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1829 fatal=True, m3u8_id=None):
1830 if not isinstance(manifest, compat_etree_Element) and not fatal:
1833 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1834 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1835 if akamai_pv is not None and ';' in akamai_pv.text:
1836 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1837 if playerVerificationChallenge.strip() != '':
1841 manifest_version = '1.0'
1842 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1844 manifest_version = '2.0'
1845 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1846 # Remove unsupported DRM protected media from final formats
1847 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1848 media_nodes = remove_encrypted_media(media_nodes)
1852 manifest_base_url = get_base_url(manifest)
1854 bootstrap_info = xpath_element(
1855 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1856 'bootstrap info', default=None)
1859 mime_type = xpath_text(
1860 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1861 'base URL', default=None)
1862 if mime_type and mime_type.startswith('audio/'):
1865 for i, media_el in enumerate(media_nodes):
1866 tbr = int_or_none(media_el.attrib.get('bitrate'))
1867 width = int_or_none(media_el.attrib.get('width'))
1868 height = int_or_none(media_el.attrib.get('height'))
1869 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1870 # If <bootstrapInfo> is present, the specified f4m is a
1871 # stream-level manifest, and only set-level manifests may refer to
1872 # external resources. See section 11.4 and section 4 of F4M spec
1873 if bootstrap_info is None:
1875 # @href is introduced in 2.0, see section 11.6 of F4M spec
1876 if manifest_version == '2.0':
1877 media_url = media_el.attrib.get('href')
1878 if media_url is None:
1879 media_url = media_el.attrib.get('url')
1883 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1884 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1885 # If media_url is itself a f4m manifest do the recursive extraction
1886 # since bitrates in parent manifest (this one) and media_url manifest
1887 # may differ leading to inability to resolve the format by requested
1888 # bitrate in f4m downloader
1889 ext = determine_ext(manifest_url)
1891 f4m_formats = self._extract_f4m_formats(
1892 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1893 transform_source=transform_source, fatal=fatal)
1894 # Sometimes stream-level manifest contains single media entry that
1895 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1896 # At the same time parent's media entry in set-level manifest may
1897 # contain it. We will copy it from parent in such cases.
1898 if len(f4m_formats) == 1:
1901 'tbr': f.get('tbr') or tbr,
1902 'width': f.get('width') or width,
1903 'height': f.get('height') or height,
1904 'format_id': f.get('format_id') if not tbr else format_id,
1907 formats.extend(f4m_formats)
1910 formats.extend(self._extract_m3u8_formats(
1911 manifest_url, video_id, 'mp4', preference=preference,
1912 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1915 'format_id': format_id,
1916 'url': manifest_url,
1917 'manifest_url': manifest_url,
1918 'ext': 'flv' if bootstrap_info is not None else None,
1924 'preference': preference,
1929 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1931 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1935 'preference': preference - 100 if preference else -100,
1937 'resolution': 'multiple',
1938 'format_note': 'Quality selection URL',
def _extract_m3u8_formats(self, *args, **kwargs):
    """Compatibility wrapper returning only the formats from an HLS manifest.

    Subtitle tracks discovered alongside are dropped with a warning.
    """
    fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
    if subs:
        self.report_warning(bug_reports_message(
            "Ignoring subtitle tracks found in the HLS manifest; "
            "if any subtitle tracks are missing,"))
    return fmts
def _extract_m3u8_formats_and_subtitles(
        self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, note=None,
        errnote=None, fatal=True, live=False, data=None, headers={},
        query={}):
    """Download an HLS (m3u8) playlist and return (formats, subtitles).

    Returns ([], {}) when the download fails non-fatally; otherwise the
    final (post-redirect) URL is used and parsing is delegated to
    _parse_m3u8_formats_and_subtitles.
    NOTE(review): mutable defaults kept for interface fidelity; only read.
    """
    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note='Downloading m3u8 information' if note is None else note,
        errnote='Failed to download m3u8 information' if errnote is None else errnote,
        fatal=fatal, data=data, headers=headers, query=query)

    if res is False:
        return [], {}

    m3u8_doc, urlh = res
    m3u8_url = urlh.geturl()

    return self._parse_m3u8_formats_and_subtitles(
        m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
        preference=preference, quality=quality, m3u8_id=m3u8_id,
        note=note, errnote=errnote, fatal=fatal, live=live, data=data,
        headers=headers, query=query, video_id=video_id)
1974 def _parse_m3u8_formats_and_subtitles(
1975 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
1976 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1977 errnote=None, fatal=True, data=None, headers={}, query={},
1980 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1983 if (not self.get_param('allow_unplayable_formats')
1984 and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
1991 format_url = lambda u: (
1993 if re.match(r'^https?://', u)
1994 else compat_urlparse.urljoin(m3u8_url, u))
1996 split_discontinuity = self.get_param('hls_split_discontinuity', False)
1999 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2000 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2001 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2003 # We should try extracting formats only from master playlists [1, 4.3.4],
2004 # i.e. playlists that describe available qualities. On the other hand
2005 # media playlists [1, 4.3.3] should be returned as is since they contain
2006 # just the media without qualities renditions.
2007 # Fortunately, master playlist can be easily distinguished from media
2008 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2009 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2010 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2011 # media playlist and MUST NOT appear in master playlist thus we can
2012 # clearly detect media playlist with this criterion.
2014 def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None, video_id=None,
2015 fatal=True, data=None, headers={}):
2019 res = self._download_webpage_handle(
2020 format_url, video_id,
2022 errnote='Failed to download m3u8 playlist information',
2023 fatal=fatal, data=data, headers=headers)
2028 m3u8_doc, urlh = res
2029 format_url = urlh.geturl()
2031 playlist_formats = []
2034 if split_discontinuity
2041 for line in m3u8_doc.splitlines():
2042 if not line.startswith('#'):
2043 format_info['files'].append(line)
2044 elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
2046 playlist_formats.append(format_info)
2052 playlist_formats.append(format_info)
2053 return playlist_formats
2055 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2057 playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
2059 for format in playlist_formats:
2062 format_id.append(m3u8_id)
2063 format_index = format.get('index')
2065 format_id.append(str(format_index))
2067 'format_id': '-'.join(format_id),
2068 'format_index': format_index,
2071 'protocol': entry_protocol,
2072 'preference': preference,
2077 return formats, subtitles
2080 last_stream_inf = {}
2082 def extract_media(x_media_line):
2083 media = parse_m3u8_attributes(x_media_line)
2084 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2085 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2086 if not (media_type and group_id and name):
2088 groups.setdefault(group_id, []).append(media)
2089 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2090 if media_type == 'SUBTITLES':
2091 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2092 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2093 # However, lack of URI has been spotted in the wild.
2094 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2095 if not media.get('URI'):
2097 url = format_url(media['URI'])
2100 'ext': determine_ext(url),
2102 if sub_info['ext'] == 'm3u8':
2103 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2104 # files may contain is WebVTT:
2105 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2106 sub_info['ext'] = 'vtt'
2107 sub_info['protocol'] = 'm3u8_native'
2108 lang = media.get('LANGUAGE') or 'und'
2109 subtitles.setdefault(lang, []).append(sub_info)
2110 if media_type not in ('VIDEO', 'AUDIO'):
2112 media_url = media.get('URI')
2114 manifest_url = format_url(media_url)
2116 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2117 fatal=fatal, data=data, headers=headers)
2119 for format in playlist_formats:
2120 format_index = format.get('index')
2121 for v in (m3u8_id, group_id, name):
2125 format_id.append(str(format_index))
2127 'format_id': '-'.join(format_id),
2128 'format_index': format_index,
2129 'url': manifest_url,
2130 'manifest_url': m3u8_url,
2131 'language': media.get('LANGUAGE'),
2133 'protocol': entry_protocol,
2134 'preference': preference,
2137 if media_type == 'AUDIO':
2138 f['vcodec'] = 'none'
2141 def build_stream_name():
2142 # Despite specification does not mention NAME attribute for
2143 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2144 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2145 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2146 stream_name = last_stream_inf.get('NAME')
2149 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2150 # from corresponding rendition group
2151 stream_group_id = last_stream_inf.get('VIDEO')
2152 if not stream_group_id:
2154 stream_group = groups.get(stream_group_id)
2155 if not stream_group:
2156 return stream_group_id
2157 rendition = stream_group[0]
2158 return rendition.get('NAME') or stream_group_id
2160 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2161 # chance to detect video only formats when EXT-X-STREAM-INF tags
2162 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2163 for line in m3u8_doc.splitlines():
2164 if line.startswith('#EXT-X-MEDIA:'):
2167 for line in m3u8_doc.splitlines():
2168 if line.startswith('#EXT-X-STREAM-INF:'):
2169 last_stream_inf = parse_m3u8_attributes(line)
2170 elif line.startswith('#') or not line.strip():
2173 tbr = float_or_none(
2174 last_stream_inf.get('AVERAGE-BANDWIDTH')
2175 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2176 manifest_url = format_url(line.strip())
2178 playlist_formats = _extract_m3u8_playlist_formats(manifest_url, video_id=video_id,
2179 fatal=fatal, data=data, headers=headers)
2181 for frmt in playlist_formats:
2184 format_id.append(m3u8_id)
2185 format_index = frmt.get('index')
2186 stream_name = build_stream_name()
2187 # Bandwidth of live streams may differ over time thus making
2188 # format_id unpredictable. So it's better to keep provided
2191 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
2193 format_id.append(str(format_index))
2195 'format_id': '-'.join(format_id),
2196 'format_index': format_index,
2197 'url': manifest_url,
2198 'manifest_url': m3u8_url,
2201 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2202 'protocol': entry_protocol,
2203 'preference': preference,
2206 resolution = last_stream_inf.get('RESOLUTION')
2208 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2210 f['width'] = int(mobj.group('width'))
2211 f['height'] = int(mobj.group('height'))
2212 # Unified Streaming Platform
2214 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2216 abr, vbr = mobj.groups()
2217 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2222 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2224 audio_group_id = last_stream_inf.get('AUDIO')
2225 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2226 # references a rendition group MUST have a CODECS attribute.
2227 # However, this is not always respected, for example, [2]
2228 # contains EXT-X-STREAM-INF tag which references AUDIO
2229 # rendition group but does not have CODECS and despite
2230 # referencing an audio group it represents a complete
2231 # (with audio and video) format. So, for such cases we will
2232 # ignore references to rendition groups and treat them
2233 # as complete formats.
2234 if audio_group_id and codecs and f.get('vcodec') != 'none':
2235 audio_group = groups.get(audio_group_id)
2236 if audio_group and audio_group[0].get('URI'):
2237 # TODO: update acodec for audio only formats with
2239 f['acodec'] = 'none'
2240 if not f.get('ext'):
2241 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2245 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2248 del http_f['manifest_url']
2250 'format_id': f['format_id'].replace('hls-', 'http-'),
2252 'url': progressive_uri,
2254 formats.append(http_f)
2256 last_stream_inf = {}
2257 return formats, subtitles
2260 def _xpath_ns(path, namespace=None):
2264 for c in path.split('/'):
2265 if not c or c == '.':
2268 out.append('{%s}%s' % (namespace, c))
2269 return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download a SMIL document and return the formats it describes.

    Returns [] when the download fails non-fatally.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if smil is False:
        assert not fatal
        return []

    namespace = self._parse_smil_namespace(smil)

    return self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return a full info dict (or {} on
    non-fatal download failure)."""
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is False:
        return {}
    return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch and XML-parse a SMIL document via _download_xml."""
    return self._download_xml(
        smil_url, video_id,
        'Downloading SMIL file', 'Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict (id, title, formats, subtitles, ...) from a
    parsed SMIL document.

    Metadata is pulled from <head><meta> entries (title/description/date)
    and <image> elements become thumbnails; the video id falls back to the
    SMIL file's basename.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]
    title = None
    description = None
    upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not name or not content:
            continue
        # First matching meta wins for each field.
        if not title and name == 'title':
            title = content
        elif not description and name in ('description', 'abstract'):
            description = content
        elif not upload_date and name == 'date':
            upload_date = unified_strdate(content)

    thumbnails = [{
        'id': image.get('type'),
        'url': image.get('src'),
        'width': int_or_none(image.get('width')),
        'height': int_or_none(image.get('height')),
    } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
def _parse_smil_namespace(self, smil):
    """Return the XML namespace of the root <smil> tag, or None."""
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2338 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2340 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2341 b = meta.get('base') or meta.get('httpBase')
2352 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2353 for medium in media:
2354 src = medium.get('src')
2355 if not src or src in srcs:
2359 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2360 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2361 width = int_or_none(medium.get('width'))
2362 height = int_or_none(medium.get('height'))
2363 proto = medium.get('proto')
2364 ext = medium.get('ext')
2365 src_ext = determine_ext(src)
2366 streamer = medium.get('streamer') or base
2368 if proto == 'rtmp' or streamer.startswith('rtmp'):
2374 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2376 'filesize': filesize,
2380 if transform_rtmp_url:
2381 streamer, src = transform_rtmp_url(streamer, src)
2382 formats[-1].update({
2388 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
2389 src_url = src_url.strip()
2391 if proto == 'm3u8' or src_ext == 'm3u8':
2392 m3u8_formats = self._extract_m3u8_formats(
2393 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2394 if len(m3u8_formats) == 1:
2396 m3u8_formats[0].update({
2397 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2402 formats.extend(m3u8_formats)
2403 elif src_ext == 'f4m':
2408 'plugin': 'flowplayer-3.2.0.1',
2410 f4m_url += '&' if '?' in f4m_url else '?'
2411 f4m_url += compat_urllib_parse_urlencode(f4m_params)
2412 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2413 elif src_ext == 'mpd':
2414 formats.extend(self._extract_mpd_formats(
2415 src_url, video_id, mpd_id='dash', fatal=False))
2416 elif re.search(r'\.ism/[Mm]anifest', src_url):
2417 formats.extend(self._extract_ism_formats(
2418 src_url, video_id, ism_id='mss', fatal=False))
2419 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
2423 'ext': ext or src_ext or 'flv',
2424 'format_id': 'http-%d' % (bitrate or http_count),
2426 'filesize': filesize,
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitle tracks from <textstream> elements of a SMIL doc.

    Returns a dict mapping language -> list of {'url', 'ext'} entries;
    duplicate sources are skipped and `subtitles_lang` is the fallback
    language.
    """
    urls = []
    subtitles = {}
    for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
        src = textstream.get('src')
        if not src or src in urls:
            continue
        urls.append(src)
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
        lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return its parsed entries.

    Returns [] when the download fails non-fatally.
    """
    xspf = self._download_xml(
        xspf_url, playlist_id, 'Downloading xpsf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf is False:
        return []
    return self._parse_xspf(
        xspf, playlist_id, xspf_url=xspf_url,
        xspf_base_url=base_url(xspf_url))
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn an XSPF playlist document into a list of entry info dicts.

    Each <track> contributes title/description/thumbnail/duration and a
    formats list built from its <location> children (with StreamOne 's1'
    extension attributes for label/width/height).
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    entries = []
    for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
        title = xpath_text(
            track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
        description = xpath_text(
            track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
        thumbnail = xpath_text(
            track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
        duration = float_or_none(
            xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

        formats = []
        for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
            format_url = urljoin(xspf_base_url, location.text)
            if not format_url:
                continue
            formats.append({
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
def _extract_mpd_formats(self, *args, **kwargs):
    """Compatibility wrapper returning only the formats from a DASH MPD.

    Subtitle tracks discovered alongside are dropped with a warning.
    """
    fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if subs:
        self.report_warning(bug_reports_message(
            "Ignoring subtitle tracks found in the DASH manifest; "
            "if any subtitle tracks are missing,"))
    return fmts
def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """Download a DASH MPD manifest and return (formats, subtitles).

    Returns ([], {}) when the download fails non-fatally; the manifest
    base URL is derived from the final (post-redirect) URL.
    NOTE(review): mutable defaults kept for interface fidelity; only read.
    """
    res = self._download_xml_handle(
        mpd_url, video_id,
        note='Downloading MPD manifest' if note is None else note,
        errnote='Failed to download MPD manifest' if errnote is None else errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if res is False:
        return [], {}
    mpd_doc, urlh = res
    if mpd_doc is None:
        return [], {}
    mpd_base_url = base_url(urlh.geturl())

    return self._parse_mpd_formats_and_subtitles(
        mpd_doc, mpd_id, mpd_base_url, mpd_url)
def _parse_mpd_formats(self, *args, **kwargs):
    """Compatibility wrapper returning only the formats from a parsed MPD.

    Subtitle tracks discovered alongside are dropped with a warning.
    """
    fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if subs:
        self.report_warning(bug_reports_message(
            "Ignoring subtitle tracks found in the DASH manifest; "
            "if any subtitle tracks are missing,"))
    return fmts
2536 def _parse_mpd_formats_and_subtitles(
2537 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2539 Parse formats
from MPD manifest
.
2541 1. MPEG
-DASH Standard
, ISO
/IEC
23009-1:2014(E
),
2542 http
://standards
.iso
.org
/ittf
/PubliclyAvailableStandards
/c065274_ISO_IEC_23009
-1_2014.zip
2543 2. https
://en
.wikipedia
.org
/wiki
/Dynamic_Adaptive_Streaming_over_HTTP
2545 if not self.get_param('dynamic_mpd', True):
2546 if mpd_doc.get('type') == 'dynamic':
2549 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2552 return self._xpath_ns(path, namespace)
2554 def is_drm_protected(element):
2555 return element.find(_add_ns('ContentProtection')) is not None
2557 def extract_multisegment_info(element, ms_parent_info):
2558 ms_info = ms_parent_info.copy()
2560 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2561 # common attributes and elements. We will only extract relevant
2563 def extract_common(source):
2564 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2565 if segment_timeline is not None:
2566 s_e = segment_timeline.findall(_add_ns('S'))
2568 ms_info['total_number'] = 0
2571 r = int(s.get('r', 0))
2572 ms_info['total_number'] += 1 + r
2573 ms_info['s'].append({
2574 't': int(s.get('t', 0)),
2575 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2576 'd': int(s.attrib['d']),
2579 start_number = source.get('startNumber')
2581 ms_info['start_number'] = int(start_number)
2582 timescale = source.get('timescale')
2584 ms_info['timescale'] = int(timescale)
2585 segment_duration = source.get('duration')
2586 if segment_duration:
2587 ms_info['segment_duration'] = float(segment_duration)
2589 def extract_Initialization(source):
2590 initialization = source.find(_add_ns('Initialization'))
2591 if initialization is not None:
2592 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2594 segment_list = element.find(_add_ns('SegmentList'))
2595 if segment_list is not None:
2596 extract_common(segment_list)
2597 extract_Initialization(segment_list)
2598 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2600 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2602 segment_template = element.find(_add_ns('SegmentTemplate'))
2603 if segment_template is not None:
2604 extract_common(segment_template)
2605 media = segment_template.get('media')
2607 ms_info['media'] = media
2608 initialization = segment_template.get('initialization')
2610 ms_info['initialization'] = initialization
2612 extract_Initialization(segment_template)
2615 skip_unplayable = not self.get_param('allow_unplayable_formats')
2617 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2620 for period in mpd_doc.findall(_add_ns('Period')):
2621 period_duration = parse_duration(period.get('duration')) or mpd_duration
2622 period_ms_info = extract_multisegment_info(period, {
2626 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2627 if skip_unplayable and is_drm_protected(adaptation_set):
2629 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2630 for representation in adaptation_set.findall(_add_ns('Representation')):
2631 if skip_unplayable and is_drm_protected(representation):
2633 representation_attrib = adaptation_set.attrib.copy()
2634 representation_attrib.update(representation.attrib)
2635 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2636 mime_type = representation_attrib['mimeType']
2637 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2639 if content_type in ('video', 'audio', 'text'):
2641 for element in (representation, adaptation_set, period, mpd_doc):
2642 base_url_e = element.find(_add_ns('BaseURL'))
2643 if base_url_e is not None:
2644 base_url = base_url_e.text + base_url
2645 if re.match(r'^https?://', base_url):
2647 if mpd_base_url and not re.match(r'^https?://', base_url):
2648 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2650 base_url = mpd_base_url + base_url
2651 representation_id = representation_attrib.get('id')
2652 lang = representation_attrib.get('lang')
2653 url_el = representation.find(_add_ns('BaseURL'))
2654 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2655 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2656 if content_type in ('video', 'audio'):
2658 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2659 'manifest_url': mpd_url,
2660 'ext': mimetype2ext(mime_type),
2661 'width': int_or_none(representation_attrib.get('width')),
2662 'height': int_or_none(representation_attrib.get('height')),
2663 'tbr': float_or_none(bandwidth, 1000),
2664 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2665 'fps': int_or_none(representation_attrib.get('frameRate')),
2666 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2667 'format_note': 'DASH %s' % content_type,
2668 'filesize': filesize,
2669 'container': mimetype2ext(mime_type) + '_dash',
2671 f.update(parse_codecs(representation_attrib.get('codecs')))
2672 elif content_type == 'text':
2674 'ext': mimetype2ext(mime_type),
2675 'manifest_url': mpd_url,
2676 'filesize': filesize,
2678 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2680 def prepare_template(template_name, identifiers):
2681 tmpl = representation_ms_info[template_name]
2682 # First of, % characters outside $...$ templates
2683 # must be escaped by doubling for proper processing
2684 # by % operator string formatting used further (see
2685 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2691 in_template = not in_template
2692 elif c == '%' and not in_template:
2694 # Next, $...$ templates are translated to their
2695 # %(...) counterparts to be used with % operator
2696 t = t.replace('$RepresentationID$', representation_id)
2697 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2698 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2699 t.replace('$$', '$')
2702 # @initialization is a regular template like @media one
2703 # so it should be handled just the same way (see
2704 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2705 if 'initialization' in representation_ms_info:
2706 initialization_template = prepare_template(
2708 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2709 # $Time$ shall not be included for @initialization thus
2710 # only $Bandwidth$ remains
2712 representation_ms_info['initialization_url'] = initialization_template % {
2713 'Bandwidth': bandwidth,
2716 def location_key(location):
2717 return 'url' if re.match(r'^https?://', location) else 'path'
2719 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2721 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2722 media_location_key = location_key(media_template)
2724 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2725 # can't be used at the same time
2726 if '%(Number' in media_template and 's' not in representation_ms_info:
2727 segment_duration = None
2728 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2729 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2730 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2731 representation_ms_info['fragments'] = [{
2732 media_location_key: media_template % {
2733 'Number': segment_number,
2734 'Bandwidth': bandwidth,
2736 'duration': segment_duration,
2737 } for segment_number in range(
2738 representation_ms_info['start_number'],
2739 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2741 # $Number*$ or $Time$ in media template with S list available
2742 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2743 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2744 representation_ms_info['fragments'] = []
2747 segment_number = representation_ms_info['start_number']
2749 def add_segment_url():
2750 segment_url = media_template % {
2751 'Time': segment_time,
2752 'Bandwidth': bandwidth,
2753 'Number': segment_number,
2755 representation_ms_info['fragments'].append({
2756 media_location_key: segment_url,
2757 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2760 for num, s in enumerate(representation_ms_info['s']):
2761 segment_time = s.get('t') or segment_time
2765 for r in range(s.get('r', 0)):
2766 segment_time += segment_d
2769 segment_time += segment_d
2770 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2772 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2773 # or any YouTube dashsegments video
2776 timescale = representation_ms_info['timescale']
2777 for s in representation_ms_info['s']:
2778 duration = float_or_none(s['d'], timescale)
2779 for r in range(s.get('r', 0) + 1):
2780 segment_uri = representation_ms_info['segment_urls'][segment_index]
2782 location_key(segment_uri): segment_uri,
2783 'duration': duration,
2786 representation_ms_info['fragments'] = fragments
2787 elif 'segment_urls' in representation_ms_info:
2788 # Segment URLs with no SegmentTimeline
2789 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2790 # https://github.com/ytdl-org/youtube-dl/pull/14844
2792 segment_duration = float_or_none(
2793 representation_ms_info['segment_duration'],
2794 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2795 for segment_url in representation_ms_info['segment_urls']:
2797 location_key(segment_url): segment_url,
2799 if segment_duration:
2800 fragment['duration'] = segment_duration
2801 fragments.append(fragment)
2802 representation_ms_info['fragments'] = fragments
2803 # If there is a fragments key available then we correctly recognized fragmented media.
2804 # Otherwise we will assume unfragmented media with direct access. Technically, such
2805 # assumption is not necessarily correct since we may simply have no support for
2806 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2807 if 'fragments' in representation_ms_info:
2809 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2810 'url': mpd_url or base_url,
2811 'fragment_base_url': base_url,
2813 'protocol': 'http_dash_segments',
2815 if 'initialization_url' in representation_ms_info:
2816 initialization_url = representation_ms_info['initialization_url']
2817 if not f.get('url'):
2818 f['url'] = initialization_url
2819 f['fragments'].append({location_key(initialization_url): initialization_url})
2820 f['fragments'].extend(representation_ms_info['fragments'])
2822 # Assuming direct URL to unfragmented media.
2824 if content_type in ('video', 'audio'):
2826 elif content_type == 'text':
2827 subtitles.setdefault(lang or 'und', []).append(f)
2829 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2830 return formats, subtitles
def _extract_ism_formats(self, *args, **kwargs):
    """Compatibility wrapper returning only the formats from an ISM manifest.

    Subtitle tracks discovered alongside are dropped with a warning.
    """
    fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if subs:
        self.report_warning(bug_reports_message(
            "Ignoring subtitle tracks found in the ISM manifest; "
            "if any subtitle tracks are missing,"))
    return fmts
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download a Smooth Streaming (ISM) manifest and return
    (formats, subtitles).

    Returns ([], {}) when the download fails non-fatally; parsing is
    delegated to _parse_ism_formats_and_subtitles with the final URL.
    NOTE(review): mutable defaults kept for interface fidelity; only read.
    """
    res = self._download_xml_handle(
        ism_url, video_id,
        note='Downloading ISM manifest' if note is None else note,
        errnote='Failed to download ISM manifest' if errnote is None else errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if res is False:
        return [], {}
    ism_doc, urlh = res
    if ism_doc is None:
        return [], {}

    return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.
    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    # Live streams cannot be described by this static fragment model.
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}
    # DRM-protected manifests are skipped unless unplayable formats are allowed.
    if (not self.get_param('allow_unplayable_formats')
            and ism_doc.find('Protection') is not None):
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    # Default ISM timescale is 10,000,000 ticks per second (100ns units).
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        # A StreamIndex may override the manifest-level timescale.
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        for track in stream.findall('QualityLevel'):
            # AudioTag 255 is AAC even when FourCC is absent.
            fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            # Substitute {Bitrate}/{bitrate} into the URL template, then
            # resolve relative to the manifest URL.
            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                # 't' (start time) defaults to the running clock; 'r' is a
                # repeat count; 'd' is the per-fragment duration in ticks.
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: infer it from the next fragment's
                    # start time (or the total duration for the last one).
                    try:
                        next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                    except IndexError:
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            format_id = []
            if ism_id:
                format_id.append(ism_id)
            if stream_name:
                format_id.append(stream_name)
            format_id.append(compat_str(tbr))

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': '-'.join(format_id),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
    # Scan *webpage* for HTML5 <video>/<audio> (and amp-/dl8- variants)
    # and build a list of media_info entries (formats + subtitles).

    def absolute_url(item_url):
        # Resolve a possibly-relative URL against the page URL.
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Extract ext/codec info from a MIME type like
        # 'video/mp4; codecs="avc1.42E01E"'. Returns {} when unknown.
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type, type_info={}):
        # Turn one src attribute into formats. Manifest URLs (m3u8/mpd)
        # are expanded; anything else is treated as a plain file URL.
        full_url = absolute_url(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
            }]
        return is_plain_url, formats

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we wll include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    media_tags = [(media_tag, media_tag_name, media_type, '')
                  for media_tag, media_tag_name, media_type
                  in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
        # http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        src = strip_or_none(media_attributes.get('src'))
        if src:
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if is_plain_url:
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        s_attr.get(lbl)
                        for lbl in ('label', 'title')
                        if str_or_none(s_attr.get(lbl))
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = (int_or_none(s_attr.get('height'))
                              or int_or_none(s_attr.get('res')))
                    if not width or not height:
                        # Fall back to parsing "1080p"-style labels.
                        for lbl in labels:
                            resolution = parse_resolution(lbl)
                            if not resolution:
                                continue
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                    for lbl in labels:
                        tbr = parse_bitrate(lbl)
                        if tbr:
                            break
                    else:
                        tbr = None
                    f.update({
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                    })
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        for f in media_info['formats']:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
def _extract_akamai_formats(self, *args, **kwargs):
    # Backwards-compatible wrapper: returns only formats and warns when
    # subtitle tracks found in the manifests had to be dropped.
    fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
    if subs:
        self.report_warning(bug_reports_message(
            "Ignoring subtitle tracks found in the manifests; "
            "if any subtitle tracks are missing,"
        ))
    return fmts
def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
    # Expand a single Akamai manifest URL into HDS, HLS and (when
    # derivable) progressive HTTP formats. *hosts* may override the host
    # used per protocol ('hds'/'hls'/'http').
    signed = 'hdnea=' in manifest_url
    if not signed:
        # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
        manifest_url = re.sub(
            r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
            '', manifest_url).strip('?')

    formats = []
    subtitles = {}

    hdcore_sign = 'hdcore=3.7.0'
    # Derive the HDS (f4m) URL from the HLS-style one: /i/ -> /z/.
    f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
    hds_host = hosts.get('hds')
    if hds_host:
        f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
    if 'hdcore=' not in f4m_url:
        f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
    f4m_formats = self._extract_f4m_formats(
        f4m_url, video_id, f4m_id='hds', fatal=False)
    for entry in f4m_formats:
        # The hdcore param must also be carried into segment URLs.
        entry.update({'extra_param_to_segment_url': hdcore_sign})
    formats.extend(f4m_formats)

    # Derive the HLS (m3u8) URL: /z/ -> /i/.
    m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
    hls_host = hosts.get('hls')
    if hls_host:
        m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
    m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
        m3u8_url, video_id, 'mp4', 'm3u8_native',
        m3u8_id='hls', fatal=False)
    formats.extend(m3u8_formats)
    subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

    http_host = hosts.get('http')
    # Progressive HTTP variants can only be synthesized for unsigned URLs
    # whose .csmil path lists the per-quality file names.
    if http_host and m3u8_formats and not signed:
        REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
        qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
        qualities_length = len(qualities)
        # +1 accounts for a possible audio-only rendition in the playlist.
        if len(m3u8_formats) in (qualities_length, qualities_length + 1):
            i = 0
            for f in m3u8_formats:
                if f['vcodec'] != 'none':
                    for protocol in ('http', 'https'):
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_url = re.sub(
                            REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                        http_f.update({
                            'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                            'url': http_url,
                            'protocol': protocol,
                        })
                        formats.append(http_f)
                    i += 1
    return formats, subtitles
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
    # Probe a Wowza streaming server URL for all protocol variants
    # (HLS, HDS, DASH, SMIL/RTMP/RTSP), skipping those in *skip_protocols*.
    query = compat_urlparse.urlparse(url).query
    # Strip a trailing manifest path so we can re-append per protocol.
    url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
    mobj = re.search(
        r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
    url_base = mobj.group('url')
    http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
    formats = []

    def manifest_url(manifest):
        # Build an http(s) manifest URL, preserving the original query.
        m_url = '%s/%s' % (http_base_url, manifest)
        if query:
            m_url += '?%s' % query
        return m_url

    if 'm3u8' not in skip_protocols:
        formats.extend(self._extract_m3u8_formats(
            manifest_url('playlist.m3u8'), video_id, 'mp4',
            m3u8_entry_protocol, m3u8_id='hls', fatal=False))
    if 'f4m' not in skip_protocols:
        formats.extend(self._extract_f4m_formats(
            manifest_url('manifest.f4m'),
            video_id, f4m_id='hds', fatal=False))
    if 'dash' not in skip_protocols:
        formats.extend(self._extract_mpd_formats(
            manifest_url('manifest.mpd'),
            video_id, mpd_id='dash', fatal=False))
    if re.search(r'(?:/smil:|\.smil)', url_base):
        if 'smil' not in skip_protocols:
            rtmp_formats = self._extract_smil_formats(
                manifest_url('jwplayer.smil'),
                video_id, fatal=False)
            for rtmp_format in rtmp_formats:
                # Mirror each RTMP format as an RTSP one: merge the play
                # path into the URL and swap the scheme.
                rtsp_format = rtmp_format.copy()
                rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                del rtsp_format['play_path']
                del rtsp_format['ext']
                rtsp_format.update({
                    'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                    'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    'protocol': 'rtsp',
                })
                formats.extend([rtmp_format, rtsp_format])
    else:
        for protocol in ('rtmp', 'rtsp'):
            if protocol not in skip_protocols:
                formats.append({
                    'url': '%s:%s' % (protocol, url_base),
                    'format_id': protocol,
                    'protocol': protocol,
                })
    return formats
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    # Locate a jwplayer("...").setup({...}) call in the page and return
    # its options parsed as a dict; returns None when absent or invalid.
    mobj = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if mobj:
        try:
            jwplayer_data = self._parse_json(mobj.group('options'),
                                             video_id=video_id,
                                             transform_source=transform_source)
        except ExtractorError:
            # Unparseable setup options: treat as "no jwplayer data".
            pass
        else:
            if isinstance(jwplayer_data, dict):
                return jwplayer_data
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find jwplayer setup data in *webpage* and parse it into entries."""
    # Two-step pipeline: locate the raw setup options, then parse them;
    # any extra args/kwargs are forwarded verbatim to the parsing step.
    data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(data, video_id, *args, **kwargs)
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    # Convert a jwplayer setup dict into info-dict entries. Returns a
    # single entry when there is exactly one, otherwise a playlist result.

    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    entries = []

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    if not isinstance(jwplayer_data['playlist'], list):
        jwplayer_data['playlist'] = [jwplayer_data['playlist']]

    for video_data in jwplayer_data['playlist']:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                if not isinstance(track, dict):
                    continue
                track_kind = track.get('kind')
                if not track_kind or not isinstance(track_kind, compat_str):
                    continue
                # Only caption/subtitle tracks; thumbnails/chapters skipped.
                if track_kind.lower() not in ('captions', 'subtitles'):
                    continue
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                subtitles.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                })

        entry = {
            'id': this_video_id,
            'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
            # A lone YouTube URL: delegate to the YouTube extractor.
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            self._sort_formats(formats)
            entry['formats'] = formats
        entries.append(entry)
    if len(entries) == 1:
        return entries[0]
    else:
        return self.playlist_result(entries)
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    # Turn a jwplayer 'sources' list into formats, de-duplicating by URL
    # and expanding manifest sources (HLS/DASH/SMIL).
    urls = []
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in urls:
            continue
        urls.append(source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
        elif source_type == 'dash' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        elif source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
        else:
            height = int_or_none(source.get('height'))
            if height is None:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = int_or_none(self._search_regex(
                    r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                    'height', default=None))
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate')),
                'ext': ext,
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                rtmp_url_parts = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, 1)
                if len(rtmp_url_parts) == 3:
                    rtmp_url, prefix, play_path = rtmp_url_parts
                    a_format.update({
                        'url': rtmp_url,
                        'play_path': prefix + play_path,
                    })
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
    return formats
3370 def _live_title(self, name):
3371 """ Generate the title
for a live video
"""
3372 now = datetime.datetime.now()
3373 now_str = now.strftime('%Y-%m-%d %H:%M')
3374 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """
    Parse *v* as an integer via int_or_none, labelling failures as *name*.

    On parse failure: raises ExtractorError when fatal=True, otherwise
    emits a warning and returns None. Extra kwargs (scale, get_attr, ...)
    are forwarded to int_or_none.
    """
    # Removed a stray debug print of getattr(v, kwargs['get_attr']) that
    # leaked to stdout; int_or_none already consumes get_attr itself.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self.report_warning(msg)
    return res
def _float(self, v, name, fatal=False, **kwargs):
    # Parse *v* as a float via float_or_none, labelling failures as *name*.
    # On failure: raise ExtractorError when fatal=True, else warn and
    # return None. Extra kwargs are forwarded to float_or_none.
    res = float_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self.report_warning(msg)
    return res
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                path='/', secure=False, discard=False, rest={}, **kwargs):
    """Add a cookie named *name*=*value* for *domain* to the downloader's jar."""
    # version=0 (Netscape-style); domain_specified=True; initial_dot is
    # derived from a leading '.' in *domain*; port_specified tracks *port*.
    cookie = compat_cookiejar_Cookie(
        0, name, value, port, port is not None, domain, True,
        domain.startswith('.'), path, True, secure, expire_time,
        discard, None, None, rest)
    self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
    """ Return a compat_cookies_SimpleCookie with the cookies for the url """
    # Let the cookiejar compute the applicable Cookie header for a dummy
    # request to *url*, then parse that header back into a SimpleCookie.
    req = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(req)
    return compat_cookies_SimpleCookie(req.get_header('Cookie'))
def _apply_first_set_cookie_header(self, url_handle, cookie):
    """
    Apply first Set-Cookie header instead of the last. Experimental.

    Some sites (e.g. [1-3]) may serve two cookies under the same name
    in Set-Cookie header and expect the first (old) one to be set rather
    than second (new). However, as of RFC6265 the newer one cookie
    should be set into cookie store what actually happens.
    We will workaround this issue by resetting the cookie to
    the first one manually.
    1. https://new.vk.com/
    2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
    3. https://learning.oreilly.com/
    """
    for header, cookies in url_handle.headers.items():
        if header.lower() != 'set-cookie':
            continue
        if sys.version_info[0] >= 3:
            # On py3 header values are surrogate-decoded latin-1; round-trip
            # through iso-8859-1 bytes to recover the original utf-8 text.
            cookies = cookies.encode('iso-8859-1')
        cookies = cookies.decode('utf-8')
        cookie_value = re.search(
            r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
        if cookie_value:
            value, domain = cookie_value.groups()
            # Overwrite the jar entry with the FIRST occurrence found.
            self._set_cookie(domain, cookie, value)
            break
def get_testcases(self, include_onlymatching=False):
    # Yield this extractor's test cases from _TEST/_TESTS, tagging each
    # with the extractor name (class name minus the 'IE' suffix).
    t = getattr(self, '_TEST', None)
    if t:
        # An extractor must define _TEST or _TESTS, never both.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = [t]
    else:
        tests = getattr(self, '_TESTS', [])
    for t in tests:
        # 'only_matching' cases only verify URL matching; skip unless asked.
        if not include_onlymatching and t.get('only_matching', False):
            continue
        t['name'] = type(self).__name__[:-len('IE')]
        yield t
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    any_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        if tc.get('playlist', []):
            # Judge playlist test cases by their first entry.
            tc = tc['playlist'][0]
        is_restricted = age_restricted(
            tc.get('info_dict', {}).get('age_limit'), age_limit)
        if not is_restricted:
            # At least one unrestricted test case: extractor is suitable.
            return True
        any_restricted = any_restricted or is_restricted
    # No test cases at all also yields True (assume suitable).
    return not any_restricted
def extract_subtitles(self, *args, **kwargs):
    # Only run the (potentially expensive) subtitle extraction when the
    # user asked to write or list subtitles; otherwise return empty.
    if (self.get_param('writesubtitles', False)
            or self.get_param('listsubtitles')):
        return self._get_subtitles(*args, **kwargs)
    return {}
def _get_subtitles(self, *args, **kwargs):
    # Hook overridden by extractors that support subtitles.
    raise NotImplementedError('This method must be implemented by subclasses')
3478 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3479 """ Merge subtitle items
for one language
. Items
with duplicated URLs
3480 will be dropped
. """
3481 list1_urls = set([item['url'] for item in subtitle_list1])
3482 ret = list(subtitle_list1)
3483 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
def _merge_subtitles(cls, *dicts, **kwargs):
    """ Merge subtitle dictionaries, language by language. """

    target = (lambda target=None: target)(**kwargs)
    # The above lambda extracts the keyword argument 'target' from kwargs
    # while ensuring there are no stray ones. When Python 2 support
    # is dropped, remove it and change the function signature to:
    #
    #     def _merge_subtitles(cls, *dicts, target=None):

    if target is None:
        target = {}
    for d in dicts:
        for lang, subs in d.items():
            # Per-language merge de-duplicates items by URL.
            target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
    return target
def extract_automatic_captions(self, *args, **kwargs):
    # Only run automatic-caption extraction when the user asked to write
    # automatic subtitles or to list subtitles; otherwise return empty.
    if (self.get_param('writeautomaticsub', False)
            or self.get_param('listsubtitles')):
        return self._get_automatic_captions(*args, **kwargs)
    return {}
def _get_automatic_captions(self, *args, **kwargs):
    # Hook overridden by extractors that support automatic captions.
    raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
    # Only mark videos watched when the user opted in AND we have either
    # login credentials or a cookie file to act on their behalf.
    if (self.get_param('mark_watched', False)
            and (self._get_login_info()[0] is not None
                 or self.get_param('cookiefile') is not None)):
        self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
    # Hook overridden by extractors that can mark videos as watched.
    raise NotImplementedError('This method must be implemented by subclasses')
def geo_verification_headers(self):
    """Build the extra request headers used for geo-verification."""
    # Only one header is ever emitted, and only when the
    # 'geo_verification_proxy' param is configured.
    geo_verification_proxy = self.get_param('geo_verification_proxy')
    if not geo_verification_proxy:
        return {}
    return {'Ytdl-request-proxy': geo_verification_proxy}
def _generic_id(self, url):
    """Derive a fallback video id from the last path component of *url*."""
    last_segment = url.rstrip('/').split('/')[-1]
    stem = os.path.splitext(last_segment)[0]
    return compat_urllib_parse_unquote(stem)
def _generic_title(self, url):
    """Derive a fallback title from the basename of *url*, extension dropped."""
    stem = os.path.splitext(url_basename(url))[0]
    return compat_urllib_parse_unquote(stem)
3536 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3537 all_known = all(map(
3538 lambda x: x is not None,
3539 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3541 'private' if is_private
3542 else 'premium_only' if needs_premium
3543 else 'subscriber_only' if needs_subscription
3544 else 'needs_auth' if needs_auth
3545 else 'unlisted' if is_unlisted
3546 else 'public' if all_known
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches '<key>:<query>', '<key>N:<query>' (N >= 1) or '<key>all:<query>'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        # Parse the search "URL" into a result-count prefix and the query.
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare '<key>:' means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY