2 from __future__
import unicode_literals
17 from ..compat
import (
18 compat_cookiejar_Cookie
,
19 compat_cookies_SimpleCookie
,
21 compat_etree_fromstring
,
28 compat_urllib_parse_unquote
,
29 compat_urllib_parse_urlencode
,
30 compat_urllib_request
,
32 compat_xml_parse_error
,
34 from ..downloader
import FileDownloader
35 from ..downloader
.f4m
import (
37 remove_encrypted_media
,
68 parse_m3u8_attributes
,
93 class InfoExtractor(object):
94 """Information Extractor class.
96 Information extractors are the classes that, given a URL, extract
97 information about the video (or videos) the URL refers to. This
98 information includes the real video URL, the video title, author and
99 others. The information is stored in a dictionary which is then
100 passed to the YoutubeDL. The YoutubeDL processes this
101 information possibly downloading the video to the file system, among
102 other possible outcomes.
104 The type field determines the type of the result.
105 By far the most common value (and the default if _type is missing) is
106 "video", which indicates a single video.
108 For a video, the dictionaries must include the following fields:
110 id: Video identifier.
111 title: Video title, unescaped.
113 Additionally, it must contain either a formats entry or a url one:
115 formats: A list of dictionaries for each format available, ordered
116 from worst to best quality.
119 * url The mandatory URL representing the media:
120 for plain file media - HTTP URL of this file,
122 for HLS - URL of the M3U8 media playlist,
123 for HDS - URL of the F4M manifest,
125 - HTTP URL to plain file media (in case of
127 - URL of the MPD manifest or base URL
128 representing the media if MPD manifest
129 is parsed from a string (in case of
131 for MSS - URL of the ISM manifest.
133 The URL of the manifest file in case of
135 for HLS - URL of the M3U8 master playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH - URL of the MPD manifest,
138 for MSS - URL of the ISM manifest.
139 * ext Will be calculated from URL if missing
140 * format A human-readable description of the format
141 ("mp4 container with h264/opus").
142 Calculated from the format_id, width, height.
143 and format_note fields if missing.
144 * format_id A short description of the format
145 ("mp4_h264_opus" or "19").
146 Technically optional, but strongly recommended.
147 * format_note Additional info about the format
148 ("3D" or "DASH video")
149 * width Width of the video, if known
150 * height Height of the video, if known
151 * resolution Textual description of width and height
152 * dynamic_range The dynamic range of the video. One of:
153 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
154 * tbr Average bitrate of audio and video in KBit/s
155 * abr Average audio bitrate in KBit/s
156 * acodec Name of the audio codec in use
157 * asr Audio sampling rate in Hertz
158 * vbr Average video bitrate in KBit/s
160 * vcodec Name of the video codec in use
161 * container Name of the container format
162 * filesize The number of bytes, if known in advance
163 * filesize_approx An estimate for the number of bytes
164 * player_url SWF Player URL (used for rtmpdump).
165 * protocol The protocol that will be used for the actual
166 download, lower-case. One of "http", "https" or
167 one of the protocols defined in downloader.PROTOCOL_MAP
169 Base URL for fragments. Each fragment's path
170 value (if present) will be relative to
172 * fragments A list of fragments of a fragmented media.
173 Each fragment entry must contain either an url
174 or a path. If an url is present it should be
175 considered by a client. Otherwise both path and
176 fragment_base_url must be present. Here is
177 the list of all potential fields:
178 * "url" - fragment's URL
179 * "path" - fragment's path relative to
181 * "duration" (optional, int or float)
182 * "filesize" (optional, int)
183 * is_from_start Is a live format that can be downloaded
184 from the start. Boolean
185 * preference Order number of this format. If this field is
186 present and not None, the formats get sorted
187 by this field, regardless of all other values.
188 -1 for default (order by other properties),
189 -2 or smaller for less than default.
190 < -1000 to hide the format (if there is
191 another one which is strictly better)
192 * language Language code, e.g. "de" or "en-US".
193 * language_preference Is this in the language mentioned in
195 10 if it's what the URL is about,
196 -1 for default (don't know),
197 -10 otherwise, other values reserved for now.
198 * quality Order number of the video quality of this
199 format, irrespective of the file format.
200 -1 for default (order by other properties),
201 -2 or smaller for less than default.
202 * source_preference Order number for this video source
203 (quality takes higher priority)
204 -1 for default (order by other properties),
205 -2 or smaller for less than default.
206 * http_headers A dictionary of additional HTTP headers
207 to add to the request.
208 * stretched_ratio If given and not 1, indicates that the
209 video's pixels are not square.
210 width : height ratio as float.
211 * no_resume The server does not support resuming the
212 (HTTP or RTMP) download. Boolean.
213 * has_drm The format has DRM and cannot be downloaded. Boolean
214 * downloader_options A dictionary of downloader options as
215 described in FileDownloader
216 RTMP formats can also have the additional fields: page_url,
217 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
218 rtmp_protocol, rtmp_real_time
220 url: Final video URL.
221 ext: Video filename extension.
222 format: The video format, defaults to ext (used for --get-format)
223 player_url: SWF Player URL (used for rtmpdump).
225 The following fields are optional:
227 alt_title: A secondary title of the video.
228 display_id An alternative identifier for the video, not necessarily
229 unique, but available before title. Typically, id is
230 something like "4234987", title "Dancing naked mole rats",
231 and display_id "dancing-naked-mole-rats"
232 thumbnails: A list of dictionaries, with the following entries:
233 * "id" (optional, string) - Thumbnail format ID
235 * "preference" (optional, int) - quality of the image
236 * "width" (optional, int)
237 * "height" (optional, int)
238 * "resolution" (optional, string "{width}x{height}",
240 * "filesize" (optional, int)
241 thumbnail: Full URL to a video thumbnail image.
242 description: Full video description.
243 uploader: Full name of the video uploader.
244 license: License name the video is licensed under.
245 creator: The creator of the video.
246 timestamp: UNIX timestamp of the moment the video was uploaded
247 upload_date: Video upload date (YYYYMMDD).
248 If not explicitly set, calculated from timestamp
249 release_timestamp: UNIX timestamp of the moment the video was released.
250 If it is not clear whether to use timestamp or this, use the former
251 release_date: The date (YYYYMMDD) when the video was released.
252 If not explicitly set, calculated from release_timestamp
253 modified_timestamp: UNIX timestamp of the moment the video was last modified.
254 modified_date: The date (YYYYMMDD) when the video was last modified.
255 If not explicitly set, calculated from modified_timestamp
256 uploader_id: Nickname or id of the video uploader.
257 uploader_url: Full URL to a personal webpage of the video uploader.
258 channel: Full name of the channel the video is uploaded on.
259 Note that channel fields may or may not repeat uploader
260 fields. This depends on a particular extractor.
261 channel_id: Id of the channel.
262 channel_url: Full URL to a channel webpage.
263 location: Physical location where the video was filmed.
264 subtitles: The available subtitles as a dictionary in the format
265 {tag: subformats}. "tag" is usually a language code, and
266 "subformats" is a list sorted from lower to higher
267 preference, each element is a dictionary with the "ext"
269 * "data": The subtitles file contents
270 * "url": A URL pointing to the subtitles file
271 It can optionally also have:
272 * "name": Name or description of the subtitles
273 "ext" will be calculated from URL if missing
274 automatic_captions: Like 'subtitles'; contains automatically generated
275 captions instead of normal subtitles
276 duration: Length of the video in seconds, as an integer or float.
277 view_count: How many users have watched the video on the platform.
278 like_count: Number of positive ratings of the video
279 dislike_count: Number of negative ratings of the video
280 repost_count: Number of reposts of the video
281 average_rating: Average rating give by users, the scale used depends on the webpage
282 comment_count: Number of comments on the video
283 comments: A list of comments, each with one or more of the following
284 properties (all but one of text or html optional):
285 * "author" - human-readable name of the comment author
286 * "author_id" - user ID of the comment author
287 * "author_thumbnail" - The thumbnail of the comment author
289 * "html" - Comment as HTML
290 * "text" - Plain text of the comment
291 * "timestamp" - UNIX timestamp of comment
292 * "parent" - ID of the comment this one is replying to.
293 Set to "root" to indicate that this is a
294 comment to the original video.
295 * "like_count" - Number of positive ratings of the comment
296 * "dislike_count" - Number of negative ratings of the comment
297 * "is_favorited" - Whether the comment is marked as
298 favorite by the video uploader
299 * "author_is_uploader" - Whether the comment is made by
301 age_limit: Age restriction for the video, as an integer (years)
302 webpage_url: The URL to the video webpage, if given to yt-dlp it
303 should allow to get the same result again. (It will be set
304 by YoutubeDL if it's missing)
305 categories: A list of categories that the video falls in, for example
307 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
308 cast: A list of the video cast
309 is_live: True, False, or None (=unknown). Whether this video is a
310 live stream that goes on instead of a fixed-length video.
311 was_live: True, False, or None (=unknown). Whether this video was
312 originally a live stream.
313 live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown)
314 If absent, automatically set from is_live, was_live
315 start_time: Time in seconds where the reproduction should start, as
316 specified in the URL.
317 end_time: Time in seconds where the reproduction should end, as
318 specified in the URL.
319 chapters: A list of dictionaries, with the following entries:
320 * "start_time" - The start time of the chapter in seconds
321 * "end_time" - The end time of the chapter in seconds
322 * "title" (optional, string)
323 playable_in_embed: Whether this video is allowed to play in embedded
324 players on other sites. Can be True (=always allowed),
325 False (=never allowed), None (=unknown), or a string
326 specifying the criteria for embedability (Eg: 'whitelist')
327 availability: Under what condition the video is available. One of
328 'private', 'premium_only', 'subscriber_only', 'needs_auth',
329 'unlisted' or 'public'. Use 'InfoExtractor._availability'
331 __post_extractor: A function to be called just before the metadata is
332 written to either disk, logger or console. The function
333 must return a dict which will be added to the info_dict.
334 This is useful for additional information that is
335 time-consuming to extract. Note that the fields thus
336 extracted will not be available to output template and
337 match_filter. So, only "comments" and "comment_count" are
338 currently allowed to be extracted via this method.
340 The following fields should only be used when the video belongs to some logical
343 chapter: Name or title of the chapter the video belongs to.
344 chapter_number: Number of the chapter the video belongs to, as an integer.
345 chapter_id: Id of the chapter the video belongs to, as a unicode string.
347 The following fields should only be used when the video is an episode of some
348 series, programme or podcast:
350 series: Title of the series or programme the video episode belongs to.
351 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
352 season: Title of the season the video episode belongs to.
353 season_number: Number of the season the video episode belongs to, as an integer.
354 season_id: Id of the season the video episode belongs to, as a unicode string.
355 episode: Title of the video episode. Unlike mandatory video title field,
356 this field should denote the exact title of the video episode
357 without any kind of decoration.
358 episode_number: Number of the video episode within a season, as an integer.
359 episode_id: Id of the video episode, as a unicode string.
361 The following fields should only be used when the media is a track or a part of
364 track: Title of the track.
365 track_number: Number of the track within an album or a disc, as an integer.
366 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
368 artist: Artist(s) of the track.
369 genre: Genre(s) of the track.
370 album: Title of the album the track belongs to.
371 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
372 album_artist: List of all artists appeared on the album (e.g.
373 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
375 disc_number: Number of the disc or other physical medium the track belongs to,
377 release_year: Year (YYYY) when the album was released.
378 composer: Composer of the piece
380 Unless mentioned otherwise, the fields should be Unicode strings.
382 Unless mentioned otherwise, None is equivalent to absence of information.
385 _type "playlist" indicates multiple videos.
386 There must be a key "entries", which is a list, an iterable, or a PagedList
387 object, each element of which is a valid dictionary by this specification.
389 Additionally, playlists can have "id", "title", and any other relevant
390 attributes with the same semantics as videos (see above).
392 It can also have the following optional fields:
394 playlist_count: The total number of videos in a playlist. If not given,
395 YoutubeDL tries to calculate it from "entries"
398 _type "multi_video" indicates that there are multiple videos that
399 form a single show, for example multiple acts of an opera or TV episode.
400 It must have an entries key like a playlist and contain all the keys
401 required for a video at the same time.
404 _type "url" indicates that the video must be extracted from another
405 location, possibly by a different extractor. Its only required key is:
406 "url" - the next URL to extract.
407 The key "ie_key" can be set to the class name (minus the trailing "IE",
408 e.g. "Youtube") if the extractor class is known in advance.
409 Additionally, the dictionary may have any properties of the resolved entity
410 known in advance, for example "title" if the title of the referred video is
414 _type "url_transparent" entities have the same specification as "url", but
415 indicate that the given additional information is more precise than the one
416 associated with the resolved URL.
417 This is useful when a site employs a video service that hosts the video and
418 its technical metadata, but that video service does not embed a useful
419 title, description etc.
422 Subclasses of this one should re-define the _real_initialize() and
423 _real_extract() methods and define a _VALID_URL regexp.
424 Probably, they should also be added to the list of extractors.
426 Subclasses may also override suitable() if necessary, but ensure the function
427 signature is preserved and that this function imports everything it needs
428 (except other extractors), so that lazy_extractors works correctly
430 _GEO_BYPASS attribute may be set to False in order to disable
431 geo restriction bypass mechanisms for a particular extractor.
432 Though it won't disable explicit geo restriction bypass based on
433 country code provided with geo_bypass_country.
435 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
436 countries for this extractor. One of these countries will be used by
437 geo restriction bypass mechanism right away in order to bypass
438 geo restriction, of course, if the mechanism is not disabled.
440 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
441 IP blocks in CIDR notation for this extractor. One of these IP blocks
442 will be used by geo restriction bypass mechanism similarly
445 The _WORKING attribute should be set to False for broken IEs
446 in order to warn the users and skip the tests.
451 _x_forwarded_for_ip
= None
453 _GEO_COUNTRIES
= None
454 _GEO_IP_BLOCKS
= None
458 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
460 'Use --cookies-from-browser or --cookies for the authentication. '
461 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
462 'password': 'Use --username and --password, or --netrc to provide account credentials',
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader (a YoutubeDL instance).
    If a downloader is not passed during initialization,
    it must be set using "set_downloader()" before "extract()" is called"""
    # Fake X-Forwarded-For IP for geo-bypass; populated later by
    # _initialize_geo_bypass() / __maybe_fake_ip_and_retry().
    self._x_forwarded_for_ip = None
    # NOTE(review): appears to track messages already printed so they are
    # not repeated; reset again in initialize() — confirm against callers.
    self._printed_messages = set()
    self.set_downloader(downloader)
def _match_valid_url(cls, url):
    """Match *url* against this class's _VALID_URL regexp.

    Returns the re.Match object, or None if the URL does not match.
    The compiled pattern is cached per class in cls._VALID_URL_RE.
    """
    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        if '_VALID_URL' not in cls.__dict__:
            # _VALID_URL itself may be built lazily by the class
            cls._VALID_URL = cls._make_valid_url()
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cls._VALID_URL_RE.match(url)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # This function must import everything it needs (except other extractors),
    # so that lazy_extractors works correctly
    match = cls._match_valid_url(url)
    return match is not None
def _match_id(cls, url):
    """Return the 'id' group obtained by matching *url* against _VALID_URL."""
    mobj = cls._match_valid_url(url)
    return mobj.group('id')
def get_temp_id(cls, url):
    """Best-effort video id: the 'id' group of _VALID_URL, or None.

    The extracted source shows a dangling `except` clause with no `try:`;
    restore the try/except so a non-matching URL (AttributeError on the
    None match) or a pattern without an 'id' group (IndexError) yields None
    instead of propagating.
    """
    try:
        return cls._match_id(url)
    except (IndexError, AttributeError):
        return None
505 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Reset the printed-message de-duplication set for a fresh run.
    self._printed_messages = set()
    # Seed the geo-bypass machinery from the class-level defaults.
    self._initialize_geo_bypass({
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    })
    # NOTE(review): lines appear elided around this call in the extraction;
    # confirm whether it is guarded against repeated initialization.
    self._real_initialize()
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    HTTP requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    NOTE(review): several guard/return lines were lost in the extracted
    source; the control flow below restores the structure implied by the
    surviving comments ("use it right away" / "Otherwise ...").
    """
    if not self._x_forwarded_for_ip:

        # Geo bypass mechanism is explicitly disabled by user
        if not self.get_param('geo_bypass', True):
            return

        if not geo_bypass_context:
            geo_bypass_context = {}

        # Backward compatibility: previously _initialize_geo_bypass
        # expected a list of countries, some 3rd party code may still use
        # it this way
        if isinstance(geo_bypass_context, (list, tuple)):
            geo_bypass_context = {
                'countries': geo_bypass_context,
            }

        # The whole point of geo bypass mechanism is to fake IP
        # as X-Forwarded-For HTTP header based on some IP block or
        # country code.

        # Path 1: bypassing based on IP block in CIDR notation

        # Explicit IP block specified by user, use it right away
        # regardless of whether extractor is geo bypassable or not
        ip_block = self.get_param('geo_bypass_ip_block', None)

        # Otherwise use random IP block from geo bypass context but only
        # if extractor is known as geo bypassable
        if not ip_block:
            ip_blocks = geo_bypass_context.get('ip_blocks')
            if self._GEO_BYPASS and ip_blocks:
                ip_block = random.choice(ip_blocks)

        if ip_block:
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
            self._downloader.write_debug(
                '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip)
            return

        # Path 2: bypassing based on country code

        # Explicit country code specified by user, use it right away
        # regardless of whether extractor is geo bypassable or not
        country = self.get_param('geo_bypass_country', None)

        # Otherwise use random country code from geo bypass context but
        # only if extractor is known as geo bypassable
        if not country:
            countries = geo_bypass_context.get('countries')
            if self._GEO_BYPASS and countries:
                country = random.choice(countries)

        if country:
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
            self._downloader.write_debug(
                'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper()))
def extract(self, url):
    """Extracts URL information and returns it in list of dicts.

    NOTE(review): the try/retry scaffolding and the ExtractorError kwargs
    dict braces were lost in the extracted source; restored here. The
    inner loop allows one retry after a geo-restriction error if
    __maybe_fake_ip_and_retry() managed to set a fake X-Forwarded-For IP.
    """
    try:
        for _ in range(2):
            try:
                self.initialize()
                self.write_debug('Extracting URL: %s' % url)
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles')
                if (subtitles and 'live_chat' in subtitles
                        and 'no-live-chat' in self.get_param('compat_opts', [])):
                    del subtitles['live_chat']
                return ie_result
            except GeoRestrictedError as e:
                # Retry once with a faked IP if geo bypass is possible
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        # Re-raise the same exception type with enriched metadata
        kwargs = {
            'video_id': e.video_id or self.get_temp_id(url),
            'ie': self.IE_NAME,
            'tb': e.traceback or sys.exc_info()[2],
            'expected': e.expected,
            'cause': e.cause,
        }
        if hasattr(e, 'countries'):
            kwargs['countries'] = e.countries
        raise type(e)(e.msg, **kwargs)
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
def __maybe_fake_ip_and_retry(self, countries):
    """If geo bypass is enabled and no fake IP is set yet, fake an IP from
    a random country in *countries* and return True so extract() retries.

    NOTE(review): the condition tail and return statements were lost in
    the extracted source; restored here.
    """
    if (not self.get_param('geo_bypass_country', None)
            and self._GEO_BYPASS
            and self.get_param('geo_bypass', True)
            and not self._x_forwarded_for_ip
            and countries):
        country_code = random.choice(countries)
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
        if self._x_forwarded_for_ip:
            self.report_warning(
                'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                % (self._x_forwarded_for_ip, country_code.upper()))
            return True
    return False
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    # downloader is expected to be a YoutubeDL instance (or None);
    # all network and output operations go through it.
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses.

    The default implementation does nothing.
    """
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    The default implementation does nothing.
    """
671 """A string for getting the InfoExtractor with get_info_extractor"""
672 return cls
.__name
__[:-2]
676 return compat_str(type(self
).__name
__[:-2])
def __can_accept_status_code(err, expected_status):
    """Return True if the failed HTTP status carried by *err* was
    explicitly accepted via *expected_status*.

    expected_status may be None (no failed status is accepted), a callable
    receiving the status code, or an int / iterable of ints (see the
    _download_webpage docstring). The body of the None branch was lost in
    the extracted source; restored as `return False` per that contract.
    """
    assert isinstance(err, compat_urllib_error.HTTPError)
    if expected_status is None:
        return False
    elif callable(expected_status):
        return expected_status(err.code) is True
    else:
        return err.code in variadic(expected_status)
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
    """
    Return the response handle.

    See _download_webpage docstring for arguments specification.

    NOTE(review): several branch/return lines were lost in the extracted
    source; the structure below restores the flow implied by the surviving
    fragments (note printing, error acceptance, fatal vs. warning path).
    """
    if not self._downloader._first_webpage_request:
        # Optional politeness delay between requests
        sleep_interval = self.get_param('sleep_interval_requests') or 0
        if sleep_interval > 0:
            self.to_screen('Sleeping %s seconds ...' % sleep_interval)
            time.sleep(sleep_interval)
    self._downloader._first_webpage_request = False

    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        if 'X-Forwarded-For' not in headers:
            headers['X-Forwarded-For'] = self._x_forwarded_for_ip

    if isinstance(url_or_request, compat_urllib_request.Request):
        url_or_request = update_Request(
            url_or_request, data=data, headers=headers, query=query)
    else:
        if query:
            url_or_request = update_url_query(url_or_request, query)
        if data is not None or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers)
    try:
        return self._downloader.urlopen(url_or_request)
    except network_exceptions as err:
        if isinstance(err, compat_urllib_error.HTTPError):
            if self.__can_accept_status_code(err, expected_status):
                # Retain reference to error to prevent file object from
                # being closed before it can be read. Works around the
                # effects of <https://bugs.python.org/issue15002>
                # introduced in Python 3.4.1.
                err.fp._error = err
                return err.fp

        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
    """
    Return a tuple (page content as string, URL handle),
    or False on non-fatal failure.

    See _download_webpage docstring for arguments specification.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
    # _request_webpage returns False (instead of raising) only when
    # fatal=False; propagate that instead of crashing on a bool below.
    if urlh is False:
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (content, urlh)
def _guess_encoding_from_content(content_type, webpage_bytes):
    """Guess a page's text encoding.

    Order of precedence: charset in the Content-Type header, a <meta
    charset> tag in the first 1024 bytes, a UTF-16LE BOM, else UTF-8.
    The branch bodies and the final return were lost in the extracted
    source; restored here.
    """
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            # UTF-16 little-endian byte order mark
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'

    return encoding
def __check_blocked(self, content):
    """Raise ExtractorError if *content* is a known censorship/filtering
    block page (Websense, Indian censorship, Russian government blocklist).

    NOTE(review): the two inner `if` guards and the final `expected=True`
    argument were lost in the extracted source; restored here.
    """
    first_block = content[:512]
    if ('<title>Access to this site is blocked</title>' in content
            and 'Websense' in first_block):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    if '<title>The URL you requested has been blocked</title>' in first_block:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)
    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
            and 'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the response body from *urlh* and decode it to a string.

    Honors --dump-pages and --write-pages debugging options, guesses the
    encoding when not given, and checks the result for known block pages.
    NOTE(review): the encoding guard, the decode try/except and the final
    return were lost in the extracted source; restored here.
    """
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if not encoding:
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self.get_param('write_pages', False):
        basen = '%s_%s' % (video_id, urlh.geturl())
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            # Keep the filename within limits while staying unique-ish
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown codec name: fall back to UTF-8
        content = webpage_bytes.decode('utf-8', 'replace')

    self.__check_blocked(content)

    return content
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=5, encoding=None, data=None,
        headers={}, query={}, expected_status=None):
    """
    Return the data of the page as a string.

    Arguments:
    url_or_request -- plain text URL as a string or
        a compat_urllib_request.Requestobject
    video_id -- Video/playlist/item identifier (string)

    Keyword arguments:
    note -- note printed before downloading (string)
    errnote -- note printed in case of an error (string)
    fatal -- flag denoting whether error should be considered fatal,
        i.e. whether it should cause ExtractionError to be raised,
        otherwise a warning will be reported and extraction continued
    tries -- number of tries
    timeout -- sleep interval between tries
    encoding -- encoding for a page content decoding, guessed automatically
        when not explicitly specified
    data -- POST data (bytes)
    headers -- HTTP headers (dict)
    query -- URL query (dict)
    expected_status -- allows to accept failed HTTP requests (non 2xx
        status code) by explicitly specifying a set of accepted status
        codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
        Note that this argument does not affect success status codes (2xx)
        which are always accepted.
    """

    # NOTE(review): retry-loop scaffolding was lost in the extracted
    # source; restored around the surviving `while`/`except` fragments.
    success = False
    try_count = 0
    while success is False:
        try:
            res = self._download_webpage_handle(
                url_or_request, video_id, note, errnote, fatal,
                encoding=encoding, data=data, headers=headers, query=query,
                expected_status=expected_status)
            success = True
        except compat_http_client.IncompleteRead as e:
            try_count += 1
            if try_count >= tries:
                raise e
            self._sleep(timeout, video_id)
    if res is False:
        return res
    else:
        return res[0]
909 def _download_xml_handle(
910 self, url_or_request, video_id, note='Downloading XML
',
911 errnote='Unable to download XML
', transform_source=None,
912 fatal=True, encoding=None, data=None, headers={}, query={},
913 expected_status=None):
915 Return a tuple (xml as an compat_etree_Element, URL handle).
917 See _download_webpage docstring for arguments specification.
919 res = self._download_webpage_handle(
920 url_or_request, video_id, note, errnote, fatal=fatal,
921 encoding=encoding, data=data, headers=headers, query=query,
922 expected_status=expected_status)
925 xml_string, urlh = res
926 return self._parse_xml(
927 xml_string, video_id, transform_source=transform_source,
931 self, url_or_request, video_id,
932 note='Downloading XML
', errnote='Unable to download XML
',
933 transform_source=None, fatal=True, encoding=None,
934 data=None, headers={}, query={}, expected_status=None):
936 Return the xml as an compat_etree_Element.
938 See _download_webpage docstring for arguments specification.
940 res = self._download_xml_handle(
941 url_or_request, video_id, note=note, errnote=errnote,
942 transform_source=transform_source, fatal=fatal, encoding=encoding,
943 data=data, headers=headers, query=query,
944 expected_status=expected_status)
945 return res if res is False else res[0]
947 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
949 xml_string = transform_source(xml_string)
951 return compat_etree_fromstring(xml_string.encode('utf
-8'))
952 except compat_xml_parse_error as ve:
953 errmsg = '%s: Failed to parse XML
' % video_id
955 raise ExtractorError(errmsg, cause=ve)
957 self.report_warning(errmsg + str(ve))
959 def _download_json_handle(
960 self, url_or_request, video_id, note='Downloading JSON metadata
',
961 errnote='Unable to download JSON metadata
', transform_source=None,
962 fatal=True, encoding=None, data=None, headers={}, query={},
963 expected_status=None):
965 Return a tuple (JSON object, URL handle).
967 See _download_webpage docstring for arguments specification.
969 res = self._download_webpage_handle(
970 url_or_request, video_id, note, errnote, fatal=fatal,
971 encoding=encoding, data=data, headers=headers, query=query,
972 expected_status=expected_status)
975 json_string, urlh = res
976 return self._parse_json(
977 json_string, video_id, transform_source=transform_source,
981 self, url_or_request, video_id, note='Downloading JSON metadata
',
982 errnote='Unable to download JSON metadata
', transform_source=None,
983 fatal=True, encoding=None, data=None, headers={}, query={},
984 expected_status=None):
986 Return the JSON object as a dict.
988 See _download_webpage docstring for arguments specification.
990 res = self._download_json_handle(
991 url_or_request, video_id, note=note, errnote=errnote,
992 transform_source=transform_source, fatal=fatal, encoding=encoding,
993 data=data, headers=headers, query=query,
994 expected_status=expected_status)
995 return res if res is False else res[0]
997 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
999 json_string = transform_source(json_string)
1001 return json.loads(json_string)
1002 except ValueError as ve:
1003 errmsg = '%s: Failed to parse JSON
' % video_id
1005 raise ExtractorError(errmsg, cause=ve)
1007 self.report_warning(errmsg + str(ve))
1009 def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True):
1010 return self._parse_json(
1011 data[data.find('{'):data.rfind('}
') + 1],
1012 video_id, transform_source, fatal)
1014 def _download_socket_json_handle(
1015 self, url_or_request, video_id, note='Polling socket
',
1016 errnote='Unable to poll socket
', transform_source=None,
1017 fatal=True, encoding=None, data=None, headers={}, query={},
1018 expected_status=None):
1020 Return a tuple (JSON object, URL handle).
1022 See _download_webpage docstring for arguments specification.
1024 res = self._download_webpage_handle(
1025 url_or_request, video_id, note, errnote, fatal=fatal,
1026 encoding=encoding, data=data, headers=headers, query=query,
1027 expected_status=expected_status)
1031 return self._parse_socket_response_as_json(
1032 webpage, video_id, transform_source=transform_source,
1035 def _download_socket_json(
1036 self, url_or_request, video_id, note='Polling socket
',
1037 errnote='Unable to poll socket
', transform_source=None,
1038 fatal=True, encoding=None, data=None, headers={}, query={},
1039 expected_status=None):
1041 Return the JSON object as a dict.
1043 See _download_webpage docstring for arguments specification.
1045 res = self._download_socket_json_handle(
1046 url_or_request, video_id, note=note, errnote=errnote,
1047 transform_source=transform_source, fatal=fatal, encoding=encoding,
1048 data=data, headers=headers, query=query,
1049 expected_status=expected_status)
1050 return res if res is False else res[0]
1052 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1053 idstr = format_field(video_id, template='%s: ')
1054 msg = f'[{self.IE_NAME}
] {idstr}{msg}
'
1056 if f'WARNING
: {msg}
' in self._printed_messages:
1058 self._printed_messages.add(f'WARNING
: {msg}
')
1059 self._downloader.report_warning(msg, *args, **kwargs)
1061 def to_screen(self, msg, *args, **kwargs):
1062 """Print msg to screen, prefixing it with '[ie_name
]'"""
1063 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1065 def write_debug(self, msg, *args, **kwargs):
1066 self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs)
1068 def get_param(self, name, default=None, *args, **kwargs):
1069 if self._downloader:
1070 return self._downloader.params.get(name, default, *args, **kwargs)
1073 def report_drm(self, video_id, partial=False):
1074 self.raise_no_formats('This video
is DRM protected
', expected=True, video_id=video_id)
1076 def report_extraction(self, id_or_name):
1077 """Report information extraction."""
1078 self.to_screen('%s: Extracting information
' % id_or_name)
1080 def report_download_webpage(self, video_id):
1081 """Report webpage download."""
1082 self.to_screen('%s: Downloading webpage
' % video_id)
1084 def report_age_confirmation(self):
1085 """Report attempt to confirm age."""
1086 self.to_screen('Confirming age
')
1088 def report_login(self):
1089 """Report attempt to log in."""
1090 self.to_screen('Logging
in')
1092 def raise_login_required(
1093 self, msg='This video
is only available
for registered users
',
1094 metadata_available=False, method='any
'):
1095 if metadata_available and (
1096 self.get_param('ignore_no_formats_error
') or self.get_param('wait_for_video
')):
1097 self.report_warning(msg)
1098 if method is not None:
1099 msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
1100 raise ExtractorError(msg, expected=True)
1102 def raise_geo_restricted(
1103 self, msg='This video
is not available
from your location due to geo restriction
',
1104 countries=None, metadata_available=False):
1105 if metadata_available and (
1106 self.get_param('ignore_no_formats_error
') or self.get_param('wait_for_video
')):
1107 self.report_warning(msg)
1109 raise GeoRestrictedError(msg, countries=countries)
1111 def raise_no_formats(self, msg, expected=False, video_id=None):
1113 self.get_param('ignore_no_formats_error
') or self.get_param('wait_for_video
')):
1114 self.report_warning(msg, video_id)
1115 elif isinstance(msg, ExtractorError):
1118 raise ExtractorError(msg, expected=expected, video_id=video_id)
1120 # Methods for following #608
1122 def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
1123 """Returns a URL that points to a page that should be processed"""
1124 # TODO: ie should be the class used for getting the info
1125 video_info = {'_type
': 'url
',
1128 video_info.update(kwargs)
1129 if video_id is not None:
1130 video_info['id'] = video_id
1131 if video_title is not None:
1132 video_info['title
'] = video_title
1135 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
1137 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1139 return self.playlist_result(
1140 urls, playlist_id=playlist_id, playlist_title=playlist_title)
1143 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
1144 """Returns a playlist"""
1145 video_info = {'_type
': 'playlist
',
1147 video_info.update(kwargs)
1149 video_info['id'] = playlist_id
1151 video_info['title
'] = playlist_title
1152 if playlist_description is not None:
1153 video_info['description
'] = playlist_description
1156 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1158 Perform a regex search on the given string, using a single or a list of
1159 patterns returning the first matching group.
1160 In case of failure return a default value or raise a WARNING or a
1161 RegexNotFoundError, depending on fatal, specifying the field name.
1163 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1164 mobj = re.search(pattern, string, flags)
1167 mobj = re.search(p, string, flags)
1171 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1175 # return the first matching group
1176 return next(g for g in mobj.groups() if g is not None)
1177 elif isinstance(group, (list, tuple)):
1178 return tuple(mobj.group(g) for g in group)
1180 return mobj.group(group)
1181 elif default is not NO_DEFAULT:
1184 raise RegexNotFoundError('Unable to extract
%s' % _name)
1186 self.report_warning('unable to extract
%s' % _name + bug_reports_message())
1189 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1191 Like _search_regex, but strips HTML tags and unescapes entities.
1193 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1195 return clean_html(res).strip()
1199 def _get_netrc_login_info(self, netrc_machine=None):
1202 netrc_machine = netrc_machine or self._NETRC_MACHINE
1204 if self.get_param('usenetrc
', False):
1206 netrc_file = compat_expanduser(self.get_param('netrc_location
') or '~
')
1207 if os.path.isdir(netrc_file):
1208 netrc_file = os.path.join(netrc_file, '.netrc
')
1209 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1210 if info is not None:
1214 raise netrc.NetrcParseError(
1215 'No authenticators
for %s' % netrc_machine)
1216 except (IOError, netrc.NetrcParseError) as err:
1217 self.report_warning(
1218 'parsing
.netrc
: %s' % error_to_compat_str(err))
1220 return username, password
1222 def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None):
1224 Get the login info as (username, password)
1225 First look for the manually specified credentials using username_option
1226 and password_option as keys in params dictionary. If no such credentials
1227 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1229 If there's no info available
, return (None, None)
1232 # Attempt to use provided username and password or .netrc data
1233 username = self.get_param(username_option)
1234 if username is not None:
1235 password = self.get_param(password_option)
1237 username, password = self._get_netrc_login_info(netrc_machine)
1239 return username, password
1241 def _get_tfa_info(self, note='two-factor verification code'):
1243 Get the two
-factor authentication info
1244 TODO
- asking the user will be required
for sms
/phone verify
1245 currently just uses the command line option
1246 If there
's no info available, return None
1249 tfa = self.get_param('twofactor
')
1253 return compat_getpass('Type
%s and press
[Return
]: ' % note)
1255 # Helper functions for extracting OpenGraph info
1257 def _og_regexes(prop):
1258 content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))'
1259 property_re = (r'(?
:name|
property)=(?
:\'og
[:-]%(prop)s\'|
"og[:-]%(prop)s"|\s
*og
[:-]%(prop)s\b)'
1260 % {'prop': re.escape(prop)})
1261 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
1263 template % (property_re, content_re),
1264 template % (content_re, property_re),
1268 def _meta_regex(prop):
1269 return r'''(?isx)<meta
1270 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1271 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1273 def _og_search_property(self, prop, html, name=None, **kargs):
1274 prop = variadic(prop)
1276 name = 'OpenGraph
%s' % prop[0]
1279 og_regexes.extend(self._og_regexes(p))
1280 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1283 return unescapeHTML(escaped)
1285 def _og_search_thumbnail(self, html, **kargs):
1286 return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs)
1288 def _og_search_description(self, html, **kargs):
1289 return self._og_search_property('description
', html, fatal=False, **kargs)
1291 def _og_search_title(self, html, **kargs):
1292 return self._og_search_property('title
', html, **kargs)
1294 def _og_search_video_url(self, html, name='video url
', secure=True, **kargs):
1295 regexes = self._og_regexes('video
') + self._og_regexes('video
:url
')
1297 regexes = self._og_regexes('video
:secure_url
') + regexes
1298 return self._html_search_regex(regexes, html, name, **kargs)
1300 def _og_search_url(self, html, **kargs):
1301 return self._og_search_property('url
', html, **kargs)
1303 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1304 name = variadic(name)
1305 if display_name is None:
1306 display_name = name[0]
1307 return self._html_search_regex(
1308 [self._meta_regex(n) for n in name],
1309 html, display_name, fatal=fatal, group='content
', **kwargs)
1311 def _dc_search_uploader(self, html):
1312 return self._html_search_meta('dc
.creator
', html, 'uploader
')
1314 def _rta_search(self, html):
1315 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1316 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
1317 r' content
="RTA-5042-1996-1400-1577-RTA"',
1322 def _media_rating_search(self, html):
1323 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1324 rating = self._html_search_meta('rating
', html)
1336 return RATING_TABLE.get(rating.lower())
1338 def _family_friendly_search(self, html):
1339 # See http://schema.org/VideoObject
1340 family_friendly = self._html_search_meta(
1341 'isFamilyFriendly
', html, default=None)
1343 if not family_friendly:
1352 return RATING_TABLE.get(family_friendly.lower())
1354 def _twitter_search_player(self, html):
1355 return self._html_search_meta('twitter
:player
', html,
1356 'twitter card player
')
1358 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1359 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1360 default = kwargs.get('default
', NO_DEFAULT)
1361 # JSON-LD may be malformed and thus `fatal` should be respected.
1362 # At the same time `default` may be passed that assumes `fatal=False`
1363 # for _search_regex. Let's simulate the same behavior here
as well
.
1364 fatal
= kwargs
.get('fatal', True) if default
is NO_DEFAULT
else False
1366 for mobj
in json_ld_list
:
1367 json_ld_item
= self
._parse
_json
(
1368 mobj
.group('json_ld'), video_id
, fatal
=fatal
)
1369 if not json_ld_item
:
1371 if isinstance(json_ld_item
, dict):
1372 json_ld
.append(json_ld_item
)
1373 elif isinstance(json_ld_item
, (list, tuple)):
1374 json_ld
.extend(json_ld_item
)
1376 json_ld
= self
._json
_ld
(json_ld
, video_id
, fatal
=fatal
, expected_type
=expected_type
)
1379 if default
is not NO_DEFAULT
:
1382 raise RegexNotFoundError('Unable to extract JSON-LD')
1384 self
.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        """Convert parsed JSON-LD data (str, dict, or list of dicts) into an info dict.

        Only items whose '@type' matches `expected_type` (when given) are
        considered. Returns a dict with all None values filtered out.
        """
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            # Normalize a single object to a one-element list
            json_ld = [json_ld]

        # Maps schema.org interaction @type suffixes to info-dict count kinds
        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def extract_interaction_type(e):
            # interactionType may be a plain string or a nested object
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            # Populate info['<kind>_count'] from InteractionCounter entries
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not isinstance(is_e, dict):
                    continue
                if is_e.get('@type') != 'InteractionCounter':
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            # Build info['chapters'] from 'hasPart' Clip entries; missing
            # boundaries are filled in from the neighbouring chapters.
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in e.get('hasPart', []) if part.get('@type') == 'Clip']
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            assert e['@type'] == 'VideoObject'
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': url_or_none(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
                'filesize': float_or_none(e.get('contentSize')),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in json_ld:
                # Top-level entries must declare a JSON-LD '@context'
                if at_top_level and '@context' not in e:
                    continue
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    # '@graph' wrapper: recurse into its members
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    break
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if item_type in ('TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Movie':
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif item_type in ('Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                elif item_type == 'VideoObject':
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                # An embedded VideoObject may also appear under a 'video' key
                video = e.get('video')
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break
        traverse_json_ld(json_ld)

        return dict((k, v) for k, v in info.items() if v is not None)
1546 def _search_nextjs_data(self
, webpage
, video_id
, **kw
):
1547 return self
._parse
_json
(
1549 r
'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^
>]*>([^
<]+)</script
>',
1550 webpage, 'next
.js data
', **kw),
1553 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__
'):
1554 ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
1555 # not all website do this, but it can be changed
1556 # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source
1557 rectx = re.escape(context_name)
1558 js, arg_keys, arg_vals = self._search_regex(
1559 (r'<script
>window\
.%s=\
(function\
((?P
<arg_keys
>.*?
)\
)\{return\s(?P<js>\{.*?\}
)\
}\
((?P
<arg_vals
>.+?
)\
)\
);?
</script
>' % rectx,
1560 r'%s\
(.*?\
(function\
((?P
<arg_keys
>.*?
)\
)\{return\s(?P<js>\{.*?\}
)\
}\
((?P
<arg_vals
>.*?
)\
)' % rectx),
1561 webpage, context_name, group=['js
', 'arg_keys
', 'arg_vals
'])
1563 args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
1565 for key, val in args.items():
1566 if val in ('undefined
', 'void
0'):
1569 return self._parse_json(js_to_json(js, args), video_id)['data
'][0]
1572 def _hidden_inputs(html):
1573 html = re.sub(r'<!--(?
:(?
!<!--).)*-->', '', html)
1575 for input in re.findall(r'(?i
)(<input[^
>]+>)', html):
1576 attrs = extract_attributes(input)
1579 if attrs.get('type') not in ('hidden
', 'submit
'):
1581 name = attrs.get('name
') or attrs.get('id')
1582 value = attrs.get('value
')
1583 if name and value is not None:
1584 hidden_inputs[name] = value
1585 return hidden_inputs
1587 def _form_hidden_inputs(self, form_id, html):
1588 form = self._search_regex(
1589 r'(?
is)<form
[^
>]+?
id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1590 html, '%s form' % form_id, group='form')
1591 return self._hidden_inputs(form)
        # Grammar for one token of a format-sort string: an optional '+'
        # (reverse), a field name, and an optional ':'/'~' limit suffix.
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr',
                   'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases

        # Sort order emulating youtube-dl's behaviour
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        # Per-field sorting configuration; see _get_field_setting for the
        # defaults that are filled in for missing keys.
        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # For compatibility with youtube-dl
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res'},
            'resolution': {'type': 'alias', 'field': 'res'},
            'extension': {'type': 'alias', 'field': 'ext'},
            'bitrate': {'type': 'alias', 'field': 'br'},
            'total_bitrate': {'type': 'alias', 'field': 'tbr'},
            'video_bitrate': {'type': 'alias', 'field': 'vbr'},
            'audio_bitrate': {'type': 'alias', 'field': 'abr'},
            'framerate': {'type': 'alias', 'field': 'fps'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'filesize_estimate': {'type': 'alias', 'field': 'size'},
            'samplerate': {'type': 'alias', 'field': 'asr'},
            'video_ext': {'type': 'alias', 'field': 'vext'},
            'audio_ext': {'type': 'alias', 'field': 'aext'},
            'video_codec': {'type': 'alias', 'field': 'vcodec'},
            'audio_codec': {'type': 'alias', 'field': 'acodec'},
            'video': {'type': 'alias', 'field': 'hasvid'},
            'has_video': {'type': 'alias', 'field': 'hasvid'},
            'audio': {'type': 'alias', 'field': 'hasaud'},
            'has_audio': {'type': 'alias', 'field': 'hasaud'},
            'extractor': {'type': 'alias', 'field': 'ie_pref'},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref'},
        }
1677 def __init__(self, ie, field_preference):
1679 self.ydl = ie._downloader
1680 self.evaluate_params(self.ydl.params, field_preference)
1681 if ie.get_param('verbose'):
1682 self.print_verbose_info(self.ydl.write_debug)
1684 def _get_field_setting(self, field, key):
1685 if field not in self.settings:
1686 if key in ('forced', 'priority'):
1688 self.ydl.deprecation_warning(
1689 f'Using arbitrary fields ({field}) for format sorting is deprecated '
1690 'and may be removed in a future version')
1691 self.settings[field] = {}
1692 propObj = self.settings[field]
1693 if key not in propObj:
1694 type = propObj.get('type')
1696 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1697 elif key == 'convert':
1698 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1700 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1701 propObj[key] = default
1704 def _resolve_field_value(self, field, value, convertNone=False):
1709 value = value.lower()
1710 conversion = self._get_field_setting(field, 'convert')
1711 if conversion == 'ignore':
1713 if conversion == 'string':
1715 elif conversion == 'float_none':
1716 return float_or_none(value)
1717 elif conversion == 'bytes':
1718 return FileDownloader.parse_bytes(value)
1719 elif conversion == 'order':
1720 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1721 use_regex = self._get_field_setting(field, 'regex')
1722 list_length = len(order_list)
1723 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1724 if use_regex and value is not None:
1725 for i, regex in enumerate(order_list):
1726 if regex and re.match(regex, value):
1727 return list_length - i
1728 return list_length - empty_pos # not in list
1729 else: # not regex or value = None
1730 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1732 if value.isnumeric():
1735 self.settings[field]['convert'] = 'string'
        def evaluate_params(self, params, sort_extractor):
            """Parse user-, extractor- and default-supplied sort strings into self._order/settings."""
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                # Register a single sort field; first occurrence wins
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            # Forced fields always apply; priority fields apply unless the
            # user requested format_sort_force; then user > extractor > default.
            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    if alias not in ('format_id', 'preference', 'language_preference'):
                        self.ydl.deprecation_warning(
                            f'Format sorting alias {alias} is deprecated '
                            f'and may be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)
1795 def print_verbose_info(self, write_debug):
1797 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
1798 if self._sort_extractor:
1799 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
1800 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
1801 '+' if self._get_field_setting(field, 'reverse') else '', field,
1802 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
1803 self._get_field_setting(field, 'limit_text'),
1804 self._get_field_setting(field, 'limit'))
1805 if self._get_field_setting(field, 'limit_text') is not None else '')
1806 for field in self._order if self._get_field_setting(field, 'visible')]))
1808 def _calculate_field_preference_from_value(self, format, field, type, value):
1809 reverse = self._get_field_setting(field, 'reverse')
1810 closest = self._get_field_setting(field, 'closest')
1811 limit = self._get_field_setting(field, 'limit')
1813 if type == 'extractor':
1814 maximum = self._get_field_setting(field, 'max')
1815 if value is None or (maximum is not None and value >= maximum):
1817 elif type == 'boolean':
1818 in_list = self._get_field_setting(field, 'in_list')
1819 not_in_list = self._get_field_setting(field, 'not_in_list')
1820 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1821 elif type == 'ordered':
1822 value = self._resolve_field_value(field, value, True)
1824 # try to convert to number
1825 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
1826 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
1830 return ((-10, 0) if value is None
1831 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1832 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1833 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1834 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1835 else (-1, value, 0))
1837 def _calculate_field_preference(self, format, field):
1838 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1839 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1840 if type == 'multiple':
1841 type = 'field' # Only 'field' is allowed in multiple for now
1842 actual_fields = self._get_field_setting(field, 'field')
1844 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1846 value = get_value(field)
1847 return self._calculate_field_preference_from_value(format, field, type, value)
1849 def calculate_preference(self, format):
1850 # Determine missing protocol
1851 if not format.get('protocol'):
1852 format['protocol'] = determine_protocol(format)
1854 # Determine missing ext
1855 if not format.get('ext') and 'url' in format:
1856 format['ext'] = determine_ext(format['url'])
1857 if format.get('vcodec') == 'none':
1858 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
1859 format['video_ext'] = 'none'
1861 format['video_ext'] = format['ext']
1862 format['audio_ext'] = 'none'
1863 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
1864 # format['preference'] = -1000
1866 # Determine missing bitrates
1867 if format.get('tbr') is None:
1868 if format.get('vbr') is not None and format.get('abr') is not None:
1869 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
1871 if format.get('vcodec') != 'none' and format.get('vbr') is None:
1872 format['vbr'] = format.get('tbr') - format.get('abr', 0)
1873 if format.get('acodec') != 'none' and format.get('abr') is None:
1874 format['abr'] = format.get('tbr') - format.get('vbr', 0)
1876 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1878 def _sort_formats(self, formats, field_preference=[]):
1881 format_sort = self.FormatSort(self, field_preference)
1882 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1884 def _check_formats(self, formats, video_id):
1886 formats[:] = filter(
1887 lambda f: self._is_valid_url(
1889 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1893 def _remove_duplicate_formats(formats):
1897 if f['url'] not in format_urls:
1898 format_urls.add(f['url'])
1899 unique_formats.append(f)
1900 formats[:] = unique_formats
1902 def _is_valid_url(self, url, video_id, item='video', headers={}):
1903 url = self._proto_relative_url(url, scheme='http:')
1904 # For now assume non HTTP(S) URLs always valid
1905 if not (url.startswith('http://') or url.startswith('https://')):
1908 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1910 except ExtractorError as e:
1912 '%s: %s URL is invalid, skipping: %s'
1913 % (video_id, item, error_to_compat_str(e.cause)))
1916 def http_scheme(self):
1917 """ Either "http
:" or "https
:", depending on the user's preferences """
1920 if self.get_param('prefer_insecure', False)
1923 def _proto_relative_url(self, url, scheme=None):
1926 if url.startswith('//'):
1928 scheme = self.http_scheme()
1933 def _sleep(self, timeout, video_id, msg_template=None):
1934 if msg_template is None:
1935 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1936 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1940 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1941 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1942 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1943 manifest = self._download_xml(
1944 manifest_url, video_id, 'Downloading f4m manifest',
1945 'Unable to download f4m manifest',
1946 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1947 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1948 transform_source=transform_source,
1949 fatal=fatal, data=data, headers=headers, query=query)
1951 if manifest is False:
1954 return self._parse_f4m_formats(
1955 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1956 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1958 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1959 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1960 fatal=True, m3u8_id=None):
1961 if not isinstance(manifest, compat_etree_Element) and not fatal:
1964 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1965 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1966 if akamai_pv is not None and ';' in akamai_pv.text:
1967 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1968 if playerVerificationChallenge.strip() != '':
1972 manifest_version = '1.0'
1973 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1975 manifest_version = '2.0'
1976 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1977 # Remove unsupported DRM protected media from final formats
1978 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1979 media_nodes = remove_encrypted_media(media_nodes)
1983 manifest_base_url = get_base_url(manifest)
1985 bootstrap_info = xpath_element(
1986 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1987 'bootstrap info', default=None)
1990 mime_type = xpath_text(
1991 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1992 'base URL', default=None)
1993 if mime_type and mime_type.startswith('audio/'):
1996 for i, media_el in enumerate(media_nodes):
1997 tbr = int_or_none(media_el.attrib.get('bitrate'))
1998 width = int_or_none(media_el.attrib.get('width'))
1999 height = int_or_none(media_el.attrib.get('height'))
2000 format_id = join_nonempty(f4m_id, tbr or i)
2001 # If <bootstrapInfo> is present, the specified f4m is a
2002 # stream-level manifest, and only set-level manifests may refer to
2003 # external resources. See section 11.4 and section 4 of F4M spec
2004 if bootstrap_info is None:
2006 # @href is introduced in 2.0, see section 11.6 of F4M spec
2007 if manifest_version == '2.0':
2008 media_url = media_el.attrib.get('href')
2009 if media_url is None:
2010 media_url = media_el.attrib.get('url')
2014 media_url if media_url.startswith('http://') or media_url.startswith('https://')
2015 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
2016 # If media_url is itself a f4m manifest do the recursive extraction
2017 # since bitrates in parent manifest (this one) and media_url manifest
2018 # may differ leading to inability to resolve the format by requested
2019 # bitrate in f4m downloader
2020 ext = determine_ext(manifest_url)
2022 f4m_formats = self._extract_f4m_formats(
2023 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2024 transform_source=transform_source, fatal=fatal)
2025 # Sometimes stream-level manifest contains single media entry that
2026 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
2027 # At the same time parent's media entry in set-level manifest may
2028 # contain it. We will copy it from parent in such cases.
2029 if len(f4m_formats) == 1:
2032 'tbr': f.get('tbr') or tbr,
2033 'width': f.get('width') or width,
2034 'height': f.get('height') or height,
2035 'format_id': f.get('format_id') if not tbr else format_id,
2038 formats.extend(f4m_formats)
2041 formats.extend(self._extract_m3u8_formats(
2042 manifest_url, video_id, 'mp4', preference=preference,
2043 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
2046 'format_id': format_id,
2047 'url': manifest_url,
2048 'manifest_url': manifest_url,
2049 'ext': 'flv' if bootstrap_info is not None else None,
2055 'preference': preference,
2060 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
2062 'format_id': join_nonempty(m3u8_id, 'meta'),
2066 'preference': preference - 100 if preference else -100,
2068 'resolution': 'multiple',
2069 'format_note': 'Quality selection URL',
2072 def _report_ignoring_subs(self, name):
2073 self.report_warning(bug_reports_message(
2074 f'Ignoring subtitle tracks found in the {name} manifest; '
2075 'if any subtitle tracks are missing,'
2078 def _extract_m3u8_formats(self, *args, **kwargs):
2079 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2081 self._report_ignoring_subs('HLS')
2084 def _extract_m3u8_formats_and_subtitles(
2085 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2086 preference=None, quality=None, m3u8_id=None, note=None,
2087 errnote=None, fatal=True, live=False, data=None, headers={},
2090 res = self._download_webpage_handle(
2092 note='Downloading m3u8 information' if note is None else note,
2093 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2094 fatal=fatal, data=data, headers=headers, query=query)
2099 m3u8_doc, urlh = res
2100 m3u8_url = urlh.geturl()
2102 return self._parse_m3u8_formats_and_subtitles(
2103 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2104 preference=preference, quality=quality, m3u8_id=m3u8_id,
2105 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2106 headers=headers, query=query, video_id=video_id)
2108 def _parse_m3u8_formats_and_subtitles(
2109 self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
2110 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2111 errnote=None, fatal=True, data=None, headers={}, query={},
2113 formats, subtitles = [], {}
2115 has_drm = re.search('|'.join([
2116 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2117 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd
://', # Apple FairPlay
2120 def format_url(url):
2121 return url if re.match(r'^https?
://', url) else compat_urlparse.urljoin(m3u8_url, url)
2123 if self.get_param('hls_split_discontinuity
', False):
2124 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2126 if not manifest_url:
2128 m3u8_doc = self._download_webpage(
2129 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2130 note=False, errnote='Failed to download m3u8 playlist information
')
2131 if m3u8_doc is False:
2133 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2136 def _extract_m3u8_playlist_indices(*args
, **kwargs
):
2140 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2141 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2142 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2144 # We should try extracting formats only from master playlists [1, 4.3.4],
2145 # i.e. playlists that describe available qualities. On the other hand
2146 # media playlists [1, 4.3.3] should be returned as is since they contain
2147 # just the media without qualities renditions.
2148 # Fortunately, master playlist can be easily distinguished from media
2149 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2150 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2151 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2152 # media playlist and MUST NOT appear in master playlist thus we can
2153 # clearly detect media playlist with this criterion.
2155 if '#EXT-X-TARGETDURATION' in m3u8_doc
: # media playlist, return as is
2157 'format_id': join_nonempty(m3u8_id
, idx
),
2158 'format_index': idx
,
2161 'protocol': entry_protocol
,
2162 'preference': preference
,
2165 } for idx
in _extract_m3u8_playlist_indices(m3u8_doc
=m3u8_doc
)]
2167 return formats
, subtitles
2170 last_stream_inf
= {}
2172 def extract_media(x_media_line
):
2173 media
= parse_m3u8_attributes(x_media_line
)
2174 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2175 media_type
, group_id
, name
= media
.get('TYPE'), media
.get('GROUP-ID'), media
.get('NAME')
2176 if not (media_type
and group_id
and name
):
2178 groups
.setdefault(group_id
, []).append(media
)
2179 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2180 if media_type
== 'SUBTITLES':
2181 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2182 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2183 # However, lack of URI has been spotted in the wild.
2184 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2185 if not media
.get('URI'):
2187 url
= format_url(media
['URI'])
2190 'ext': determine_ext(url
),
2192 if sub_info
['ext'] == 'm3u8':
2193 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2194 # files may contain is WebVTT:
2195 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2196 sub_info
['ext'] = 'vtt'
2197 sub_info
['protocol'] = 'm3u8_native'
2198 lang
= media
.get('LANGUAGE') or 'und'
2199 subtitles
.setdefault(lang
, []).append(sub_info
)
2200 if media_type
not in ('VIDEO', 'AUDIO'):
2202 media_url
= media
.get('URI')
2204 manifest_url
= format_url(media_url
)
2206 'format_id': join_nonempty(m3u8_id
, group_id
, name
, idx
),
2207 'format_note': name
,
2208 'format_index': idx
,
2209 'url': manifest_url
,
2210 'manifest_url': m3u8_url
,
2211 'language': media
.get('LANGUAGE'),
2213 'protocol': entry_protocol
,
2214 'preference': preference
,
2216 'vcodec': 'none' if media_type
== 'AUDIO' else None,
2217 } for idx
in _extract_m3u8_playlist_indices(manifest_url
))
2219 def build_stream_name():
2220 # Despite specification does not mention NAME attribute for
2221 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2222 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2223 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2224 stream_name
= last_stream_inf
.get('NAME')
2227 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2228 # from corresponding rendition group
2229 stream_group_id
= last_stream_inf
.get('VIDEO')
2230 if not stream_group_id
:
2232 stream_group
= groups
.get(stream_group_id
)
2233 if not stream_group
:
2234 return stream_group_id
2235 rendition
= stream_group
[0]
2236 return rendition
.get('NAME') or stream_group_id
2238 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2239 # chance to detect video only formats when EXT-X-STREAM-INF tags
2240 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2241 for line
in m3u8_doc
.splitlines():
2242 if line
.startswith('#EXT-X-MEDIA:'):
2245 for line
in m3u8_doc
.splitlines():
2246 if line
.startswith('#EXT-X-STREAM-INF:'):
2247 last_stream_inf
= parse_m3u8_attributes(line
)
2248 elif line
.startswith('#') or not line
.strip():
2251 tbr
= float_or_none(
2252 last_stream_inf
.get('AVERAGE-BANDWIDTH')
2253 or last_stream_inf
.get('BANDWIDTH'), scale
=1000)
2254 manifest_url
= format_url(line
.strip())
2256 for idx
in _extract_m3u8_playlist_indices(manifest_url
):
2257 format_id
= [m3u8_id
, None, idx
]
2258 # Bandwidth of live streams may differ over time thus making
2259 # format_id unpredictable. So it's better to keep provided
2262 stream_name
= build_stream_name()
2263 format_id
[1] = stream_name
or '%d' % (tbr
or len(formats
))
2265 'format_id': join_nonempty(*format_id
),
2266 'format_index': idx
,
2267 'url': manifest_url
,
2268 'manifest_url': m3u8_url
,
2271 'fps': float_or_none(last_stream_inf
.get('FRAME-RATE')),
2272 'protocol': entry_protocol
,
2273 'preference': preference
,
2276 resolution
= last_stream_inf
.get('RESOLUTION')
2278 mobj
= re
.search(r
'(?P<width>\d+)[xX](?P<height>\d+)', resolution
)
2280 f
['width'] = int(mobj
.group('width'))
2281 f
['height'] = int(mobj
.group('height'))
2282 # Unified Streaming Platform
2284 r
'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f
['url'])
2286 abr
, vbr
= mobj
.groups()
2287 abr
, vbr
= float_or_none(abr
, 1000), float_or_none(vbr
, 1000)
2292 codecs
= parse_codecs(last_stream_inf
.get('CODECS'))
2294 audio_group_id
= last_stream_inf
.get('AUDIO')
2295 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2296 # references a rendition group MUST have a CODECS attribute.
2297 # However, this is not always respected, for example, [2]
2298 # contains EXT-X-STREAM-INF tag which references AUDIO
2299 # rendition group but does not have CODECS and despite
2300 # referencing an audio group it represents a complete
2301 # (with audio and video) format. So, for such cases we will
2302 # ignore references to rendition groups and treat them
2303 # as complete formats.
2304 if audio_group_id
and codecs
and f
.get('vcodec') != 'none':
2305 audio_group
= groups
.get(audio_group_id
)
2306 if audio_group
and audio_group
[0].get('URI'):
2307 # TODO: update acodec for audio only formats with
2309 f
['acodec'] = 'none'
2310 if not f
.get('ext'):
2311 f
['ext'] = 'm4a' if f
.get('vcodec') == 'none' else 'mp4'
2315 progressive_uri
= last_stream_inf
.get('PROGRESSIVE-URI')
2318 del http_f
['manifest_url']
2320 'format_id': f
['format_id'].replace('hls-', 'http-'),
2322 'url': progressive_uri
,
2324 formats
.append(http_f
)
2326 last_stream_inf
= {}
2327 return formats
, subtitles
2329 def _extract_m3u8_vod_duration(
2330 self
, m3u8_vod_url
, video_id
, note
=None, errnote
=None, data
=None, headers
={}, query={}
):
2332 m3u8_vod
= self
._download
_webpage
(
2333 m3u8_vod_url
, video_id
,
2334 note
='Downloading m3u8 VOD manifest' if note
is None else note
,
2335 errnote
='Failed to download VOD manifest' if errnote
is None else errnote
,
2336 fatal
=False, data
=data
, headers
=headers
, query
=query
)
2338 return self
._parse
_m
3u8_vod
_duration
(m3u8_vod
or '', video_id
)
2340 def _parse_m3u8_vod_duration(self
, m3u8_vod
, video_id
):
2341 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod
:
2345 float(line
[len('#EXTINF:'):].split(',')[0])
2346 for line
in m3u8_vod
.splitlines() if line
.startswith('#EXTINF:'))) or None
2349 def _xpath_ns(path
, namespace
=None):
2353 for c
in path
.split('/'):
2354 if not c
or c
== '.':
2357 out
.append('{%s}%s' % (namespace
, c
))
2358 return '/'.join(out
)
2360 def _extract_smil_formats_and_subtitles(self
, smil_url
, video_id
, fatal
=True, f4m_params
=None, transform_source
=None):
2361 smil
= self
._download
_smil
(smil_url
, video_id
, fatal
=fatal
, transform_source
=transform_source
)
2367 namespace
= self
._parse
_smil
_namespace
(smil
)
2369 fmts
= self
._parse
_smil
_formats
(
2370 smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
)
2371 subs
= self
._parse
_smil
_subtitles
(
2372 smil
, namespace
=namespace
)
2376 def _extract_smil_formats(self
, *args
, **kwargs
):
2377 fmts
, subs
= self
._extract
_smil
_formats
_and
_subtitles
(*args
, **kwargs
)
2379 self
._report
_ignoring
_subs
('SMIL')
2382 def _extract_smil_info(self
, smil_url
, video_id
, fatal
=True, f4m_params
=None):
2383 smil
= self
._download
_smil
(smil_url
, video_id
, fatal
=fatal
)
2386 return self
._parse
_smil
(smil
, smil_url
, video_id
, f4m_params
=f4m_params
)
2388 def _download_smil(self
, smil_url
, video_id
, fatal
=True, transform_source
=None):
2389 return self
._download
_xml
(
2390 smil_url
, video_id
, 'Downloading SMIL file',
2391 'Unable to download SMIL file', fatal
=fatal
, transform_source
=transform_source
)
2393 def _parse_smil(self
, smil
, smil_url
, video_id
, f4m_params
=None):
2394 namespace
= self
._parse
_smil
_namespace
(smil
)
2396 formats
= self
._parse
_smil
_formats
(
2397 smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
)
2398 subtitles
= self
._parse
_smil
_subtitles
(smil
, namespace
=namespace
)
2400 video_id
= os
.path
.splitext(url_basename(smil_url
))[0]
2404 for meta
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)):
2405 name
= meta
.attrib
.get('name')
2406 content
= meta
.attrib
.get('content')
2407 if not name
or not content
:
2409 if not title
and name
== 'title':
2411 elif not description
and name
in ('description', 'abstract'):
2412 description
= content
2413 elif not upload_date
and name
== 'date':
2414 upload_date
= unified_strdate(content
)
2417 'id': image
.get('type'),
2418 'url': image
.get('src'),
2419 'width': int_or_none(image
.get('width')),
2420 'height': int_or_none(image
.get('height')),
2421 } for image
in smil
.findall(self
._xpath
_ns
('.//image', namespace
)) if image
.get('src')]
2425 'title': title
or video_id
,
2426 'description': description
,
2427 'upload_date': upload_date
,
2428 'thumbnails': thumbnails
,
2430 'subtitles': subtitles
,
2433 def _parse_smil_namespace(self
, smil
):
2434 return self
._search
_regex
(
2435 r
'(?i)^{([^}]+)?}smil$', smil
.tag
, 'namespace', default
=None)
2437 def _parse_smil_formats(self
, smil
, smil_url
, video_id
, namespace
=None, f4m_params
=None, transform_rtmp_url
=None):
2439 for meta
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)):
2440 b
= meta
.get('base') or meta
.get('httpBase')
2452 media
= smil
.findall(self
._xpath
_ns
('.//video', namespace
)) + smil
.findall(self
._xpath
_ns
('.//audio', namespace
))
2453 for medium
in media
:
2454 src
= medium
.get('src')
2455 if not src
or src
in srcs
:
2459 bitrate
= float_or_none(medium
.get('system-bitrate') or medium
.get('systemBitrate'), 1000)
2460 filesize
= int_or_none(medium
.get('size') or medium
.get('fileSize'))
2461 width
= int_or_none(medium
.get('width'))
2462 height
= int_or_none(medium
.get('height'))
2463 proto
= medium
.get('proto')
2464 ext
= medium
.get('ext')
2465 src_ext
= determine_ext(src
)
2466 streamer
= medium
.get('streamer') or base
2468 if proto
== 'rtmp' or streamer
.startswith('rtmp'):
2474 'format_id': 'rtmp-%d' % (rtmp_count
if bitrate
is None else bitrate
),
2476 'filesize': filesize
,
2480 if transform_rtmp_url
:
2481 streamer
, src
= transform_rtmp_url(streamer
, src
)
2482 formats
[-1].update({
2488 src_url
= src
if src
.startswith('http') else compat_urlparse
.urljoin(base
, src
)
2489 src_url
= src_url
.strip()
2491 if proto
== 'm3u8' or src_ext
== 'm3u8':
2492 m3u8_formats
= self
._extract
_m
3u8_formats
(
2493 src_url
, video_id
, ext
or 'mp4', m3u8_id
='hls', fatal
=False)
2494 if len(m3u8_formats
) == 1:
2496 m3u8_formats
[0].update({
2497 'format_id': 'hls-%d' % (m3u8_count
if bitrate
is None else bitrate
),
2502 formats
.extend(m3u8_formats
)
2503 elif src_ext
== 'f4m':
2508 'plugin': 'flowplayer-3.2.0.1',
2510 f4m_url
+= '&' if '?' in f4m_url
else '?'
2511 f4m_url
+= compat_urllib_parse_urlencode(f4m_params
)
2512 formats
.extend(self
._extract
_f
4m
_formats
(f4m_url
, video_id
, f4m_id
='hds', fatal
=False))
2513 elif src_ext
== 'mpd':
2514 formats
.extend(self
._extract
_mpd
_formats
(
2515 src_url
, video_id
, mpd_id
='dash', fatal
=False))
2516 elif re
.search(r
'\.ism/[Mm]anifest', src_url
):
2517 formats
.extend(self
._extract
_ism
_formats
(
2518 src_url
, video_id
, ism_id
='mss', fatal
=False))
2519 elif src_url
.startswith('http') and self
._is
_valid
_url
(src
, video_id
):
2523 'ext': ext
or src_ext
or 'flv',
2524 'format_id': 'http-%d' % (bitrate
or http_count
),
2526 'filesize': filesize
,
2531 for medium
in smil
.findall(self
._xpath
_ns
('.//imagestream', namespace
)):
2532 src
= medium
.get('src')
2533 if not src
or src
in srcs
:
2539 'format_id': 'imagestream-%d' % (imgs_count
),
2541 'ext': mimetype2ext(medium
.get('type')),
2544 'width': int_or_none(medium
.get('width')),
2545 'height': int_or_none(medium
.get('height')),
2546 'format_note': 'SMIL storyboards',
2551 def _parse_smil_subtitles(self
, smil
, namespace
=None, subtitles_lang
='en'):
2554 for num
, textstream
in enumerate(smil
.findall(self
._xpath
_ns
('.//textstream', namespace
))):
2555 src
= textstream
.get('src')
2556 if not src
or src
in urls
:
2559 ext
= textstream
.get('ext') or mimetype2ext(textstream
.get('type')) or determine_ext(src
)
2560 lang
= textstream
.get('systemLanguage') or textstream
.get('systemLanguageName') or textstream
.get('lang') or subtitles_lang
2561 subtitles
.setdefault(lang
, []).append({
2567 def _extract_xspf_playlist(self
, xspf_url
, playlist_id
, fatal
=True):
2568 xspf
= self
._download
_xml
(
2569 xspf_url
, playlist_id
, 'Downloading xpsf playlist',
2570 'Unable to download xspf manifest', fatal
=fatal
)
2573 return self
._parse
_xspf
(
2574 xspf
, playlist_id
, xspf_url
=xspf_url
,
2575 xspf_base_url
=base_url(xspf_url
))
2577 def _parse_xspf(self
, xspf_doc
, playlist_id
, xspf_url
=None, xspf_base_url
=None):
2579 'xspf': 'http://xspf.org/ns/0/',
2580 's1': 'http://static.streamone.nl/player/ns/0',
2584 for track
in xspf_doc
.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP
)):
2586 track
, xpath_with_ns('./xspf:title', NS_MAP
), 'title', default
=playlist_id
)
2587 description
= xpath_text(
2588 track
, xpath_with_ns('./xspf:annotation', NS_MAP
), 'description')
2589 thumbnail
= xpath_text(
2590 track
, xpath_with_ns('./xspf:image', NS_MAP
), 'thumbnail')
2591 duration
= float_or_none(
2592 xpath_text(track
, xpath_with_ns('./xspf:duration', NS_MAP
), 'duration'), 1000)
2595 for location
in track
.findall(xpath_with_ns('./xspf:location', NS_MAP
)):
2596 format_url
= urljoin(xspf_base_url
, location
.text
)
2601 'manifest_url': xspf_url
,
2602 'format_id': location
.get(xpath_with_ns('s1:label', NS_MAP
)),
2603 'width': int_or_none(location
.get(xpath_with_ns('s1:width', NS_MAP
))),
2604 'height': int_or_none(location
.get(xpath_with_ns('s1:height', NS_MAP
))),
2606 self
._sort
_formats
(formats
)
2611 'description': description
,
2612 'thumbnail': thumbnail
,
2613 'duration': duration
,
2618 def _extract_mpd_formats(self
, *args
, **kwargs
):
2619 fmts
, subs
= self
._extract
_mpd
_formats
_and
_subtitles
(*args
, **kwargs
)
2621 self
._report
_ignoring
_subs
('DASH')
2624 def _extract_mpd_formats_and_subtitles(
2625 self
, mpd_url
, video_id
, mpd_id
=None, note
=None, errnote
=None,
2626 fatal
=True, data
=None, headers
={}, query={}
):
2627 res
= self
._download
_xml
_handle
(
2629 note
='Downloading MPD manifest' if note
is None else note
,
2630 errnote
='Failed to download MPD manifest' if errnote
is None else errnote
,
2631 fatal
=fatal
, data
=data
, headers
=headers
, query
=query
)
2637 mpd_base_url
= base_url(urlh
.geturl())
2639 return self
._parse
_mpd
_formats
_and
_subtitles
(
2640 mpd_doc
, mpd_id
, mpd_base_url
, mpd_url
)
2642 def _parse_mpd_formats(self
, *args
, **kwargs
):
2643 fmts
, subs
= self
._parse
_mpd
_formats
_and
_subtitles
(*args
, **kwargs
)
2645 self
._report
_ignoring
_subs
('DASH')
2648 def _parse_mpd_formats_and_subtitles(
2649 self
, mpd_doc
, mpd_id
=None, mpd_base_url
='', mpd_url
=None):
2651 Parse formats from MPD manifest.
2653 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2654 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2655 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2657 if not self
.get_param('dynamic_mpd', True):
2658 if mpd_doc
.get('type') == 'dynamic':
2661 namespace
= self
._search
_regex
(r
'(?i)^{([^}]+)?}MPD$', mpd_doc
.tag
, 'namespace', default
=None)
2664 return self
._xpath
_ns
(path
, namespace
)
2666 def is_drm_protected(element
):
2667 return element
.find(_add_ns('ContentProtection')) is not None
2669 def extract_multisegment_info(element
, ms_parent_info
):
2670 ms_info
= ms_parent_info
.copy()
2672 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2673 # common attributes and elements. We will only extract relevant
2675 def extract_common(source
):
2676 segment_timeline
= source
.find(_add_ns('SegmentTimeline'))
2677 if segment_timeline
is not None:
2678 s_e
= segment_timeline
.findall(_add_ns('S'))
2680 ms_info
['total_number'] = 0
2683 r
= int(s
.get('r', 0))
2684 ms_info
['total_number'] += 1 + r
2685 ms_info
['s'].append({
2686 't': int(s
.get('t', 0)),
2687 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2688 'd': int(s
.attrib
['d']),
2691 start_number
= source
.get('startNumber')
2693 ms_info
['start_number'] = int(start_number
)
2694 timescale
= source
.get('timescale')
2696 ms_info
['timescale'] = int(timescale
)
2697 segment_duration
= source
.get('duration')
2698 if segment_duration
:
2699 ms_info
['segment_duration'] = float(segment_duration
)
2701 def extract_Initialization(source
):
2702 initialization
= source
.find(_add_ns('Initialization'))
2703 if initialization
is not None:
2704 ms_info
['initialization_url'] = initialization
.attrib
['sourceURL']
2706 segment_list
= element
.find(_add_ns('SegmentList'))
2707 if segment_list
is not None:
2708 extract_common(segment_list
)
2709 extract_Initialization(segment_list
)
2710 segment_urls_e
= segment_list
.findall(_add_ns('SegmentURL'))
2712 ms_info
['segment_urls'] = [segment
.attrib
['media'] for segment
in segment_urls_e
]
2714 segment_template
= element
.find(_add_ns('SegmentTemplate'))
2715 if segment_template
is not None:
2716 extract_common(segment_template
)
2717 media
= segment_template
.get('media')
2719 ms_info
['media'] = media
2720 initialization
= segment_template
.get('initialization')
2722 ms_info
['initialization'] = initialization
2724 extract_Initialization(segment_template
)
2727 mpd_duration
= parse_duration(mpd_doc
.get('mediaPresentationDuration'))
2728 formats
, subtitles
= [], {}
2729 stream_numbers
= collections
.defaultdict(int)
2730 for period
in mpd_doc
.findall(_add_ns('Period')):
2731 period_duration
= parse_duration(period
.get('duration')) or mpd_duration
2732 period_ms_info
= extract_multisegment_info(period
, {
2736 for adaptation_set
in period
.findall(_add_ns('AdaptationSet')):
2737 adaption_set_ms_info
= extract_multisegment_info(adaptation_set
, period_ms_info
)
2738 for representation
in adaptation_set
.findall(_add_ns('Representation')):
2739 representation_attrib
= adaptation_set
.attrib
.copy()
2740 representation_attrib
.update(representation
.attrib
)
2741 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2742 mime_type
= representation_attrib
['mimeType']
2743 content_type
= representation_attrib
.get('contentType', mime_type
.split('/')[0])
2745 codecs
= parse_codecs(representation_attrib
.get('codecs', ''))
2746 if content_type
not in ('video', 'audio', 'text'):
2747 if mime_type
== 'image/jpeg':
2748 content_type
= mime_type
2749 elif codecs
['vcodec'] != 'none':
2750 content_type
= 'video'
2751 elif codecs
['acodec'] != 'none':
2752 content_type
= 'audio'
2753 elif codecs
.get('tcodec', 'none') != 'none':
2754 content_type
= 'text'
2755 elif mimetype2ext(mime_type
) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2756 content_type
= 'text'
2758 self
.report_warning('Unknown MIME type %s in DASH manifest' % mime_type
)
2762 for element
in (representation
, adaptation_set
, period
, mpd_doc
):
2763 base_url_e
= element
.find(_add_ns('BaseURL'))
2764 if base_url_e
is not None:
2765 base_url
= base_url_e
.text
+ base_url
2766 if re
.match(r
'^https?://', base_url
):
2768 if mpd_base_url
and base_url
.startswith('/'):
2769 base_url
= compat_urlparse
.urljoin(mpd_base_url
, base_url
)
2770 elif mpd_base_url
and not re
.match(r
'^https?://', base_url
):
2771 if not mpd_base_url
.endswith('/'):
2773 base_url
= mpd_base_url
+ base_url
2774 representation_id
= representation_attrib
.get('id')
2775 lang
= representation_attrib
.get('lang')
2776 url_el
= representation
.find(_add_ns('BaseURL'))
2777 filesize
= int_or_none(url_el
.attrib
.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el
is not None else None)
2778 bandwidth
= int_or_none(representation_attrib
.get('bandwidth'))
2779 if representation_id
is not None:
2780 format_id
= representation_id
2782 format_id
= content_type
2784 format_id
= mpd_id
+ '-' + format_id
2785 if content_type
in ('video', 'audio'):
2787 'format_id': format_id
,
2788 'manifest_url': mpd_url
,
2789 'ext': mimetype2ext(mime_type
),
2790 'width': int_or_none(representation_attrib
.get('width')),
2791 'height': int_or_none(representation_attrib
.get('height')),
2792 'tbr': float_or_none(bandwidth
, 1000),
2793 'asr': int_or_none(representation_attrib
.get('audioSamplingRate')),
2794 'fps': int_or_none(representation_attrib
.get('frameRate')),
2795 'language': lang
if lang
not in ('mul', 'und', 'zxx', 'mis') else None,
2796 'format_note': 'DASH %s' % content_type
,
2797 'filesize': filesize
,
2798 'container': mimetype2ext(mime_type
) + '_dash',
2801 elif content_type
== 'text':
2803 'ext': mimetype2ext(mime_type
),
2804 'manifest_url': mpd_url
,
2805 'filesize': filesize
,
2807 elif content_type
== 'image/jpeg':
2808 # See test case in VikiIE
2809 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2811 'format_id': format_id
,
2813 'manifest_url': mpd_url
,
2814 'format_note': 'DASH storyboards (jpeg)',
2818 if is_drm_protected(adaptation_set
) or is_drm_protected(representation
):
2820 representation_ms_info
= extract_multisegment_info(representation
, adaption_set_ms_info
)
2822 def prepare_template(template_name
, identifiers
):
2823 tmpl
= representation_ms_info
[template_name
]
2824 # First of, % characters outside $...$ templates
2825 # must be escaped by doubling for proper processing
2826 # by % operator string formatting used further (see
2827 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2833 in_template
= not in_template
2834 elif c
== '%' and not in_template
:
2836 # Next, $...$ templates are translated to their
2837 # %(...) counterparts to be used with % operator
2838 if representation_id
is not None:
2839 t
= t
.replace('$RepresentationID$', representation_id
)
2840 t
= re
.sub(r
'\$(%s)\$' % '|'.join(identifiers
), r
'%(\1)d', t
)
2841 t
= re
.sub(r
'\$(%s)%%([^$]+)\$' % '|'.join(identifiers
), r
'%(\1)\2', t
)
2842 t
.replace('$$', '$')
2845 # @initialization is a regular template like @media one
2846 # so it should be handled just the same way (see
2847 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2848 if 'initialization' in representation_ms_info
:
2849 initialization_template
= prepare_template(
2851 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2852 # $Time$ shall not be included for @initialization thus
2853 # only $Bandwidth$ remains
2855 representation_ms_info
['initialization_url'] = initialization_template
% {
2856 'Bandwidth': bandwidth
,
2859 def location_key(location
):
2860 return 'url' if re
.match(r
'^https?://', location
) else 'path'
2862 if 'segment_urls' not in representation_ms_info
and 'media' in representation_ms_info
:
2864 media_template
= prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2865 media_location_key
= location_key(media_template
)
2867 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2868 # can't be used at the same time
2869 if '%(Number' in media_template
and 's' not in representation_ms_info
:
2870 segment_duration
= None
2871 if 'total_number' not in representation_ms_info
and 'segment_duration' in representation_ms_info
:
2872 segment_duration
= float_or_none(representation_ms_info
['segment_duration'], representation_ms_info
['timescale'])
2873 representation_ms_info
['total_number'] = int(math
.ceil(float(period_duration
) / segment_duration
))
2874 representation_ms_info
['fragments'] = [{
2875 media_location_key
: media_template
% {
2876 'Number': segment_number
,
2877 'Bandwidth': bandwidth
,
2879 'duration': segment_duration
,
2880 } for segment_number
in range(
2881 representation_ms_info
['start_number'],
2882 representation_ms_info
['total_number'] + representation_ms_info
['start_number'])]
2884 # $Number*$ or $Time$ in media template with S list available
2885 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2886 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2887 representation_ms_info
['fragments'] = []
2890 segment_number
= representation_ms_info
['start_number']
2892 def add_segment_url():
2893 segment_url
= media_template
% {
2894 'Time': segment_time
,
2895 'Bandwidth': bandwidth
,
2896 'Number': segment_number
,
2898 representation_ms_info
['fragments'].append({
2899 media_location_key
: segment_url
,
2900 'duration': float_or_none(segment_d
, representation_ms_info
['timescale']),
2903 for num
, s
in enumerate(representation_ms_info
['s']):
2904 segment_time
= s
.get('t') or segment_time
2908 for r
in range(s
.get('r', 0)):
2909 segment_time
+= segment_d
2912 segment_time
+= segment_d
2913 elif 'segment_urls' in representation_ms_info
and 's' in representation_ms_info
:
2915 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2916 # or any YouTube dashsegments video
2919 timescale
= representation_ms_info
['timescale']
2920 for s
in representation_ms_info
['s']:
2921 duration
= float_or_none(s
['d'], timescale
)
2922 for r
in range(s
.get('r', 0) + 1):
2923 segment_uri
= representation_ms_info
['segment_urls'][segment_index
]
2925 location_key(segment_uri
): segment_uri
,
2926 'duration': duration
,
2929 representation_ms_info
['fragments'] = fragments
2930 elif 'segment_urls' in representation_ms_info
:
2931 # Segment URLs with no SegmentTimeline
2932 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2933 # https://github.com/ytdl-org/youtube-dl/pull/14844
2935 segment_duration
= float_or_none(
2936 representation_ms_info
['segment_duration'],
2937 representation_ms_info
['timescale']) if 'segment_duration' in representation_ms_info
else None
2938 for segment_url
in representation_ms_info
['segment_urls']:
2940 location_key(segment_url
): segment_url
,
2942 if segment_duration
:
2943 fragment
['duration'] = segment_duration
2944 fragments
.append(fragment
)
2945 representation_ms_info
['fragments'] = fragments
2946 # If there is a fragments key available then we correctly recognized fragmented media.
2947 # Otherwise we will assume unfragmented media with direct access. Technically, such
2948 # assumption is not necessarily correct since we may simply have no support for
2949 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2950 if 'fragments' in representation_ms_info
:
2952 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2953 'url': mpd_url
or base_url
,
2954 'fragment_base_url': base_url
,
2956 'protocol': 'http_dash_segments' if mime_type
!= 'image/jpeg' else 'mhtml',
2958 if 'initialization_url' in representation_ms_info
:
2959 initialization_url
= representation_ms_info
['initialization_url']
2960 if not f
.get('url'):
2961 f
['url'] = initialization_url
2962 f
['fragments'].append({location_key(initialization_url): initialization_url}
)
2963 f
['fragments'].extend(representation_ms_info
['fragments'])
2965 # Assuming direct URL to unfragmented media.
2967 if content_type
in ('video', 'audio', 'image/jpeg'):
2968 f
['manifest_stream_number'] = stream_numbers
[f
['url']]
2969 stream_numbers
[f
['url']] += 1
2971 elif content_type
== 'text':
2972 subtitles
.setdefault(lang
or 'und', []).append(f
)
2974 return formats
, subtitles
2976 def _extract_ism_formats(self
, *args
, **kwargs
):
2977 fmts
, subs
= self
._extract
_ism
_formats
_and
_subtitles
(*args
, **kwargs
)
2979 self
._report
_ignoring
_subs
('ISM')
2982 def _extract_ism_formats_and_subtitles(self
, ism_url
, video_id
, ism_id
=None, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query={}
):
2983 res
= self
._download
_xml
_handle
(
2985 note
='Downloading ISM manifest' if note
is None else note
,
2986 errnote
='Failed to download ISM manifest' if errnote
is None else errnote
,
2987 fatal
=fatal
, data
=data
, headers
=headers
, query
=query
)
2994 return self
._parse
_ism
_formats
_and
_subtitles
(ism_doc
, urlh
.geturl(), ism_id
)
2996 def _parse_ism_formats_and_subtitles(self
, ism_doc
, ism_url
, ism_id
=None):
2998 Parse formats from ISM manifest.
3000 1. [MS-SSTR]: Smooth Streaming Protocol,
3001 https://msdn.microsoft.com/en-us/library/ff469518.aspx
3003 if ism_doc
.get('IsLive') == 'TRUE':
3006 duration
= int(ism_doc
.attrib
['Duration'])
3007 timescale
= int_or_none(ism_doc
.get('TimeScale')) or 10000000
3011 for stream
in ism_doc
.findall('StreamIndex'):
3012 stream_type
= stream
.get('Type')
3013 if stream_type
not in ('video', 'audio', 'text'):
3015 url_pattern
= stream
.attrib
['Url']
3016 stream_timescale
= int_or_none(stream
.get('TimeScale')) or timescale
3017 stream_name
= stream
.get('Name')
3018 stream_language
= stream
.get('Language', 'und')
3019 for track
in stream
.findall('QualityLevel'):
3020 fourcc
= track
.get('FourCC') or ('AACL' if track
.get('AudioTag') == '255' else None)
3021 # TODO: add support for WVC1 and WMAP
3022 if fourcc
not in ('H264', 'AVC1', 'AACL', 'TTML'):
3023 self
.report_warning('%s is not a supported codec' % fourcc
)
3025 tbr
= int(track
.attrib
['Bitrate']) // 1000
3026 # [1] does not mention Width and Height attributes. However,
3027 # they're often present while MaxWidth and MaxHeight are
3028 # missing, so should be used as fallbacks
3029 width
= int_or_none(track
.get('MaxWidth') or track
.get('Width'))
3030 height
= int_or_none(track
.get('MaxHeight') or track
.get('Height'))
3031 sampling_rate
= int_or_none(track
.get('SamplingRate'))
3033 track_url_pattern
= re
.sub(r
'{[Bb]itrate}', track
.attrib
['Bitrate'], url_pattern
)
3034 track_url_pattern
= compat_urlparse
.urljoin(ism_url
, track_url_pattern
)
3040 stream_fragments
= stream
.findall('c')
3041 for stream_fragment_index
, stream_fragment
in enumerate(stream_fragments
):
3042 fragment_ctx
['time'] = int_or_none(stream_fragment
.get('t')) or fragment_ctx
['time']
3043 fragment_repeat
= int_or_none(stream_fragment
.get('r')) or 1
3044 fragment_ctx
['duration'] = int_or_none(stream_fragment
.get('d'))
3045 if not fragment_ctx
['duration']:
3047 next_fragment_time
= int(stream_fragment
[stream_fragment_index
+ 1].attrib
['t'])
3049 next_fragment_time
= duration
3050 fragment_ctx
['duration'] = (next_fragment_time
- fragment_ctx
['time']) / fragment_repeat
3051 for _
in range(fragment_repeat
):
3053 'url': re
.sub(r
'{start[ _]time}', compat_str(fragment_ctx
['time']), track_url_pattern
),
3054 'duration': fragment_ctx
['duration'] / stream_timescale
,
3056 fragment_ctx
['time'] += fragment_ctx
['duration']
3058 if stream_type
== 'text':
3059 subtitles
.setdefault(stream_language
, []).append({
3063 'manifest_url': ism_url
,
3064 'fragments': fragments
,
3065 '_download_params': {
3066 'stream_type': stream_type
,
3067 'duration': duration
,
3068 'timescale': stream_timescale
,
3070 'language': stream_language
,
3071 'codec_private_data': track
.get('CodecPrivateData'),
3074 elif stream_type
in ('video', 'audio'):
3076 'format_id': join_nonempty(ism_id
, stream_name
, tbr
),
3078 'manifest_url': ism_url
,
3079 'ext': 'ismv' if stream_type
== 'video' else 'isma',
3083 'asr': sampling_rate
,
3084 'vcodec': 'none' if stream_type
== 'audio' else fourcc
,
3085 'acodec': 'none' if stream_type
== 'video' else fourcc
,
3087 'fragments': fragments
,
3088 'has_drm': ism_doc
.find('Protection') is not None,
3089 '_download_params': {
3090 'stream_type': stream_type
,
3091 'duration': duration
,
3092 'timescale': stream_timescale
,
3093 'width': width
or 0,
3094 'height': height
or 0,
3096 'language': stream_language
,
3097 'codec_private_data': track
.get('CodecPrivateData'),
3098 'sampling_rate': sampling_rate
,
3099 'channels': int_or_none(track
.get('Channels', 2)),
3100 'bits_per_sample': int_or_none(track
.get('BitsPerSample', 16)),
3101 'nal_unit_length_field': int_or_none(track
.get('NALUnitLengthField', 4)),
3104 return formats
, subtitles
3106 def _parse_html5_media_entries(self
, base_url
, webpage
, video_id
, m3u8_id
=None, m3u8_entry_protocol
='m3u8', mpd_id
=None, preference
=None, quality
=None):
3107 def absolute_url(item_url
):
3108 return urljoin(base_url
, item_url
)
3110 def parse_content_type(content_type
):
3111 if not content_type
:
3113 ctr
= re
.search(r
'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type
)
3115 mimetype
, codecs
= ctr
.groups()
3116 f
= parse_codecs(codecs
)
3117 f
['ext'] = mimetype2ext(mimetype
)
3121 def _media_formats(src
, cur_media_type
, type_info
={}):
3122 full_url
= absolute_url(src
)
3123 ext
= type_info
.get('ext') or determine_ext(full_url
)
3125 is_plain_url
= False
3126 formats
= self
._extract
_m
3u8_formats
(
3127 full_url
, video_id
, ext
='mp4',
3128 entry_protocol
=m3u8_entry_protocol
, m3u8_id
=m3u8_id
,
3129 preference
=preference
, quality
=quality
, fatal
=False)
3131 is_plain_url
= False
3132 formats
= self
._extract
_mpd
_formats
(
3133 full_url
, video_id
, mpd_id
=mpd_id
, fatal
=False)
3138 'vcodec': 'none' if cur_media_type
== 'audio' else None,
3140 return is_plain_url
, formats
3143 # amp-video and amp-audio are very similar to their HTML5 counterparts
3144 # so we wll include them right here (see
3145 # https://www.ampproject.org/docs/reference/components/amp-video)
3146 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3147 _MEDIA_TAG_NAME_RE
= r
'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3148 media_tags
= [(media_tag
, media_tag_name
, media_type
, '')
3149 for media_tag
, media_tag_name
, media_type
3150 in re
.findall(r
'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE
, webpage
)]
3151 media_tags
.extend(re
.findall(
3152 # We only allow video|audio followed by a whitespace or '>'.
3153 # Allowing more characters may end up in significant slow down (see
3154 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
3155 # http://www.porntrex.com/maps/videositemap.xml).
3156 r
'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE
, webpage
))
3157 for media_tag
, _
, media_type
, media_content
in media_tags
:
3162 media_attributes
= extract_attributes(media_tag
)
3163 src
= strip_or_none(media_attributes
.get('src'))
3165 _
, formats
= _media_formats(src
, media_type
)
3166 media_info
['formats'].extend(formats
)
3167 media_info
['thumbnail'] = absolute_url(media_attributes
.get('poster'))
3169 for source_tag
in re
.findall(r
'<source[^>]+>', media_content
):
3170 s_attr
= extract_attributes(source_tag
)
3171 # data-video-src and data-src are non standard but seen
3172 # several times in the wild
3173 src
= strip_or_none(dict_get(s_attr
, ('src', 'data-video-src', 'data-src')))
3176 f
= parse_content_type(s_attr
.get('type'))
3177 is_plain_url
, formats
= _media_formats(src
, media_type
, f
)
3179 # width, height, res, label and title attributes are
3180 # all not standard but seen several times in the wild
3183 for lbl
in ('label', 'title')
3184 if str_or_none(s_attr
.get(lbl
))
3186 width
= int_or_none(s_attr
.get('width'))
3187 height
= (int_or_none(s_attr
.get('height'))
3188 or int_or_none(s_attr
.get('res')))
3189 if not width
or not height
:
3191 resolution
= parse_resolution(lbl
)
3194 width
= width
or resolution
.get('width')
3195 height
= height
or resolution
.get('height')
3197 tbr
= parse_bitrate(lbl
)
3206 'format_id': s_attr
.get('label') or s_attr
.get('title'),
3208 f
.update(formats
[0])
3209 media_info
['formats'].append(f
)
3211 media_info
['formats'].extend(formats
)
3212 for track_tag
in re
.findall(r
'<track[^>]+>', media_content
):
3213 track_attributes
= extract_attributes(track_tag
)
3214 kind
= track_attributes
.get('kind')
3215 if not kind
or kind
in ('subtitles', 'captions'):
3216 src
= strip_or_none(track_attributes
.get('src'))
3219 lang
= track_attributes
.get('srclang') or track_attributes
.get('lang') or track_attributes
.get('label')
3220 media_info
['subtitles'].setdefault(lang
, []).append({
3221 'url': absolute_url(src
),
3223 for f
in media_info
['formats']:
3224 f
.setdefault('http_headers', {})['Referer'] = base_url
3225 if media_info
['formats'] or media_info
['subtitles']:
3226 entries
.append(media_info
)
3229 def _extract_akamai_formats(self
, *args
, **kwargs
):
3230 fmts
, subs
= self
._extract
_akamai
_formats
_and
_subtitles
(*args
, **kwargs
)
3232 self
._report
_ignoring
_subs
('akamai')
3235 def _extract_akamai_formats_and_subtitles(self
, manifest_url
, video_id
, hosts
={}):
3236 signed
= 'hdnea=' in manifest_url
3238 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3239 manifest_url
= re
.sub(
3240 r
'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3241 '', manifest_url
).strip('?')
3246 hdcore_sign
= 'hdcore=3.7.0'
3247 f4m_url
= re
.sub(r
'(https?://[^/]+)/i/', r
'\1/z/', manifest_url
).replace('/master.m3u8', '/manifest.f4m')
3248 hds_host
= hosts
.get('hds')
3250 f4m_url
= re
.sub(r
'(https?://)[^/]+', r
'\1' + hds_host
, f4m_url
)
3251 if 'hdcore=' not in f4m_url
:
3252 f4m_url
+= ('&' if '?' in f4m_url
else '?') + hdcore_sign
3253 f4m_formats
= self
._extract
_f
4m
_formats
(
3254 f4m_url
, video_id
, f4m_id
='hds', fatal
=False)
3255 for entry
in f4m_formats
:
3256 entry
.update({'extra_param_to_segment_url': hdcore_sign}
)
3257 formats
.extend(f4m_formats
)
3259 m3u8_url
= re
.sub(r
'(https?://[^/]+)/z/', r
'\1/i/', manifest_url
).replace('/manifest.f4m', '/master.m3u8')
3260 hls_host
= hosts
.get('hls')
3262 m3u8_url
= re
.sub(r
'(https?://)[^/]+', r
'\1' + hls_host
, m3u8_url
)
3263 m3u8_formats
, m3u8_subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
3264 m3u8_url
, video_id
, 'mp4', 'm3u8_native',
3265 m3u8_id
='hls', fatal
=False)
3266 formats
.extend(m3u8_formats
)
3267 subtitles
= self
._merge
_subtitles
(subtitles
, m3u8_subtitles
)
3269 http_host
= hosts
.get('http')
3270 if http_host
and m3u8_formats
and not signed
:
3271 REPL_REGEX
= r
'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3272 qualities
= re
.match(REPL_REGEX
, m3u8_url
).group(2).split(',')
3273 qualities_length
= len(qualities
)
3274 if len(m3u8_formats
) in (qualities_length
, qualities_length
+ 1):
3276 for f
in m3u8_formats
:
3277 if f
['vcodec'] != 'none':
3278 for protocol
in ('http', 'https'):
3280 del http_f
['manifest_url']
3282 REPL_REGEX
, protocol
+ r
'://%s/\g<1>%s\3' % (http_host
, qualities
[i
]), f
['url'])
3284 'format_id': http_f
['format_id'].replace('hls-', protocol
+ '-'),
3286 'protocol': protocol
,
3288 formats
.append(http_f
)
3291 return formats
, subtitles
3293 def _extract_wowza_formats(self
, url
, video_id
, m3u8_entry_protocol
='m3u8_native', skip_protocols
=[]):
3294 query
= compat_urlparse
.urlparse(url
).query
3295 url
= re
.sub(r
'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url
)
3297 r
'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url
)
3298 url_base
= mobj
.group('url')
3299 http_base_url
= '%s%s:%s' % ('http', mobj
.group('s') or '', url_base
)
3302 def manifest_url(manifest
):
3303 m_url
= '%s/%s' % (http_base_url
, manifest
)
3305 m_url
+= '?%s' % query
3308 if 'm3u8' not in skip_protocols
:
3309 formats
.extend(self
._extract
_m
3u8_formats
(
3310 manifest_url('playlist.m3u8'), video_id
, 'mp4',
3311 m3u8_entry_protocol
, m3u8_id
='hls', fatal
=False))
3312 if 'f4m' not in skip_protocols
:
3313 formats
.extend(self
._extract
_f
4m
_formats
(
3314 manifest_url('manifest.f4m'),
3315 video_id
, f4m_id
='hds', fatal
=False))
3316 if 'dash' not in skip_protocols
:
3317 formats
.extend(self
._extract
_mpd
_formats
(
3318 manifest_url('manifest.mpd'),
3319 video_id
, mpd_id
='dash', fatal
=False))
3320 if re
.search(r
'(?:/smil:|\.smil)', url_base
):
3321 if 'smil' not in skip_protocols
:
3322 rtmp_formats
= self
._extract
_smil
_formats
(
3323 manifest_url('jwplayer.smil'),
3324 video_id
, fatal
=False)
3325 for rtmp_format
in rtmp_formats
:
3326 rtsp_format
= rtmp_format
.copy()
3327 rtsp_format
['url'] = '%s/%s' % (rtmp_format
['url'], rtmp_format
['play_path'])
3328 del rtsp_format
['play_path']
3329 del rtsp_format
['ext']
3330 rtsp_format
.update({
3331 'url': rtsp_format
['url'].replace('rtmp://', 'rtsp://'),
3332 'format_id': rtmp_format
['format_id'].replace('rtmp', 'rtsp'),
3335 formats
.extend([rtmp_format
, rtsp_format
])
3337 for protocol
in ('rtmp', 'rtsp'):
3338 if protocol
not in skip_protocols
:
3340 'url': '%s:%s' % (protocol
, url_base
),
3341 'format_id': protocol
,
3342 'protocol': protocol
,
3346 def _find_jwplayer_data(self
, webpage
, video_id
=None, transform_source
=js_to_json
):
3348 r
'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P
=quote
)\
)(?
!</script
>).*?\
.setup\s
*\
((?P
<options
>[^
)]+)\
)',
3352 jwplayer_data = self._parse_json(mobj.group('options
'),
3354 transform_source=transform_source)
3355 except ExtractorError:
3358 if isinstance(jwplayer_data, dict):
3359 return jwplayer_data
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find JWPlayer setup options embedded in *webpage* and parse them.

    Thin convenience wrapper: locates the jwplayer(...).setup({...}) blob
    via _find_jwplayer_data (decoding it with js_to_json) and hands the
    resulting dict straight to _parse_jwplayer_data, forwarding any extra
    positional/keyword arguments unchanged.
    """
    return self._parse_jwplayer_data(
        self._find_jwplayer_data(webpage, video_id, transform_source=js_to_json),
        video_id, *args, **kwargs)
3367 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3368 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3369 # JWPlayer backward compatibility: flattened playlists
3370 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3371 if 'playlist
' not in jwplayer_data:
3372 jwplayer_data = {'playlist': [jwplayer_data]}
3376 # JWPlayer backward compatibility: single playlist item
3377 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3378 if not isinstance(jwplayer_data['playlist
'], list):
3379 jwplayer_data['playlist
'] = [jwplayer_data['playlist
']]
3381 for video_data in jwplayer_data['playlist
']:
3382 # JWPlayer backward compatibility: flattened sources
3383 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3384 if 'sources
' not in video_data:
3385 video_data['sources
'] = [video_data]
3387 this_video_id = video_id or video_data['mediaid
']
3389 formats = self._parse_jwplayer_formats(
3390 video_data['sources
'], video_id=this_video_id, m3u8_id=m3u8_id,
3391 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3394 tracks = video_data.get('tracks
')
3395 if tracks and isinstance(tracks, list):
3396 for track in tracks:
3397 if not isinstance(track, dict):
3399 track_kind = track.get('kind
')
3400 if not track_kind or not isinstance(track_kind, compat_str):
3402 if track_kind.lower() not in ('captions
', 'subtitles
'):
3404 track_url = urljoin(base_url, track.get('file'))
3407 subtitles.setdefault(track.get('label
') or 'en
', []).append({
3408 'url
': self._proto_relative_url(track_url)
3412 'id': this_video_id,
3413 'title
': unescapeHTML(video_data['title
'] if require_title else video_data.get('title
')),
3414 'description
': clean_html(video_data.get('description
')),
3415 'thumbnail
': urljoin(base_url, self._proto_relative_url(video_data.get('image
'))),
3416 'timestamp
': int_or_none(video_data.get('pubdate
')),
3417 'duration
': float_or_none(jwplayer_data.get('duration
') or video_data.get('duration
')),
3418 'subtitles
': subtitles,
3420 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3421 if len(formats) == 1 and re.search(r'^
(?
:http|
//).*(?
:youtube\
.com|youtu\
.be
)/.+', formats[0]['url
']):
3423 '_type
': 'url_transparent
',
3424 'url
': formats[0]['url
'],
3427 self._sort_formats(formats)
3428 entry['formats
'] = formats
3429 entries.append(entry)
3430 if len(entries) == 1:
3433 return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into format dicts.

        Each source is dispatched on its declared MIME type and/or the URL
        extension: HLS, DASH and SMIL sources are expanded through the
        dedicated manifest extractors; audio-only and plain progressive
        sources become single format dicts; rtmp* URLs are additionally
        split into an rtmp base URL and a play path.
        """
        urls = []      # source URLs already handled, used to drop duplicates
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
    def _live_title(self, name):
        """Deprecated shim: emits a deprecation warning and returns the
        title unchanged."""
        self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
        return name
3503 def _int(self, v, name, fatal=False, **kwargs):
3504 res = int_or_none(v, **kwargs)
3505 if 'get_attr
' in kwargs:
3506 print(getattr(v, kwargs['get_attr
']))
3508 msg = 'Failed to extract
%s: Could
not parse value
%r' % (name, v)
3510 raise ExtractorError(msg)
3512 self.report_warning(msg)
3515 def _float(self, v, name, fatal=False, **kwargs):
3516 res = float_or_none(v, **kwargs)
3518 msg = 'Failed to extract
%s: Could
not parse value
%r' % (name, v)
3520 raise ExtractorError(msg)
3522 self.report_warning(msg)
3525 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3526 path='/', secure=False, discard=False, rest={}, **kwargs):
3527 cookie = compat_cookiejar_Cookie(
3528 0, name, value, port, port is not None, domain, True,
3529 domain.startswith('.'), path, True, secure, expire_time,
3530 discard, None, None, rest)
3531 self._downloader.cookiejar.set_cookie(cookie)
    def _get_cookies(self, url):
        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
        # Build a throwaway request so the cookiejar can decide which of its
        # cookies apply to this URL and serialize them into a Cookie header.
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
3539 def _apply_first_set_cookie_header(self, url_handle, cookie):
3541 Apply first Set-Cookie header instead of the last. Experimental.
3543 Some sites (e.g. [1-3]) may serve two cookies under the same name
3544 in Set-Cookie header and expect the first (old) one to be set rather
3545 than second (new). However, as of RFC6265 the newer one cookie
3546 should be set into cookie store what actually happens.
3547 We will workaround this issue by resetting the cookie to
3548 the first one manually.
3549 1. https://new.vk.com/
3550 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3551 3. https://learning.oreilly.com/
3553 for header, cookies in url_handle.headers.items():
3554 if header.lower() != 'set-cookie
':
3556 if sys.version_info[0] >= 3:
3557 cookies = cookies.encode('iso
-8859-1')
3558 cookies = cookies.decode('utf
-8')
3559 cookie_value = re.search(
3560 r'%s=(.+?
);.*?
\b[Dd
]omain
=(.+?
)(?
:[,;]|$
)' % cookie, cookies)
3562 value, domain = cookie_value.groups()
3563 self._set_cookie(domain, cookie, value)
3566 def get_testcases(self, include_onlymatching=False):
3567 t = getattr(self, '_TEST
', None)
3569 assert not hasattr(self, '_TESTS
'), \
3570 '%s has _TEST
and _TESTS
' % type(self).__name__
3573 tests = getattr(self, '_TESTS
', [])
3575 if not include_onlymatching and t.get('only_matching
', False):
3577 t['name
'] = type(self).__name__[:-len('IE
')]
3580 def is_suitable(self, age_limit):
3581 """ Test whether the extractor is generally suitable for the given
3582 age limit (i.e. pornographic sites are not, all others usually are) """
3584 any_restricted = False
3585 for tc in self.get_testcases(include_onlymatching=False):
3586 if tc.get('playlist
', []):
3587 tc = tc['playlist
'][0]
3588 is_restricted = age_restricted(
3589 tc.get('info_dict
', {}).get('age_limit
'), age_limit)
3590 if not is_restricted:
3592 any_restricted = any_restricted or is_restricted
3593 return not any_restricted
3595 def extract_subtitles(self, *args, **kwargs):
3596 if (self.get_param('writesubtitles
', False)
3597 or self.get_param('listsubtitles
')):
3598 return self._get_subtitles(*args, **kwargs)
    def _get_subtitles(self, *args, **kwargs):
        # Override point: actually fetch subtitles (called by extract_subtitles()).
        raise NotImplementedError('This method must be implemented by subclasses')
    def extract_comments(self, *args, **kwargs):
        """Return a lazy comment extractor, or None when --get-comments is off.

        The returned zero-argument callable drains the generator produced by
        _get_comments() and packages the result as an info-dict fragment with
        'comments' and 'comment_count'.  'comment_count' is None when the
        extraction was interrupted before the generator was exhausted.
        """
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            interrupted = True  # assume the worst until the generator finishes
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                # Partial results are still returned; the count stays None.
                self.to_screen('Interrupted by user')
            except Exception as e:
                # Other errors abort extraction unless --ignore-errors is set.
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                'comment_count': None if interrupted else comment_count
            }
        return extractor
    def _get_comments(self, *args, **kwargs):
        # Override point: yield comment dicts (consumed by extract_comments()).
        raise NotImplementedError('This method must be implemented by subclasses')
3635 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3636 """ Merge subtitle items for one language. Items with duplicated URLs
3637 will be dropped. """
3638 list1_urls = set([item['url
'] for item in subtitle_list1])
3639 ret = list(subtitle_list1)
3640 ret.extend([item for item in subtitle_list2 if item['url
'] not in list1_urls])
3644 def _merge_subtitles(cls, *dicts, target=None):
3645 """ Merge subtitle dictionaries, language by language. """
3649 for lang, subs in d.items():
3650 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3653 def extract_automatic_captions(self, *args, **kwargs):
3654 if (self.get_param('writeautomaticsub
', False)
3655 or self.get_param('listsubtitles
')):
3656 return self._get_automatic_captions(*args, **kwargs)
    def _get_automatic_captions(self, *args, **kwargs):
        # Override point: fetch auto-captions (called by extract_automatic_captions()).
        raise NotImplementedError('This method must be implemented by subclasses')
3662 def mark_watched(self, *args, **kwargs):
3663 if not self.get_param('mark_watched
', False):
3665 if (self._get_login_info()[0] is not None
3666 or self.get_param('cookiefile
')
3667 or self.get_param('cookiesfrombrowser
')):
3668 self._mark_watched(*args, **kwargs)
    def _mark_watched(self, *args, **kwargs):
        # Override point: site-specific watch-history update (called by mark_watched()).
        raise NotImplementedError('This method must be implemented by subclasses')
3673 def geo_verification_headers(self):
3675 geo_verification_proxy = self.get_param('geo_verification_proxy
')
3676 if geo_verification_proxy:
3677 headers['Ytdl
-request
-proxy
'] = geo_verification_proxy
3680 def _generic_id(self, url):
3681 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3683 def _generic_title(self, url):
3684 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3687 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3688 all_known = all(map(
3689 lambda x: x is not None,
3690 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3692 'private
' if is_private
3693 else 'premium_only
' if needs_premium
3694 else 'subscriber_only
' if needs_subscription
3695 else 'needs_auth
' if needs_auth
3696 else 'unlisted
' if is_unlisted
3697 else 'public
' if all_known
3700 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3702 @returns A list of values for the extractor argument given by "key"
3703 or "default" if no such key is present
3704 @param default The default value to return when the key is not present (default: [])
3705 @param casesense When false, the values are converted to lower case
3708 self._downloader.params, ('extractor_args
', (ie_key or self.ie_key()).lower(), key))
3710 return [] if default is NO_DEFAULT else default
3711 return list(val) if casesense else [x.lower() for x in val]
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    # Subclasses may lower this to cap how many results can be requested.
    _MAX_RESULTS = float('inf')

    @classmethod
    def _make_valid_url(cls):
        # prefix is '' (one result), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        # Split "<key><prefix>:<terms>" into the result-count prefix and
        # the actual search terms.
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp over-large requests instead of failing.
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        return self.playlist_result(
            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
            query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY