import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
    parse_m3u8_attributes,

    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                     - HTTP URL to plain file media (in case of
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                   for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                                 The URL of the manifest file in case of
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. Only applied by the native HLS/DASH downloaders.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args  Extra arguments for ffmpeg downloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).
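
    As a purely illustrative sketch (every value below is made up), a minimal
    "video" result could therefore look like:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video_360p.mp4',
                'format_id': 'mp4-360p',
                'ext': 'mp4',
                'width': 640,
                'height': 360,
            }],
        }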

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                          Set to "root" to indicate that this is a
                          comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                          favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                          the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    heatmap:        A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the data point in seconds
                        * "end_time" - The end time of the data point in seconds
                        * "value" - The normalized value of the data point (float between 0 and 1)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
    disc_number:    Number of the disc or other physical medium the track belongs to,
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
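
    For illustration only (the id, title and entries are made up), a
    "playlist" result might look like:

        {
            '_type': 'playlist',
            'id': 'some-channel-uploads',
            'title': 'Uploads',
            'entries': [video_info_dict_1, video_info_dict_2],
        }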
443 _type "multi_video" indicates that there are multiple videos that
444 form a single show, for examples multiple acts of an opera or TV episode.
445 It must have an entries key like a playlist and contain all the keys
446 required for a video at the same time.
449 _type "url" indicates that the video must be extracted from another
450 location, possibly by a different extractor. Its only required key is:
451 "url" - the next URL to extract.
452 The key "ie_key" can be set to the class name (minus the trailing "IE",
453 e.g. "Youtube") if the extractor class is known in advance.
454 Additionally, the dictionary may have any properties of the resolved entity
455 known in advance, for example "title" if the title of the referred video is
459 _type "url_transparent" entities have the same specification as "url", but
460 indicate that the given additional information is more precise than the one
461 associated with the resolved URL.
462 This is useful when a site employs a video service that hosts the video and
463 its technical metadata, but that video service does not embed a useful
464 title, description etc.
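
    A hypothetical sketch of such a "url_transparent" result (the URL and the
    metadata values are illustrative only):

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example.com/embed/abc123',
            'title': 'Title taken from the referring page',
            'description': 'Description taken from the referring page',
        }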

    Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp and re-define the _real_extract() and
    (optionally) _real_initialize() methods.
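
    A minimal sketch of such a subclass (the name, _VALID_URL and page layout
    are entirely hypothetical):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }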

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    _x_forwarded_for_ip = None
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _NETRC_MACHINE = None

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    def get_temp_id(cls, url):
            return cls._match_id(url)
        except (IndexError, AttributeError):

        """Getter method for _WORKING."""

    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()

    def _initialize_geo_bypass(self, geo_bypass_context):
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
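
        For example, an extractor that only discovers the unrestricted
        countries during extraction could call (country codes here are
        illustrative only):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })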
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
        except UnsupportedError:
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

        return self._downloader.cache

        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

        return cls.__name__[:-2]

    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, urllib.error.HTTPError)
        if expected_status is None:
        elif callable(expected_status):
            return expected_status(err.code) is True
            return err.code in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            return update_Request(url_or_request, data=data, headers=headers, query=query)
            url_or_request = update_url_query(url_or_request, query)
        return sanitized_Request(url_or_request, data, headers or {})

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        self._downloader._first_webpage_request = False

            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen(str(note))
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.

                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
                raise ExtractorError(errmsg, cause=err)
                self.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        Return a tuple (page content as string, URL handle).

        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
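
        For illustration only (the status code is hypothetical), a page whose
        body is still useful on a 404 response could be fetched with:

            webpage, urlh = self._download_webpage_handle(
                url, video_id, expected_status=404)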
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, str):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'

    def __decode_webpage(self, webpage_bytes, encoding, headers):
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
            return webpage_bytes.decode(encoding, 'replace')
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.geturl(), video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

    def __print_error(self, errnote, fatal, video_id, err):
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
            xml_string = transform_source(xml_string)
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
                'transform_source': transform_source,
                'encoding': encoding,
                'expected_status': expected_status,
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        Return the data of the page as a string.

        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        R'''  # NB: These are unused; should they be deprecated?
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except http.client.IncompleteRead as e:
                if try_count >= tries:
            self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
            if f'WARNING: {msg}' in self._printed_messages:
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)

    def report_drm(self, video_id, partial=NO_DEFAULT):
        if partial is not NO_DEFAULT:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608

    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
            '_type': 'url_transparent' if url_transparent else 'url',

    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
            kwargs['id'] = playlist_id
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
            '_type': 'multi_video' if multi_video else 'playlist',

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
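
        A purely illustrative call (the pattern and field name are made up):

            video_id = self._search_regex(
                r'data-video-id="(\d+)"', webpage, 'video id')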
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
                mobj = re.search(p, string, flags)

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract %s' % _name)
            self.report_warning('unable to extract %s' % _name + bug_reports_message())

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        Like _search_regex, but strips HTML tags and unescapes entities.
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if isinstance(res, tuple):
            return tuple(map(clean_html, res))
        return clean_html(res)

    def _get_netrc_login_info(self, netrc_machine=None):
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
        If there's no info available, return (None, None)

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        tfa = self.get_param('twofactor')

        return getpass.getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                       % {'prop': re.escape(prop), 'sep': '(?::|[:-])'})
        template = r'<meta[^>]+?%s[^>]+?%s'
            template % (property_re, content_re),
            template % (content_re, property_re),

    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
            name = 'OpenGraph %s' % prop[0]
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, *, fatal=False, **kargs):
        return self._og_search_property('title', html, fatal=fatal, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',

        # And then there are the jokers who advertise that they use RTA, but actually don't.
        AGE_LIMIT_MARKERS = [
            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
            r'>[^<]*you acknowledge you are at least (\d+) years old',
            r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',

        for marker in AGE_LIMIT_MARKERS:
            mobj = re.search(marker, html)
                age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        return RATING_TABLE.get(rating.lower())
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:

        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
        """Yield all json ld objects in the html"""
        if default is not NO_DEFAULT:
        for mobj in re.finditer(JSON_LD_RE, html):
            json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
            for json_ld in variadic(json_ld_item):
                if isinstance(json_ld, dict):

    def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
        """Search for a video in any json ld in the html"""
        if default is not NO_DEFAULT:
        info = self._json_ld(
            list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
            video_id, fatal=fatal, expected_type=expected_type)
        if default is not NO_DEFAULT:
            raise RegexNotFoundError('Unable to extract JSON-LD')
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)

        INTERACTION_TYPE_MAP = {
            'CommentAction': 'comment',
            'AgreeAction': 'like',
            'DisagreeAction': 'dislike',
            'LikeAction': 'like',
            'DislikeAction': 'dislike',
            'ListenAction': 'view',
            'WatchAction': 'view',
            'ViewAction': 'view',

        def is_type(e, *expected_types):
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)
        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                info[count_key] = interaction_count

        def extract_chapter_information(e):
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

        def extract_video_object(e):
            author = e.get('author')
                'url': url_or_none(e.get('contentUrl')),
                'ext': mimetype2ext(e.get('encodingFormat')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property(inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
                'tags': try_call(lambda: e.get('keywords').split(',')),
            if is_type(e, 'AudioObject'):
                'abr': int_or_none(e.get('bitrate')),
            extract_interaction_statistic(e)
            extract_chapter_information(e)
1594 def traverse_json_ld(json_ld
, at_top_level
=True):
1595 for e
in variadic(json_ld
):
1596 if not isinstance(e
, dict):
1598 if at_top_level
and '@context' not in e
:
1600 if at_top_level
and set(e
.keys()) == {'@context', '@graph'}
:
1601 traverse_json_ld(e
['@graph'], at_top_level
=False)
1603 if expected_type
is not None and not is_type(e
, expected_type
):
1605 rating
= traverse_obj(e
, ('aggregateRating', 'ratingValue'), expected_type
=float_or_none
)
1606 if rating
is not None:
1607 info
['average_rating'] = rating
1608 if is_type(e
, 'TVEpisode', 'Episode'):
1609 episode_name
= unescapeHTML(e
.get('name'))
1611 'episode': episode_name
,
1612 'episode_number': int_or_none(e
.get('episodeNumber')),
1613 'description': unescapeHTML(e
.get('description')),
1615 if not info
.get('title') and episode_name
:
1616 info
['title'] = episode_name
1617 part_of_season
= e
.get('partOfSeason')
1618 if is_type(part_of_season
, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1620 'season': unescapeHTML(part_of_season
.get('name')),
1621 'season_number': int_or_none(part_of_season
.get('seasonNumber')),
1623 part_of_series
= e
.get('partOfSeries') or e
.get('partOfTVSeries')
1624 if is_type(part_of_series
, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1625 info
['series'] = unescapeHTML(part_of_series
.get('name'))
1626 elif is_type(e
, 'Movie'):
1628 'title': unescapeHTML(e
.get('name')),
1629 'description': unescapeHTML(e
.get('description')),
1630 'duration': parse_duration(e
.get('duration')),
1631 'timestamp': unified_timestamp(e
.get('dateCreated')),
1633 elif is_type(e
, 'Article', 'NewsArticle'):
1635 'timestamp': parse_iso8601(e
.get('datePublished')),
1636 'title': unescapeHTML(e
.get('headline')),
1637 'description': unescapeHTML(e
.get('articleBody') or e
.get('description')),
1639 if is_type(traverse_obj(e
, ('video', 0)), 'VideoObject'):
1640 extract_video_object(e
['video'][0])
1641 elif is_type(traverse_obj(e
, ('subjectOf', 0)), 'VideoObject'):
1642 extract_video_object(e
['subjectOf'][0])
1643 elif is_type(e
, 'VideoObject', 'AudioObject'):
1644 extract_video_object(e
)
1645 if expected_type
is None:
1649 video
= e
.get('video')
1650 if is_type(video
, 'VideoObject'):
1651 extract_video_object(video
)
1652 if expected_type
is None:
1657 traverse_json_ld(json_ld
)
1658 return filter_dict(info
)
    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', fatal=fatal, **kw),
            video_id, transform_source=transform_source, fatal=fatal)
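    # The regex above targets the JSON state that Next.js embeds in every page, e.g.:
    #   <script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{...}}}</script>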
    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
        rectx = re.escape(context_name)
        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
        js, arg_keys, arg_vals = self._search_regex(
            (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
            default=NO_DEFAULT if fatal else (None, None, None))
        if js is None:
            return {}

        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
            f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))

        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
        return traverse_obj(ret, traverse) or {}
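    # The patterns above match Nuxt state serialised as an IIFE, e.g.:
    #   <script>window.__NUXT__=(function(a,b){return {data:[{title:a}]}}("Some title",1));</script>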
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
    @classproperty(cache=True)
    def FormatSort(cls):
        class FormatSort(FormatSorter):
            def __init__(ie, *args, **kwargs):
                super().__init__(ie._downloader, *args, **kwargs)

        deprecation_warning(
            'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
            'Use yt_dlp.utils.FormatSorter instead')
        return FormatSort

    def _sort_formats(self, formats, field_preference=[]):
        if not field_preference:
            self._downloader.deprecation_warning(
                'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
            return
        self._downloader.deprecation_warning(
            'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
            'Return _format_sort_fields in the info_dict instead')
        if formats:
            formats[0]['__sort_fields'] = field_preference
    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats
    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                '%s: %s URL is invalid, skipping: %s'
                % (video_id, item, error_to_compat_str(e.cause)))
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self.get_param('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        scheme = scheme or self.http_scheme()
        assert scheme.endswith(':')
        return sanitize_url(url, scheme=scheme[:-1])

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        res = self._download_xml_handle(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return []

        manifest, urlh = res
        manifest_url = urlh.geturl()

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = join_nonempty(f4m_id, tbr or i)
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'preference': preference,
            })
        return formats
    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        return {
            'format_id': join_nonempty(m3u8_id, 'meta'),
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _report_ignoring_subs(self, name):
        self.report_warning(bug_reports_message(
            f'Ignoring subtitle tracks found in the {name} manifest; '
            'if any subtitle tracks are missing,'
        ), only_once=True)

    def _extract_m3u8_formats(self, *args, **kwargs):
        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('HLS')
        return fmts
    def _extract_m3u8_formats_and_subtitles(
            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, note=None,
            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        if not m3u8_url:
            if errnote is not False:
                errnote = errnote or 'Failed to obtain m3u8 URL'
                if fatal:
                    raise ExtractorError(errnote, video_id=video_id)
                self.report_warning(f'{errnote}{bug_reports_message()}')
            return [], {}

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note='Downloading m3u8 information' if note is None else note,
            errnote='Failed to download m3u8 information' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats_and_subtitles(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)

    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        formats, subtitles = [], {}
        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.
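        # e.g. a media playlist only contains segments:
        #     #EXTM3U
        #     #EXT-X-TARGETDURATION:10
        #     #EXTINF:9.009,
        #     segment0.ts
        # while a master playlist lists variant streams instead:
        #     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
        #     low/playlist.m3u8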
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'protocol': entry_protocol,
                'preference': preference,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'protocol': entry_protocol,
                    'preference': preference,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
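                    # USP variant URLs typically embed the bitrates directly,
                    # e.g. '...-audio_eng=128000-video=1200000.m3u8' (values in bps)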
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
    def _extract_m3u8_vod_duration(
            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):

        m3u8_vod = self._download_webpage(
            m3u8_vod_url, video_id,
            note='Downloading m3u8 VOD manifest' if note is None else note,
            errnote='Failed to download VOD manifest' if errnote is None else errnote,
            fatal=False, data=data, headers=headers, query=query)

        return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)

    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
        if '#EXT-X-ENDLIST' not in m3u8_vod:
            return None

        return int(sum(
            float(line[len('#EXTINF:'):].split(',')[0])
            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
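    # e.g. a VOD playlist containing '#EXTINF:10.000,', '#EXTINF:9.500,' and '#EXTINF:3.250,'
    # sums to a duration of 22 seconds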
    def _extract_mpd_vod_duration(
            self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):

        mpd_doc = self._download_xml(
            mpd_url, video_id,
            note='Downloading MPD VOD manifest' if note is None else note,
            errnote='Failed to download VOD manifest' if errnote is None else errnote,
            fatal=False, data=data, headers=headers, query=query) or {}
        return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
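    # e.g. _xpath_ns('./head/meta', 'urn:example:ns')
    #      -> './{urn:example:ns}head/{urn:example:ns}meta'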
    def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
        if res is False:
            return [], {}

        smil, urlh = res
        smil_url = urlh.geturl()

        namespace = self._parse_smil_namespace(smil)

        fmts = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subs = self._parse_smil_subtitles(
            smil, namespace=namespace)

        return fmts, subs

    def _extract_smil_formats(self, *args, **kwargs):
        fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('SMIL')
        return fmts

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        res = self._download_smil(smil_url, video_id, fatal=fatal)
        if res is False:
            return {}

        smil, urlh = res
        smil_url = urlh.geturl()

        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        return self._download_xml_handle(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0
        imgs_count = 0

        srcs = set()
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
                self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    })
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += urllib.parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'filesize': filesize,
                })

        for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.add(src)

            imgs_count += 1
            formats.append({
                'format_id': 'imagestream-%d' % (imgs_count),
                'url': src,
                'ext': mimetype2ext(medium.get('type')),
                'acodec': 'none',
                'vcodec': 'none',
                'width': int_or_none(medium.get('width')),
                'height': int_or_none(medium.get('height')),
                'format_note': 'SMIL storyboards',
            })

        return formats
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
                continue
            urls.append(src)
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        res = self._download_xml_handle(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if res is False:
            return []

        xspf, urlh = res
        xspf_url = urlh.geturl()

        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))

    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = []
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
                })

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries
    def _extract_mpd_formats(self, *args, **kwargs):
        fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('DASH')
        return fmts
    def _extract_mpd_formats_and_subtitles(
            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
            fatal=True, data=None, headers={}, query={}):

        if self.get_param('ignore_no_formats_error'):
            fatal = False

        res = self._download_xml_handle(
            mpd_url, video_id,
            note='Downloading MPD manifest' if note is None else note,
            errnote='Failed to download MPD manifest' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}

        mpd_doc, urlh = res
        # We could have been redirected to a new url when we retrieved our mpd file.
        mpd_url = urlh.geturl()
        mpd_base_url = base_url(mpd_url)

        return self._parse_mpd_formats_and_subtitles(
            mpd_doc, mpd_id, mpd_base_url, mpd_url)
    def _parse_mpd_formats(self, *args, **kwargs):
        fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('DASH')
        return fmts
    def _parse_mpd_formats_and_subtitles(
            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        if not self.get_param('dynamic_mpd', True):
            if mpd_doc.get('type') == 'dynamic':
                return [], {}

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats, subtitles = [], {}
        stream_numbers = collections.defaultdict(int)
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                    codec_str = representation_attrib.get('codecs', '')
                    # Some kind of binary subtitle found in some youtube livestreams
                    if mime_type == 'application/x-rawcc':
                        codecs = {'scodec': codec_str}
                    else:
                        codecs = parse_codecs(codec_str)
                    if content_type not in ('video', 'audio', 'text'):
                        if mime_type == 'image/jpeg':
                            content_type = mime_type
                        elif codecs.get('vcodec', 'none') != 'none':
                            content_type = 'video'
                        elif codecs.get('acodec', 'none') != 'none':
                            content_type = 'audio'
                        elif codecs.get('scodec', 'none') != 'none':
                            content_type = 'text'
                        elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                            content_type = 'text'
                        else:
                            self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                            continue

                    base_url = ''
                    for element in (representation, adaptation_set, period, mpd_doc):
                        base_url_e = element.find(_add_ns('BaseURL'))
                        if try_call(lambda: base_url_e.text) is not None:
                            base_url = base_url_e.text + base_url
                            if re.match(r'^https?://', base_url):
                                break
                    if mpd_base_url and base_url.startswith('/'):
                        base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                    elif mpd_base_url and not re.match(r'^https?://', base_url):
                        if not mpd_base_url.endswith('/'):
                            mpd_base_url += '/'
                        base_url = mpd_base_url + base_url
                    representation_id = representation_attrib.get('id')
                    lang = representation_attrib.get('lang')
                    url_el = representation.find(_add_ns('BaseURL'))
                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                    bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                    if representation_id is not None:
                        format_id = representation_id
                    else:
                        format_id = content_type
                    if mpd_id:
                        format_id = mpd_id + '-' + format_id
                    if content_type in ('video', 'audio'):
                        f = {
                            'format_id': format_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        }
                    elif content_type == 'text':
                        f = {
                            'ext': mimetype2ext(mime_type),
                            'manifest_url': mpd_url,
                            'filesize': filesize,
                        }
                    elif content_type == 'image/jpeg':
                        # See test case in VikiIE
                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                        f = {
                            'format_id': format_id,
                            'manifest_url': mpd_url,
                            'format_note': 'DASH storyboards (jpeg)',
                        }
                    if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                        f['has_drm'] = True
                    representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                    def prepare_template(template_name, identifiers):
                        tmpl = representation_ms_info[template_name]
                        if representation_id is not None:
                            tmpl = tmpl.replace('$RepresentationID$', representation_id)
                        # First of, % characters outside $...$ templates
                        # must be escaped by doubling for proper processing
                        # by % operator string formatting used further (see
                        # https://github.com/ytdl-org/youtube-dl/issues/16867).
                        t = ''
                        in_template = False
                        for c in tmpl:
                            t += c
                            if c == '$':
                                in_template = not in_template
                            elif c == '%' and not in_template:
                                t += c
                        # Next, $...$ templates are translated to their
                        # %(...) counterparts to be used with % operator
                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                        t.replace('$$', '$')
                        return t
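                        # e.g. 'seg-$RepresentationID$-$Number%05d$.m4s' for representation 'video_1'
                        # becomes 'seg-video_1-%(Number)05d.m4s', ready for the % substitutions below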
                    # @initialization is a regular template like @media one
                    # so it should be handled just the same way (see
                    # https://github.com/ytdl-org/youtube-dl/issues/11605)
                    if 'initialization' in representation_ms_info:
                        initialization_template = prepare_template(
                            'initialization',
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            ('Bandwidth', ))
                        representation_ms_info['initialization_url'] = initialization_template % {
                            'Bandwidth': bandwidth,
                        }

                    def location_key(location):
                        return 'url' if re.match(r'^https?://', location) else 'path'

                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                        media_location_key = location_key(media_template)

                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                        # can't be used at the same time
                        if '%(Number' in media_template and 's' not in representation_ms_info:
                            segment_duration = None
                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                representation_ms_info['total_number'] = int(math.ceil(
                                    float_or_none(period_duration, segment_duration, default=0)))
                            representation_ms_info['fragments'] = [{
                                media_location_key: media_template % {
                                    'Number': segment_number,
                                    'Bandwidth': bandwidth,
                                },
                                'duration': segment_duration,
                            } for segment_number in range(
                                representation_ms_info['start_number'],
                                representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                        else:
                            # $Number*$ or $Time$ in media template with S list available
                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                            representation_ms_info['fragments'] = []
                            segment_time = 0
                            segment_d = None
                            segment_number = representation_ms_info['start_number']

                            def add_segment_url():
                                segment_url = media_template % {
                                    'Time': segment_time,
                                    'Bandwidth': bandwidth,
                                    'Number': segment_number,
                                }
                                representation_ms_info['fragments'].append({
                                    media_location_key: segment_url,
                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                })

                            for num, s in enumerate(representation_ms_info['s']):
                                segment_time = s.get('t') or segment_time
                                segment_d = s['d']
                                add_segment_url()
                                segment_number += 1
                                for r in range(s.get('r', 0)):
                                    segment_time += segment_d
                                    add_segment_url()
                                    segment_number += 1
                                segment_time += segment_d
                    elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                        # No media template,
                        # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                        # or any YouTube dashsegments video
                        fragments = []
                        segment_index = 0
                        timescale = representation_ms_info['timescale']
                        for s in representation_ms_info['s']:
                            duration = float_or_none(s['d'], timescale)
                            for r in range(s.get('r', 0) + 1):
                                segment_uri = representation_ms_info['segment_urls'][segment_index]
                                fragments.append({
                                    location_key(segment_uri): segment_uri,
                                    'duration': duration,
                                })
                                segment_index += 1
                        representation_ms_info['fragments'] = fragments
                    elif 'segment_urls' in representation_ms_info:
                        # Segment URLs with no SegmentTimeline
                        # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                        # https://github.com/ytdl-org/youtube-dl/pull/14844
                        fragments = []
                        segment_duration = float_or_none(
                            representation_ms_info['segment_duration'],
                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                        for segment_url in representation_ms_info['segment_urls']:
                            fragment = {
                                location_key(segment_url): segment_url,
                            }
                            if segment_duration:
                                fragment['duration'] = segment_duration
                            fragments.append(fragment)
                        representation_ms_info['fragments'] = fragments
                    # If there is a fragments key available then we correctly recognized fragmented media.
                    # Otherwise we will assume unfragmented media with direct access. Technically, such
                    # assumption is not necessarily correct since we may simply have no support for
                    # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                    if 'fragments' in representation_ms_info:
                        f.update({
                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
                            'url': mpd_url or base_url,
                            'fragment_base_url': base_url,
                            'fragments': [],
                            'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                        })
                        if 'initialization_url' in representation_ms_info:
                            initialization_url = representation_ms_info['initialization_url']
                            if not f.get('url'):
                                f['url'] = initialization_url
                            f['fragments'].append({location_key(initialization_url): initialization_url})
                        f['fragments'].extend(representation_ms_info['fragments'])
                        if not period_duration:
                            period_duration = try_get(
                                representation_ms_info,
                                lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                    else:
                        # Assuming direct URL to unfragmented media.
                        f['url'] = base_url
                    if content_type in ('video', 'audio', 'image/jpeg'):
                        f['manifest_stream_number'] = stream_numbers[f['url']]
                        stream_numbers[f['url']] += 1
                        formats.append(f)
                    elif content_type == 'text':
                        subtitles.setdefault(lang or 'und', []).append(f)

        return formats, subtitles
    def _extract_ism_formats(self, *args, **kwargs):
        fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('ISM')
        return fmts
    def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        if self.get_param('ignore_no_formats_error'):
            fatal = False

        res = self._download_xml_handle(
            ism_url, video_id,
            note='Downloading ISM manifest' if note is None else note,
            errnote='Failed to download ISM manifest' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return [], {}

        ism_doc, urlh = res

        return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)

    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        if ism_doc.get('IsLive') == 'TRUE':
            return [], {}

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        subtitles = {}
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio', 'text'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            stream_language = stream.get('Language', 'und')
            for track in stream.findall('QualityLevel'):
                KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
                fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
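                # e.g. an Url pattern such as 'QualityLevels({bitrate})/Fragments(video={start time})'
                # now reads 'QualityLevels(1500000)/Fragments(video={start time})', resolved against
                # ism_url; {start time} is filled in per fragment below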
                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        try:
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                if stream_type == 'text':
                    subtitles.setdefault(stream_language, []).append({
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'fragments': fragments,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                        },
                    })
                elif stream_type in ('video', 'audio'):
                    formats.append({
                        'format_id': join_nonempty(ism_id, stream_name, tbr),
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'ext': 'ismv' if stream_type == 'video' else 'isma',
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'asr': sampling_rate,
                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
                        'acodec': 'none' if stream_type == 'video' else fourcc,
                        'fragments': fragments,
                        'has_drm': ism_doc.find('Protection') is not None,
                        'language': stream_language,
                        'audio_channels': int_or_none(track.get('Channels')),
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'width': width or 0,
                            'height': height or 0,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                            'sampling_rate': sampling_rate,
                            'channels': int_or_none(track.get('Channels', 2)),
                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                        },
                    })
        return formats, subtitles
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        f.update({
                            'width': width,
                            'height': height,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)

        return entries
    def _extract_akamai_formats(self, *args, **kwargs):
        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('akamai')
        return fmts
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
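        # e.g. 'https://example-vh.akamaihd.net/i/video/master.m3u8'
        #   -> 'https://example-vh.akamaihd.net/z/video/manifest.f4m' (HDS variant of the same stream)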
3154 hds_host
= hosts
.get('hds')
3156 f4m_url
= re
.sub(r
'(https?://)[^/]+', r
'\1' + hds_host
, f4m_url
)
3157 if 'hdcore=' not in f4m_url
:
3158 f4m_url
+= ('&' if '?' in f4m_url
else '?') + hdcore_sign
3159 f4m_formats
= self
._extract
_f
4m
_formats
(
3160 f4m_url
, video_id
, f4m_id
='hds', fatal
=False)
3161 for entry
in f4m_formats
:
3162 entry
.update({'extra_param_to_segment_url': hdcore_sign}
)
3163 formats
.extend(f4m_formats
)
3165 m3u8_url
= re
.sub(r
'(https?://[^/]+)/z/', r
'\1/i/', manifest_url
).replace('/manifest.f4m', '/master.m3u8')
3166 hls_host
= hosts
.get('hls')
3168 m3u8_url
= re
.sub(r
'(https?://)[^/]+', r
'\1' + hls_host
, m3u8_url
)
3169 m3u8_formats
, m3u8_subtitles
= self
._extract
_m
3u8_formats
_and
_subtitles
(
3170 m3u8_url
, video_id
, 'mp4', 'm3u8_native',
3171 m3u8_id
='hls', fatal
=False)
3172 formats
.extend(m3u8_formats
)
3173 subtitles
= self
._merge
_subtitles
(subtitles
, m3u8_subtitles
)
3175 http_host
= hosts
.get('http')
3176 if http_host
and m3u8_formats
and not signed
:
3177 REPL_REGEX
= r
'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3178 qualities
= re
.match(REPL_REGEX
, m3u8_url
).group(2).split(',')
3179 qualities_length
= len(qualities
)
3180 if len(m3u8_formats
) in (qualities_length
, qualities_length
+ 1):
3182 for f
in m3u8_formats
:
3183 if f
['vcodec'] != 'none':
3184 for protocol
in ('http', 'https'):
3186 del http_f
['manifest_url']
3188 REPL_REGEX
, protocol
+ fr
'://{http_host}/\g<1>{qualities[i]}\3', f
['url'])
3190 'format_id': http_f
['format_id'].replace('hls-', protocol
+ '-'),
3192 'protocol': protocol
,
3194 formats
.append(http_f
)
3197 return formats
, subtitles
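    # Illustrative sketch (not part of the upstream source): a site extractor
    # would typically feed an Akamai manifest URL straight into this helper and
    # return the result. `manifest_url`, `video_id`, `title` and the host name
    # below are hypothetical.
    #
    #   formats, subtitles = self._extract_akamai_formats_and_subtitles(
    #       manifest_url, video_id, hosts={'http': 'example-vh.akamaihd.net'})
    #   return {'id': video_id, 'title': title, 'formats': formats, 'subtitles': subtitles}
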
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        query = urllib.parse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            m_url = f'{http_base_url}/{manifest}'
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': f'{protocol}:{url_base}',
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats

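    # Illustrative sketch (not part of the upstream source): given a Wowza
    # streaming-engine URL, the helper probes HLS/HDS/DASH (and SMIL/RTMP/RTSP)
    # variants of the same stream. The URL below is hypothetical.
    #
    #   formats = self._extract_wowza_formats(
    #       'https://wowza.example.com/live/stream/playlist.m3u8', video_id,
    #       skip_protocols=['f4m', 'dash'])
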
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        mobj = re.search(
            r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
            webpage)
        if mobj:
            try:
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 video_id=video_id,
                                                 transform_source=transform_source)
            except ExtractorError:
                pass
            else:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data

    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
        jwplayer_data = self._find_jwplayer_data(
            webpage, video_id, transform_source=js_to_json)
        return self._parse_jwplayer_data(
            jwplayer_data, video_id, *args, **kwargs)

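    # Illustrative sketch (not part of the upstream source): most callers only
    # need the one-shot wrapper above, which locates the jwplayer(...).setup(...)
    # call in the page and converts it into an info dict. `webpage`, `video_id`
    # and `url` are assumed to come from _download_webpage()/_match_id().
    #
    #   info = self._extract_jwplayer_data(
    #       webpage, video_id, require_title=False, base_url=url)
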
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        entries = []
        if not isinstance(jwplayer_data, dict):
            return entries

        playlist_items = jwplayer_data.get('playlist')
        # JWPlayer backward compatibility: single playlist item/flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if not isinstance(playlist_items, list):
            playlist_items = (playlist_items or jwplayer_data, )

        for video_data in playlist_items:
            if not isinstance(video_data, dict):
                continue
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
                'genre': clean_html(video_data.get('genre')),
                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
                'season_number': int_or_none(video_data.get('season')),
                'episode_number': int_or_none(video_data.get('episode')),
                'release_year': int_or_none(video_data.get('releasedate')),
                'age_limit': int_or_none(video_data.get('age_restriction')),
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)

    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        urls = set()
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.add(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                format_id = str_or_none(source.get('label'))
                height = int_or_none(source.get('height'))
                if height is None and format_id:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = parse_resolution(format_id).get('height')
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate'), scale=1000),
                    'filesize': int_or_none(source.get('filesize')),
                    'ext': ext,
                    'format_id': format_id
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats

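    # Illustrative sketch (not part of the upstream source): a typical JWPlayer
    # "sources" array that this parser consumes (all values hypothetical).
    #
    #   sources = [
    #       {'file': '//cdn.example.com/master.m3u8', 'type': 'hls'},
    #       {'file': 'https://cdn.example.com/720.mp4', 'label': '720p', 'bitrate': 2000000},
    #   ]
    #   formats = self._parse_jwplayer_formats(sources, video_id=video_id, base_url=url)
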
    def _live_title(self, name):
        self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
        return name

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = f'Failed to extract {name}: Could not parse value {v!r}'
            if fatal:
                raise ExtractorError(msg)
            else:
                self.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = f'Failed to extract {name}: Could not parse value {v!r}'
            if fatal:
                raise ExtractorError(msg)
            else:
                self.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        cookie = http.cookiejar.Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a http.cookies.SimpleCookie with the cookies for the url """
        return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))

    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply the first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in the Set-Cookie header and expect the first (old) one to be set
        rather than the second (new) one. However, per RFC 6265 the newer
        cookie is the one that ends up in the cookie store, which is what
        actually happens. We work around this by manually resetting the
        cookie to the first one.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            cookies = cookies.encode('iso-8859-1').decode('utf-8')
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break

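    # Illustrative sketch (not part of the upstream source): the cookie helpers
    # are usually combined like this; the domain, cookie names and URL are
    # hypothetical.
    #
    #   self._set_cookie('.example.com', 'consent', 'yes')
    #   consent = self._get_cookies('https://www.example.com/').get('consent')
    #   urlh = self._request_webpage(url, video_id)
    #   self._apply_first_set_cookie_header(urlh, 'sessionid')
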
    @classmethod
    def get_testcases(cls, include_onlymatching=False):
        # Do not look in super classes
        t = vars(cls).get('_TEST')
        if t:
            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
            tests = [t]
        else:
            tests = vars(cls).get('_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = cls.ie_key()
            yield t
        if getattr(cls, '__wrapped__', None):
            yield from cls.__wrapped__.get_testcases(include_onlymatching)

    @classmethod
    def get_webpage_testcases(cls):
        tests = vars(cls).get('_WEBPAGE_TESTS', [])
        for t in tests:
            t['name'] = cls.ie_key()
            yield t
        if getattr(cls, '__wrapped__', None):
            yield from cls.__wrapped__.get_webpage_testcases()

    @classproperty(cache=True)
    def age_limit(cls):
        """Get age limit from the testcases"""
        return max(traverse_obj(
            (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
            (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])

    @classproperty(cache=True)
    def _RETURN_TYPE(cls):
        """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
        tests = tuple(cls.get_testcases(include_onlymatching=False))
        if not tests:
            return None
        elif not any(k.startswith('playlist') for test in tests for k in test):
            return 'video'
        elif all(any(k.startswith('playlist') for k in test) for test in tests):
            return 'playlist'
        return 'any'

    @classmethod
    def is_single_video(cls, url):
        """Returns whether the URL is of a single video, None if unknown"""
        if cls.suitable(url):
            return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)

    @classmethod
    def is_suitable(cls, age_limit):
        """Test whether the extractor is generally suitable for the given age limit"""
        return not age_restricted(cls.age_limit, age_limit)

    @classmethod
    def description(cls, *, markdown=True, search_examples=None):
        """Description of the extractor"""
        desc = ''
        if cls._NETRC_MACHINE:
            if markdown:
                desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
            else:
                desc += f' [{cls._NETRC_MACHINE}]'
        if cls.IE_DESC is False:
            desc += ' [HIDDEN]'
        elif cls.IE_DESC:
            desc += f' {cls.IE_DESC}'
        if cls.SEARCH_KEY:
            desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
            if search_examples:
                _COUNTS = ('', '5', '10', 'all')
                desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
        if not cls.working():
            desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'

        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
        return f'{name}:{desc}' if desc else name

    def extract_subtitles(self, *args, **kwargs):
        if (self.get_param('writesubtitles', False)
                or self.get_param('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    class CommentsDisabled(Exception):
        """Raise in _get_comments if comments are disabled for the video"""

    def extract_comments(self, *args, **kwargs):
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                self.to_screen('Interrupted by user')
            except self.CommentsDisabled:
                return {'comments': None, 'comment_count': None}
            except Exception as e:
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                'comment_count': None if interrupted else comment_count
            }
        return extractor

    def _get_comments(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs/data
        will be dropped. """
        list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
        ret = list(subtitle_list1)
        ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
        return ret

    @classmethod
    def _merge_subtitles(cls, *dicts, target=None):
        """ Merge subtitle dictionaries, language by language. """
        if target is None:
            target = {}
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target

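    # Illustrative sketch (not part of the upstream source): merging subtitle
    # dicts from several manifests of the same video; the `m3u8_url`/`mpd_url`
    # names are hypothetical.
    #
    #   subtitles = {}
    #   fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False)
    #   self._merge_subtitles(subs, target=subtitles)
    #   fmts2, subs2 = self._extract_mpd_formats_and_subtitles(mpd_url, video_id, fatal=False)
    #   self._merge_subtitles(subs2, target=subtitles)
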
    def extract_automatic_captions(self, *args, **kwargs):
        if (self.get_param('writeautomaticsub', False)
                or self.get_param('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @functools.cached_property
    def _cookies_passed(self):
        """Whether cookies have been passed to YoutubeDL"""
        return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None

    def mark_watched(self, *args, **kwargs):
        if not self.get_param('mark_watched', False):
            return
        if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def geo_verification_headers(self):
        headers = {}
        geo_verification_proxy = self.get_param('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
        return headers

    @staticmethod
    def _generic_id(url):
        return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

    def _generic_title(self, url='', webpage='', *, default=None):
        return (self._og_search_title(webpage, default=None)
                or self._html_extract_title(webpage, default=None)
                or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
                or default)

    def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
        if not duration:
            return
        chapter_list = [{
            'start_time': start_function(chapter),
            'title': title_function(chapter),
        } for chapter in chapter_list or []]
        if strict:
            warn = self.report_warning
        else:
            warn = self.write_debug
            chapter_list.sort(key=lambda c: c['start_time'] or 0)

        chapters = [{'start_time': 0}]
        for idx, chapter in enumerate(chapter_list):
            if chapter['start_time'] is None:
                warn(f'Incomplete chapter {idx}')
            elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
                chapters.append(chapter)
            elif chapter not in chapters:
                issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
                         else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
                warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
        return chapters[1:]

    def _extract_chapters_from_description(self, description, duration):
        duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
        sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
        return self._extract_chapters_helper(
            re.findall(sep_re % (duration_re, r'.+?'), description or ''),
            start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
            duration=duration, strict=False) or self._extract_chapters_helper(
            re.findall(sep_re % (r'.+?', duration_re), description or ''),
            start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
            duration=duration, strict=False)

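    # Illustrative sketch (not part of the upstream source): a description with
    # "00:00 Intro" style timestamps can be turned into chapters, provided the
    # total duration is known; `description` and `duration` are hypothetical.
    #
    #   chapters = self._extract_chapters_from_description(description, duration)
    #   # e.g. "00:00 Intro\n02:30 Main part" -> chapters like
    #   # [{'start_time': 0.0, 'title': 'Intro'}, {'start_time': 150.0, 'title': 'Main part'}]
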
    @staticmethod
    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
        all_known = all(map(
            lambda x: x is not None,
            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
        return (
            'private' if is_private
            else 'premium_only' if needs_premium
            else 'subscriber_only' if needs_subscription
            else 'needs_auth' if needs_auth
            else 'unlisted' if is_unlisted
            else 'public' if all_known
            else None)

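    # Illustrative sketch (not part of the upstream source): mapping site flags
    # to the "availability" field of the info dict; `video_data` and its keys
    # are hypothetical.
    #
    #   availability = self._availability(
    #       is_private=video_data.get('private'),
    #       needs_premium=video_data.get('premium_only'),
    #       is_unlisted=video_data.get('unlisted'))
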
    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
        """
        @returns            A list of values for the extractor argument given by "key"
                            or "default" if no such key is present
        @param default      The default value to return when the key is not present (default: [])
        @param casesense    When false, the values are converted to lower case
        """
        ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
        val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
        if val is None:
            return [] if default is NO_DEFAULT else default
        return list(val) if casesense else [x.lower() for x in val]

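    # Illustrative sketch (not part of the upstream source): reading
    # --extractor-args for the current extractor. With a hypothetical IE and key,
    # "--extractor-args 'someie:player_client=web,android'" would make the call
    # below return ['web', 'android'].
    #
    #   player_clients = self._configuration_arg('player_client', ['default'])
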
    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
        if not playlist_id or not video_id:
            return not video_id

        no_playlist = (smuggled_data or {}).get('force_noplaylist')
        if no_playlist is not None:
            return not no_playlist

        video_id = '' if video_id is True else f' {video_id}'
        playlist_id = '' if playlist_id is True else f' {playlist_id}'
        if self.get_param('noplaylist'):
            self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
            return False
        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
        return True

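    # Illustrative sketch (not part of the upstream source): deciding between a
    # playlist and a single video based on --no-playlist. `playlist_id`,
    # `video_id`, the URLs and the IE classes are hypothetical.
    #
    #   if self._yes_playlist(playlist_id, video_id, smuggled_data):
    #       return self.url_result(playlist_url, PlaylistIE, playlist_id)
    #   return self.url_result(video_url, VideoIE, video_id)
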
    def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
        RetryManager.report_retry(
            err, _count or int(fatal), _retries,
            info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
            sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))

    def RetryManager(self, **kwargs):
        return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)

    def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
        display_id = traverse_obj(info_dict, 'display_id', 'id')
        self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
        return self._downloader.get_info_extractor('Generic')._extract_embeds(
            smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)

    @classmethod
    def extract_from_webpage(cls, ydl, url, webpage):
        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
              else ydl.get_info_extractor(cls.ie_key()))
        for info in ie._extract_from_webpage(url, webpage) or []:
            # url = None since we do not want to set (webpage/original)_url
            ydl.add_default_extra_info(info, ie, None)
            yield info

    @classmethod
    def _extract_from_webpage(cls, url, webpage):
        for embed_url in orderedSet(
                cls._extract_embed_urls(url, webpage) or [], lazy=True):
            yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """@returns all the embed urls on the webpage"""
        if '_EMBED_URL_RE' not in cls.__dict__:
            assert isinstance(cls._EMBED_REGEX, (list, tuple))
            for idx, regex in enumerate(cls._EMBED_REGEX):
                assert regex.count('(?P<url>') == 1, \
                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))

        for regex in cls._EMBED_URL_RE:
            for mobj in regex.finditer(webpage):
                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
                if cls._VALID_URL is False or cls.suitable(embed_url):
                    yield embed_url

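    # Illustrative sketch (not part of the upstream source): an embeddable
    # extractor only has to declare _EMBED_REGEX with a single (?P<url>...)
    # group; the generic extractor then discovers embeds via
    # _extract_embed_urls(). "ExamplePlayerIE" and its URLs are hypothetical.
    #
    #   class ExamplePlayerIE(InfoExtractor):
    #       _VALID_URL = r'https?://player\.example\.com/embed/(?P<id>\d+)'
    #       _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/\d+)']
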
    class StopExtraction(Exception):
        pass

    @classmethod
    def _extract_url(cls, webpage):  # TODO: Remove
        """Only for compatibility with some older extractors"""
        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)

    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        if plugin_name:
            mro = inspect.getmro(cls)
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
            cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
            _PLUGIN_OVERRIDES[super_class].append(cls)

        return super().__init_subclass__(**kwargs)

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(f'invalid download number {n} for query "{query}"')
            elif n > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        return self.playlist_result(
            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
            query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY

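# Illustrative sketch (not part of the upstream source): a minimal search
# extractor only needs _SEARCH_KEY and _search_results(); "ExampleSearchIE" and
# its API endpoint are hypothetical.
#
#   class ExampleSearchIE(SearchInfoExtractor):
#       _SEARCH_KEY = 'examplesearch'
#
#       def _search_results(self, query):
#           data = self._download_json(
#               'https://api.example.com/search', query, query={'q': query})
#           for video in data['results']:
#               yield self.url_result(video['url'])
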
class UnsupportedURLIE(InfoExtractor):
    def _real_extract(self, url):
        raise UnsupportedError(url)


_PLUGIN_OVERRIDES = collections.defaultdict(list)