import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media
    parse_m3u8_attributes,
91 """Information Extractor class.
93 Information extractors are the classes that, given a URL, extract
94 information about the video (or videos) the URL refers to. This
95 information includes the real video URL, the video title, author and
96 others. The information is stored in a dictionary which is then
97 passed to the YoutubeDL. The YoutubeDL processes this
98 information possibly downloading the video to the file system, among
99 other possible outcomes.
101 The type field determines the type of the result.
102 By far the most common value (and the default if _type is missing) is
103 "video", which indicates a single video.
105 For a video, the dictionaries must include the following fields:
107 id: Video identifier.
108 title: Video title, unescaped. Set to an empty string if video has
109 no title as opposed to "None" which signifies that the
110 extractor failed to obtain a title
112 Additionally, it must contain either a formats entry or a url one:
114 formats: A list of dictionaries for each format available, ordered
115 from worst to best quality.
118 * url The mandatory URL representing the media:
119 for plain file media - HTTP URL of this file,
121 for HLS - URL of the M3U8 media playlist,
122 for HDS - URL of the F4M manifest,
124 - HTTP URL to plain file media (in case of
126 - URL of the MPD manifest or base URL
127 representing the media if MPD manifest
128 is parsed from a string (in case of
130 for MSS - URL of the ISM manifest.
132 The URL of the manifest file in case of
134 for HLS - URL of the M3U8 master playlist,
135 for HDS - URL of the F4M manifest,
136 for DASH - URL of the MPD manifest,
137 for MSS - URL of the ISM manifest.
138 * manifest_stream_number (For internal use only)
139 The index of the stream in the manifest file
140 * ext Will be calculated from URL if missing
141 * format A human-readable description of the format
142 ("mp4 container with h264/opus").
143 Calculated from the format_id, width, height.
144 and format_note fields if missing.
145 * format_id A short description of the format
146 ("mp4_h264_opus" or "19").
147 Technically optional, but strongly recommended.
148 * format_note Additional info about the format
149 ("3D" or "DASH video")
150 * width Width of the video, if known
151 * height Height of the video, if known
152 * resolution Textual description of width and height
153 * dynamic_range The dynamic range of the video. One of:
154 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
155 * tbr Average bitrate of audio and video in KBit/s
156 * abr Average audio bitrate in KBit/s
157 * acodec Name of the audio codec in use
158 * asr Audio sampling rate in Hertz
159 * audio_channels Number of audio channels
160 * vbr Average video bitrate in KBit/s
162 * vcodec Name of the video codec in use
163 * container Name of the container format
164 * filesize The number of bytes, if known in advance
165 * filesize_approx An estimate for the number of bytes
166 * player_url SWF Player URL (used for rtmpdump).
167 * protocol The protocol that will be used for the actual
168 download, lower-case. One of "http", "https" or
169 one of the protocols defined in downloader.PROTOCOL_MAP
171 Base URL for fragments. Each fragment's path
172 value (if present) will be relative to
174 * fragments A list of fragments of a fragmented media.
175 Each fragment entry must contain either an url
176 or a path. If an url is present it should be
177 considered by a client. Otherwise both path and
178 fragment_base_url must be present. Here is
179 the list of all potential fields:
180 * "url" - fragment's URL
181 * "path" - fragment's path relative to
183 * "duration" (optional, int or float)
184 * "filesize" (optional, int)
185 * is_from_start Is a live format that can be downloaded
186 from the start. Boolean
187 * preference Order number of this format. If this field is
188 present and not None, the formats get sorted
189 by this field, regardless of all other values.
190 -1 for default (order by other properties),
191 -2 or smaller for less than default.
192 < -1000 to hide the format (if there is
193 another one which is strictly better)
194 * language Language code, e.g. "de" or "en-US".
195 * language_preference Is this in the language mentioned in
197 10 if it's what the URL is about,
198 -1 for default (don't know),
199 -10 otherwise, other values reserved for now.
200 * quality Order number of the video quality of this
201 format, irrespective of the file format.
202 -1 for default (order by other properties),
203 -2 or smaller for less than default.
204 * source_preference Order number for this video source
205 (quality takes higher priority)
206 -1 for default (order by other properties),
207 -2 or smaller for less than default.
208 * http_headers A dictionary of additional HTTP headers
209 to add to the request.
210 * stretched_ratio If given and not 1, indicates that the
211 video's pixels are not square.
212 width : height ratio as float.
213 * no_resume The server does not support resuming the
214 (HTTP or RTMP) download. Boolean.
215 * has_drm The format has DRM and cannot be downloaded. Boolean
216 * downloader_options A dictionary of downloader options
217 (For internal use only)
218 * http_chunk_size Chunk size for HTTP downloads
219 * ffmpeg_args Extra arguments for ffmpeg downloader
220 RTMP formats can also have the additional fields: page_url,
221 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
222 rtmp_protocol, rtmp_real_time
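
                    As an illustration only (all values are hypothetical, not
                    taken from any particular site), a single formats entry
                    might look like:

                        {
                            'url': 'https://cdn.example.com/video_720.mp4',
                            'format_id': '720p',
                            'ext': 'mp4',
                            'width': 1280,
                            'height': 720,
                            'vcodec': 'avc1.64001f',
                            'acodec': 'mp4a.40.2',
                            'tbr': 1500,
                            'filesize': 34567890,
                        }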

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}")
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, if given to yt-dlp it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
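
    For illustration only, a minimal single-video result might look like this
    (all values are hypothetical):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://cdn.example.com/video.mp4',
            'ext': 'mp4',
            'uploader': 'Some Uploader',
            'duration': 123.0,
        }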

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"

    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.

    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known.

    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
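
    For illustration only (hypothetical URLs), an extractor might return:

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example.com/embed/abc123',
            'title': 'Title taken from the embedding page',
            'description': 'Description already known to the referring site',
        }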

    Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp and re-define the _real_extract() and
    (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.
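
    As an illustration only (the site, URL pattern and field values below are
    hypothetical), a minimal subclass might look like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }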

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _x_forwarded_for_ip = None
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _NETRC_MACHINE = None

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None
557 """Getter method for _WORKING."""

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        self._initialize_pre_login()
        if self.supports_login():
            username, password = self._get_login_info()
            if username:
                self._perform_login(username, password)
        elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
            self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
        self._real_initialize()

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
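
        For example, an extractor that only learns the allowed countries during
        extraction might call (values are hypothetical):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })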
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.write_debug('Extracting URL: %s' % url)
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            kwargs = {
                'video_id': e.video_id or self.get_temp_id(url),
                'tb': e.traceback or sys.exc_info()[2],
                'expected': e.expected,
            }
            if hasattr(e, 'countries'):
                kwargs['countries'] = e.countries
            raise type(e)(e.orig_msg, **kwargs)
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')
749 """A string for getting the InfoExtractor with get_info_extractor"""
750 return cls
.__name
__[:-2]
754 return cls
.__name
__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, urllib.error.HTTPError)
        if expected_status is None:
            return err.code < 400
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            return update_Request(url_or_request, data=data, headers=headers, query=query)
        if query:
            url_or_request = update_url_query(url_or_request, query)
        return sanitized_Request(url_or_request, data, headers or {})

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None,
                         fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
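
        For example (the URL is hypothetical), to also accept a 404 response
        as a valid page:

            webpage, urlh = self._download_webpage_handle(
                'https://example.com/video/123', video_id, expected_status=404)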
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, str):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.geturl(), video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        else:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.full_url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'transform_source': transform_source,
                'encoding': encoding,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 2
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except http.client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=False):
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
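
        For example (the pattern and the webpage variable are hypothetical):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title', default=None)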
        """
        if string is None:
            mobj = None
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{{\s*(?:{contains_pattern})\s*}})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
        return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        return clean_html(res).strip()

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self.get_param('usenetrc', False):
            try:
                netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
                if os.path.isdir(netrc_file):
                    netrc_file = os.path.join(netrc_file, '.netrc')
                info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available, look in the netrc file using the netrc_machine or _NETRC_MACHINE.
        If there's no info available, return (None, None)
        """

        # Attempt to use provided username and password or .netrc data
        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa:
            return tfa

        return getpass.getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
                       % {'prop': re.escape(prop), 'sep': '(?::|[:-])'})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        prop = variadic(prop)
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, *, fatal=False, **kargs):
        return self._og_search_property('title', html, fatal=fatal, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
        return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        name = variadic(name)
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)
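
    # Illustrative usage inside _real_extract (the webpage variable and meta
    # names below are hypothetical):
    #   title = self._og_search_title(webpage) or self._html_search_meta(
    #       ['twitter:title', 'title'], webpage, 'title', default=None)
    #   thumbnail = self._og_search_thumbnail(webpage)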

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    @staticmethod
    def _rta_search(html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18

        # And then there are the jokers who advertise that they use RTA, but actually don't.
        AGE_LIMIT_MARKERS = [
            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
        ]
        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        if not rating:
            return None
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta(
            'isFamilyFriendly', html, default=None)

        if not family_friendly:
            return None
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
        """Yield all json ld objects in the html"""
        if default is not NO_DEFAULT:
            fatal = False
        for mobj in re.finditer(JSON_LD_RE, html):
            json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
            for json_ld in variadic(json_ld_item):
                if isinstance(json_ld, dict):
                    yield json_ld

    def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
        """Search for a video in any json ld in the html"""
        if default is not NO_DEFAULT:
            fatal = False
        info = self._json_ld(
            list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
            video_id, fatal=fatal, expected_type=expected_type)
        if info:
            return info
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract JSON-LD')
        else:
            self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
            return {}
1463 def _json_ld(self
, json_ld
, video_id
, fatal
=True, expected_type
=None):
1464 if isinstance(json_ld
, str):
1465 json_ld
= self
._parse
_json
(json_ld
, video_id
, fatal
=fatal
)
1469 if not isinstance(json_ld
, (list, tuple, dict)):
1471 if isinstance(json_ld
, dict):
1474 INTERACTION_TYPE_MAP
= {
1475 'CommentAction': 'comment',
1476 'AgreeAction': 'like',
1477 'DisagreeAction': 'dislike',
1478 'LikeAction': 'like',
1479 'DislikeAction': 'dislike',
1480 'ListenAction': 'view',
1481 'WatchAction': 'view',
            'ViewAction': 'view',
        }

        def is_type(e, *expected_types):
            type = variadic(traverse_obj(e, '@type'))
            return any(x in type for x in expected_types)

        def extract_interaction_type(e):
            interaction_type = e.get('interactionType')
            if isinstance(interaction_type, dict):
                interaction_type = interaction_type.get('@type')
            return str_or_none(interaction_type)

        def extract_interaction_statistic(e):
            interaction_statistic = e.get('interactionStatistic')
            if isinstance(interaction_statistic, dict):
                interaction_statistic = [interaction_statistic]
            if not isinstance(interaction_statistic, list):
                return
            for is_e in interaction_statistic:
                if not is_type(is_e, 'InteractionCounter'):
                    continue
                interaction_type = extract_interaction_type(is_e)
                if not interaction_type:
                    continue
                # For interaction count some sites provide string instead of
                # an integer (as per spec) with non digit characters (e.g. ",")
                # so extracting count with more relaxed str_to_int
                interaction_count = str_to_int(is_e.get('userInteractionCount'))
                if interaction_count is None:
                    continue
                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
                if not count_kind:
                    continue
                count_key = '%s_count' % count_kind
                if info.get(count_key) is not None:
                    continue
                info[count_key] = interaction_count

        def extract_chapter_information(e):
            chapters = [{
                'title': part.get('name'),
                'start_time': part.get('startOffset'),
                'end_time': part.get('endOffset'),
            } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
            for idx, (last_c, current_c, next_c) in enumerate(zip(
                    [{'end_time': 0}] + chapters, chapters, chapters[1:])):
                current_c['end_time'] = current_c['end_time'] or next_c['start_time']
                current_c['start_time'] = current_c['start_time'] or last_c['end_time']
                if None in current_c.values():
                    self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                    return
            if chapters:
                chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
                info['chapters'] = chapters

        def extract_video_object(e):
            assert is_type(e, 'VideoObject')
            author = e.get('author')
            info.update({
                'url': url_or_none(e.get('contentUrl')),
                'title': unescapeHTML(e.get('name')),
                'description': unescapeHTML(e.get('description')),
                'thumbnails': [{'url': unescapeHTML(url)}
                               for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                               if url_or_none(url)],
                'duration': parse_duration(e.get('duration')),
                'timestamp': unified_timestamp(e.get('uploadDate')),
                # author can be an instance of 'Organization' or 'Person' types.
                # both types can have 'name' property (inherited from 'Thing' type). [1]
                # however some websites are using 'Text' type instead.
                # 1. https://schema.org/VideoObject
                'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
                'filesize': int_or_none(float_or_none(e.get('contentSize'))),
                'tbr': int_or_none(e.get('bitrate')),
                'width': int_or_none(e.get('width')),
                'height': int_or_none(e.get('height')),
                'view_count': int_or_none(e.get('interactionCount')),
            })
            extract_interaction_statistic(e)
            extract_chapter_information(e)

        def traverse_json_ld(json_ld, at_top_level=True):
            for e in variadic(json_ld):
                if at_top_level and '@context' not in e:
                    continue
                if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                    traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
                    continue
                if expected_type is not None and not is_type(e, expected_type):
                    continue
                rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
                if rating is not None:
                    info['average_rating'] = rating
                if is_type(e, 'TVEpisode', 'Episode'):
                    episode_name = unescapeHTML(e.get('name'))
                    info.update({
                        'episode': episode_name,
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    if not info.get('title') and episode_name:
                        info['title'] = episode_name
                    part_of_season = e.get('partOfSeason')
                    if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                        info.update({
                            'season': unescapeHTML(part_of_season.get('name')),
                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
                        })
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif is_type(e, 'Movie'):
                    info.update({
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('dateCreated')),
                    })
                elif is_type(e, 'Article', 'NewsArticle'):
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                    })
                    if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                        extract_video_object(e['video'][0])
                    elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                        extract_video_object(e['subjectOf'][0])
                elif is_type(e, 'VideoObject'):
                    extract_video_object(e)
                    if expected_type is None:
                        continue
                    else:
                        break
                video = e.get('video')
                if is_type(video, 'VideoObject'):
                    extract_video_object(video)
                if expected_type is None:
                    continue
                else:
                    break

        traverse_json_ld(json_ld)
        return filter_dict(info)
    def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
        return self._parse_json(
            self._search_regex(
                r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
                webpage, 'next.js data', fatal=fatal, **kw),
            video_id, transform_source=transform_source, fatal=fatal)
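    # Illustrative sketch (assumption, not from the original file): a typical
    # call site for _search_nextjs_data() in a site extractor, drilling into the
    # parsed __NEXT_DATA__ payload with traverse_obj(). The key path
    # ('props', 'pageProps', ...) is hypothetical and differs per site:
    #
    #   nextjs = self._search_nextjs_data(webpage, video_id, fatal=False) or {}
    #   video_data = traverse_obj(nextjs, ('props', 'pageProps', 'video')) or {}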
    def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
        rectx = re.escape(context_name)
        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
        js, arg_keys, arg_vals = self._search_regex(
            (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), fatal=fatal)

        args = dict(zip(arg_keys.split(','), arg_vals.split(',')))

        for key, val in args.items():
            if val in ('undefined', 'void 0'):
                args[key] = 'null'

        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
        return traverse_obj(ret, traverse) or {}
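    # Illustrative sketch (assumption): _search_nuxt_data() targets pages that
    # assign `window.__NUXT__ = (function(a, b, ...){return {...}}(...))`.
    # A hypothetical extractor could pull the state and traverse it:
    #
    #   nuxt = self._search_nuxt_data(webpage, video_id, fatal=False)
    #   title = traverse_obj(nuxt, ('video', 'title'))  # site-specific path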
    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not input:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)
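    # Illustrative sketch (assumption): hidden <input> fields are typically
    # collected and re-submitted when a site gates the video behind a form,
    # e.g. an age or consent check on a hypothetical confirmation page:
    #
    #   data = self._form_hidden_inputs('confirm-form', webpage)
    #   data['confirm'] = '1'
    #   webpage = self._download_webpage(
    #       url, video_id, 'Confirming age', data=urlencode_postdata(data))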
    class FormatSort:
        regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

        default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
                   'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
                   'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
        ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                        'height', 'width', 'proto', 'vext', 'abr', 'aext',
                        'fps', 'fs_approx', 'source', 'id')

        settings = {
            'vcodec': {'type': 'ordered', 'regex': True,
                       'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
            'acodec': {'type': 'ordered', 'regex': True,
                       'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
            'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                    'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
            'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                      'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
            'vext': {'type': 'ordered', 'field': 'video_ext',
                     'order': ('mp4', 'webm', 'flv', '', 'none'),
                     'order_free': ('webm', 'mp4', 'flv', '', 'none')},
            'aext': {'type': 'ordered', 'field': 'audio_ext',
                     'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
                     'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
            'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
            'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                           'field': ('vcodec', 'acodec'),
                           'function': lambda it: int(any(v != 'none' for v in it))},
            'ie_pref': {'priority': True, 'type': 'extractor'},
            'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
            'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
            'quality': {'convert': 'float', 'default': -1},
            'filesize': {'convert': 'bytes'},
            'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
            'id': {'convert': 'string', 'field': 'format_id'},
            'height': {'convert': 'float_none'},
            'width': {'convert': 'float_none'},
            'fps': {'convert': 'float_none'},
            'channels': {'convert': 'float_none', 'field': 'audio_channels'},
            'tbr': {'convert': 'float_none'},
            'vbr': {'convert': 'float_none'},
            'abr': {'convert': 'float_none'},
            'asr': {'convert': 'float_none'},
            'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

            'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
            'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
            'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
            'ext': {'type': 'combined', 'field': ('vext', 'aext')},
            'res': {'type': 'multiple', 'field': ('height', 'width'),
                    'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

            # Actual field names
            'format_id': {'type': 'alias', 'field': 'id'},
            'preference': {'type': 'alias', 'field': 'ie_pref'},
            'language_preference': {'type': 'alias', 'field': 'lang'},
            'source_preference': {'type': 'alias', 'field': 'source'},
            'protocol': {'type': 'alias', 'field': 'proto'},
            'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
            'audio_channels': {'type': 'alias', 'field': 'channels'},

            # Deprecated
            'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
            'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
            'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
            'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
            'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
            'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
            'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
            'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
            'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
            'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
            'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
            'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
            'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
            'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
            'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
            'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
            'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        }

        def __init__(self, ie, field_preference):
            self._order = []
            self.ydl = ie._downloader
            self.evaluate_params(self.ydl.params, field_preference)
            if ie.get_param('verbose'):
                self.print_verbose_info(self.ydl.write_debug)
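        # Illustrative note (assumption, not from the original file): the sort
        # order merged by evaluate_params() is built from strings like the ones
        # users pass via --format-sort. For example 'res:1080,+size' means
        # "prefer resolutions up to 1080p, then prefer the *smallest* size";
        # 'res' and 'size' are keys of the settings table above, ':1080' is a
        # limit and the leading '+' reverses that field, e.g.
        #
        #   InfoExtractor.FormatSort(ie, ['res:1080', '+size'])  # hypothetical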
        def _get_field_setting(self, field, key):
            if field not in self.settings:
                if key in ('forced', 'priority'):
                    return False
                self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                            'deprecated and may be removed in a future version')
                self.settings[field] = {}
            propObj = self.settings[field]
            if key not in propObj:
                type = propObj.get('type')
                if key == 'field':
                    default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
                elif key == 'convert':
                    default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
                else:
                    default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
                propObj[key] = default
            return propObj[key]

        def _resolve_field_value(self, field, value, convertNone=False):
            if value is None:
                if not convertNone:
                    return None
            else:
                value = value.lower()
            conversion = self._get_field_setting(field, 'convert')
            if conversion == 'ignore':
                return None
            if conversion == 'string':
                return value
            elif conversion == 'float_none':
                return float_or_none(value)
            elif conversion == 'bytes':
                return FileDownloader.parse_bytes(value)
            elif conversion == 'order':
                order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
                use_regex = self._get_field_setting(field, 'regex')
                list_length = len(order_list)
                empty_pos = order_list.index('') if '' in order_list else list_length + 1
                if use_regex and value is not None:
                    for i, regex in enumerate(order_list):
                        if regex and re.match(regex, value):
                            return list_length - i
                    return list_length - empty_pos  # not in list
                else:  # not regex or value = None
                    return list_length - (order_list.index(value) if value in order_list else empty_pos)
            else:
                if value.isnumeric():
                    return float(value)
                else:
                    self.settings[field]['convert'] = 'string'
                    return self._resolve_field_value(field, value, convertNone)
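        # Illustrative sketch (assumption): _resolve_field_value() turns the
        # textual limit of a sort key into something comparable, e.g.
        #
        #   self._resolve_field_value('filesize', '500M')  # -> bytes via parse_bytes
        #   self._resolve_field_value('vcodec', 'vp9')     # -> position in the 'order' list
        #
        # Unknown string values fall through to the numeric/string fallback at
        # the bottom of the method.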
        def evaluate_params(self, params, sort_extractor):
            self._use_free_order = params.get('prefer_free_formats', False)
            self._sort_user = params.get('format_sort', [])
            self._sort_extractor = sort_extractor

            def add_item(field, reverse, closest, limit_text):
                field = field.lower()
                if field in self._order:
                    return
                self._order.append(field)
                limit = self._resolve_field_value(field, limit_text)
                data = {
                    'reverse': reverse,
                    'closest': False if limit is None else closest,
                    'limit_text': limit_text,
                    'limit': limit}
                if field in self.settings:
                    self.settings[field].update(data)
                else:
                    self.settings[field] = data

            sort_list = (
                tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
                + (tuple() if params.get('format_sort_force', False)
                   else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
                + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

            for item in sort_list:
                match = re.match(self.regex, item)
                if match is None:
                    raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
                field = match.group('field')
                if field is None:
                    continue
                if self._get_field_setting(field, 'type') == 'alias':
                    alias, field = field, self._get_field_setting(field, 'field')
                    if self._get_field_setting(alias, 'deprecated'):
                        self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                    f'be removed in a future version. Please use {field} instead')
                reverse = match.group('reverse') is not None
                closest = match.group('separator') == '~'
                limit_text = match.group('limit')

                has_limit = limit_text is not None
                has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
                has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

                fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
                limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
                limit_count = len(limits)
                for (i, f) in enumerate(fields):
                    add_item(f, reverse, closest,
                             limits[i] if i < limit_count
                             else limits[0] if has_limit and not has_multiple_limits
                             else None)

        def print_verbose_info(self, write_debug):
            if self._sort_user:
                write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
            if self._sort_extractor:
                write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
            write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
                '+' if self._get_field_setting(field, 'reverse') else '', field,
                '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                              self._get_field_setting(field, 'limit_text'),
                              self._get_field_setting(field, 'limit'))
                if self._get_field_setting(field, 'limit_text') is not None else '')
                for field in self._order if self._get_field_setting(field, 'visible')]))
        def _calculate_field_preference_from_value(self, format, field, type, value):
            reverse = self._get_field_setting(field, 'reverse')
            closest = self._get_field_setting(field, 'closest')
            limit = self._get_field_setting(field, 'limit')

            if type == 'extractor':
                maximum = self._get_field_setting(field, 'max')
                if value is None or (maximum is not None and value >= maximum):
                    value = -1
            elif type == 'boolean':
                in_list = self._get_field_setting(field, 'in_list')
                not_in_list = self._get_field_setting(field, 'not_in_list')
                value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
            elif type == 'ordered':
                value = self._resolve_field_value(field, value, True)

            # try to convert to number
            val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
            is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
            if is_num:
                value = val_num

            return ((-10, 0) if value is None
                    else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                    else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                    else (0, value, 0) if not reverse and (limit is None or value <= limit)
                    else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                    else (-1, value, 0))

        def _calculate_field_preference(self, format, field):
            type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
            get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
            if type == 'multiple':
                type = 'field'  # Only 'field' is allowed in multiple for now
                actual_fields = self._get_field_setting(field, 'field')

                value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
            else:
                value = get_value(field)

            return self._calculate_field_preference_from_value(format, field, type, value)

        def calculate_preference(self, format):
            # Determine missing protocol
            if not format.get('protocol'):
                format['protocol'] = determine_protocol(format)

            # Determine missing ext
            if not format.get('ext') and 'url' in format:
                format['ext'] = determine_ext(format['url'])
            if format.get('vcodec') == 'none':
                format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
                format['video_ext'] = 'none'
            else:
                format['video_ext'] = format['ext']
                format['audio_ext'] = 'none'
            # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
            #    format['preference'] = -1000

            # Determine missing bitrates
            if format.get('tbr') is None:
                if format.get('vbr') is not None and format.get('abr') is not None:
                    format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
            else:
                if format.get('vcodec') != 'none' and format.get('vbr') is None:
                    format['vbr'] = format.get('tbr') - format.get('abr', 0)
                if format.get('acodec') != 'none' and format.get('abr') is None:
                    format['abr'] = format.get('tbr') - format.get('vbr', 0)

            return tuple(self._calculate_field_preference(format, field) for field in self._order)
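        # Illustrative sketch (assumption): calculate_preference() returns a
        # tuple of per-field tuples, so Python's lexicographic tuple ordering
        # does the actual sorting and the "best" format ends up last. Given an
        # already initialised extractor `ie` (hypothetical):
        #
        #   sorter = InfoExtractor.FormatSort(ie, [])
        #   key_a = sorter.calculate_preference({'format_id': 'a', 'ext': 'mp4', 'url': '...'})
        #   key_b = sorter.calculate_preference({'format_id': 'b', 'ext': 'webm', 'url': '...'})
        #   # formats.sort(key=sorter.calculate_preference) then orders worst to best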
    def _sort_formats(self, formats, field_preference=[]):
        if not formats:
            return
        formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            self.to_screen(
                '%s: %s URL is invalid, skipping: %s'
                % (video_id, item, error_to_compat_str(e.cause)))
            return False

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self.get_param('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        scheme = scheme or self.http_scheme()
        assert scheme.endswith(':')
        return sanitize_url(url, scheme=scheme[:-1])

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
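    # Illustrative sketch (assumption): _proto_relative_url() pairs with
    # http_scheme() to fix scheme-relative URLs scraped from pages, e.g.
    #
    #   self._proto_relative_url('//cdn.example.com/video.mp4')
    #   # -> 'https://cdn.example.com/video.mp4' (or http: with --prefer-insecure)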
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
        res = self._download_xml_handle(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal, data=data, headers=headers, query=query)
        if res is False:
            return []

        manifest, urlh = res
        manifest_url = urlh.geturl()

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
            return []

        # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = join_nonempty(f4m_id, tbr or i)
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
                'quality': quality,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        return {
            'format_id': join_nonempty(m3u8_id, 'meta'),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'quality': quality,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }
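    # Illustrative sketch (assumption): a site extractor usually hands an HDS
    # (f4m) manifest URL straight to _extract_f4m_formats(); recursion into
    # nested stream-level manifests and the m3u8 fallback are handled above.
    # The URL below is hypothetical:
    #
    #   formats = self._extract_f4m_formats(
    #       'https://example.com/manifest.f4m', video_id, f4m_id='hds', fatal=False)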
    def _report_ignoring_subs(self, name):
        self.report_warning(bug_reports_message(
            f'Ignoring subtitle tracks found in the {name} manifest; '
            'if any subtitle tracks are missing,'
        ), only_once=True)

    def _extract_m3u8_formats(self, *args, **kwargs):
        fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('HLS')
        return fmts

    def _extract_m3u8_formats_and_subtitles(
            self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, note=None,
            errnote=None, fatal=True, live=False, data=None, headers={},
            query={}):

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note='Downloading m3u8 information' if note is None else note,
            errnote='Failed to download m3u8 information' if errnote is None else errnote,
            fatal=fatal, data=data, headers=headers, query=query)

        if res is False:
            return [], {}

        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        return self._parse_m3u8_formats_and_subtitles(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)
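    # Illustrative sketch (assumption): the usual HLS entry point for site
    # extractors; the URL below is hypothetical:
    #
    #   fmts, subs = self._extract_m3u8_formats_and_subtitles(
    #       'https://example.com/master.m3u8', video_id, 'mp4',
    #       entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
    #   formats.extend(fmts)
    #   self._merge_subtitles(subs, target=subtitles)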
    def _parse_m3u8_formats_and_subtitles(
            self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
            preference=None, quality=None, m3u8_id=None, live=False, note=None,
            errnote=None, fatal=True, data=None, headers={}, query={},
            video_id=None):
        formats, subtitles = [], {}

        has_drm = re.search('|'.join([
            r'#EXT-X-FAXS-CM:',  # Adobe Flash Access
            r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://',  # Apple FairPlay
        ]), m3u8_doc)

        def format_url(url):
            return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

        if self.get_param('hls_split_discontinuity', False):
            def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
                if not m3u8_doc:
                    if not manifest_url:
                        return []
                    m3u8_doc = self._download_webpage(
                        manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                        note=False, errnote='Failed to download m3u8 playlist information')
                    if m3u8_doc is False:
                        return []
                return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
        else:
            def _extract_m3u8_playlist_indices(*args, **kwargs):
                return [None]

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            formats = [{
                'format_id': join_nonempty(m3u8_id, idx),
                'format_index': idx,
                'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
            } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

            return formats, subtitles

        groups = {}
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
            if media_type == 'SUBTITLES':
                # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
                # EXT-X-MEDIA tag if the media type is SUBTITLES.
                # However, lack of URI has been spotted in the wild.
                # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
                if not media.get('URI'):
                    return
                url = format_url(media['URI'])
                sub_info = {
                    'url': url,
                    'ext': determine_ext(url),
                }
                if sub_info['ext'] == 'm3u8':
                    # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                    # files may contain is WebVTT:
                    # <https://tools.ietf.org/html/rfc8216#section-3.1>
                    sub_info['ext'] = 'vtt'
                    sub_info['protocol'] = 'm3u8_native'
                lang = media.get('LANGUAGE') or 'und'
                subtitles.setdefault(lang, []).append(sub_info)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                manifest_url = format_url(media_url)
                formats.extend({
                    'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                    'format_note': name,
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'vcodec': 'none' if media_type == 'AUDIO' else None,
                } for idx in _extract_m3u8_playlist_indices(manifest_url))

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())

                for idx in _extract_m3u8_playlist_indices(manifest_url):
                    format_id = [m3u8_id, None, idx]
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    # format_id intact.
                    if not live:
                        stream_name = build_stream_name()
                        format_id[1] = stream_name or '%d' % (tbr or len(formats))
                    f = {
                        'format_id': join_nonempty(*format_id),
                        'format_index': idx,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'tbr': tbr,
                        'ext': ext,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                        'quality': quality,
                    }
                    resolution = last_stream_inf.get('RESOLUTION')
                    if resolution:
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        if mobj:
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform
                    mobj = re.search(
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    if mobj:
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                        f.update({
                            'vbr': vbr,
                            'abr': abr,
                        })
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    f.update(codecs)
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected. E.g. [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            # the same GROUP-ID
                            f['acodec'] = 'none'
                    if not f.get('ext'):
                        f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                    formats.append(f)

                    # for DailyMotion
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                    if progressive_uri:
                        http_f = f.copy()
                        del http_f['manifest_url']
                        http_f.update({
                            'format_id': f['format_id'].replace('hls-', 'http-'),
                            'protocol': 'http',
                            'url': progressive_uri,
                        })
                        formats.append(http_f)

                last_stream_inf = {}
        return formats, subtitles
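    # Illustrative note (assumption): the parser above distinguishes a media
    # playlist (has #EXT-X-TARGETDURATION, returned as a single format) from a
    # master playlist (EXT-X-STREAM-INF / EXT-X-MEDIA tags, expanded into one
    # format per variant and rendition). A minimal hypothetical master playlist
    # it would expand into two variants:
    #
    #   #EXTM3U
    #   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
    #   low/index.m3u8
    #   #EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720
    #   high/index.m3u8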
    def _extract_m3u8_vod_duration(
            self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):

        m3u8_vod = self._download_webpage(
            m3u8_vod_url, video_id,
            note='Downloading m3u8 VOD manifest' if note is None else note,
            errnote='Failed to download VOD manifest' if errnote is None else errnote,
            fatal=False, data=data, headers=headers, query=query)

        return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)

    def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
        if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
            return None

        return int(sum(
            float(line[len('#EXTINF:'):].split(',')[0])
            for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None

    @staticmethod
    def _xpath_ns(path, namespace=None):
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
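    # Illustrative sketch (assumption): _xpath_ns() simply prefixes each path
    # component with the document namespace so ElementTree lookups work, e.g.
    #
    #   self._xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
    #   # -> './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta'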
2437 def _extract_smil_formats_and_subtitles(self
, smil_url
, video_id
, fatal
=True, f4m_params
=None, transform_source
=None):
2438 res
= self
._download
_smil
(smil_url
, video_id
, fatal
=fatal
, transform_source
=transform_source
)
2444 smil_url
= urlh
.geturl()
2446 namespace
= self
._parse
_smil
_namespace
(smil
)
2448 fmts
= self
._parse
_smil
_formats
(
2449 smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
)
2450 subs
= self
._parse
_smil
_subtitles
(
2451 smil
, namespace
=namespace
)
2455 def _extract_smil_formats(self
, *args
, **kwargs
):
2456 fmts
, subs
= self
._extract
_smil
_formats
_and
_subtitles
(*args
, **kwargs
)
2458 self
._report
_ignoring
_subs
('SMIL')
2461 def _extract_smil_info(self
, smil_url
, video_id
, fatal
=True, f4m_params
=None):
2462 res
= self
._download
_smil
(smil_url
, video_id
, fatal
=fatal
)
2467 smil_url
= urlh
.geturl()
2469 return self
._parse
_smil
(smil
, smil_url
, video_id
, f4m_params
=f4m_params
)
2471 def _download_smil(self
, smil_url
, video_id
, fatal
=True, transform_source
=None):
2472 return self
._download
_xml
_handle
(
2473 smil_url
, video_id
, 'Downloading SMIL file',
2474 'Unable to download SMIL file', fatal
=fatal
, transform_source
=transform_source
)
2476 def _parse_smil(self
, smil
, smil_url
, video_id
, f4m_params
=None):
2477 namespace
= self
._parse
_smil
_namespace
(smil
)
2479 formats
= self
._parse
_smil
_formats
(
2480 smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
)
2481 subtitles
= self
._parse
_smil
_subtitles
(smil
, namespace
=namespace
)
2483 video_id
= os
.path
.splitext(url_basename(smil_url
))[0]
2487 for meta
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)):
2488 name
= meta
.attrib
.get('name')
2489 content
= meta
.attrib
.get('content')
2490 if not name
or not content
:
2492 if not title
and name
== 'title':
2494 elif not description
and name
in ('description', 'abstract'):
2495 description
= content
2496 elif not upload_date
and name
== 'date':
2497 upload_date
= unified_strdate(content
)
2500 'id': image
.get('type'),
2501 'url': image
.get('src'),
2502 'width': int_or_none(image
.get('width')),
2503 'height': int_or_none(image
.get('height')),
2504 } for image
in smil
.findall(self
._xpath
_ns
('.//image', namespace
)) if image
.get('src')]
2508 'title': title
or video_id
,
2509 'description': description
,
2510 'upload_date': upload_date
,
2511 'thumbnails': thumbnails
,
2513 'subtitles': subtitles
,
2516 def _parse_smil_namespace(self
, smil
):
2517 return self
._search
_regex
(
2518 r
'(?i)^{([^}]+)?}smil$', smil
.tag
, 'namespace', default
=None)
2520 def _parse_smil_formats(self
, smil
, smil_url
, video_id
, namespace
=None, f4m_params
=None, transform_rtmp_url
=None):
2522 for meta
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)):
2523 b
= meta
.get('base') or meta
.get('httpBase')
2535 media
= smil
.findall(self
._xpath
_ns
('.//video', namespace
)) + smil
.findall(self
._xpath
_ns
('.//audio', namespace
))
2536 for medium
in media
:
2537 src
= medium
.get('src')
2538 if not src
or src
in srcs
:
2542 bitrate
= float_or_none(medium
.get('system-bitrate') or medium
.get('systemBitrate'), 1000)
2543 filesize
= int_or_none(medium
.get('size') or medium
.get('fileSize'))
2544 width
= int_or_none(medium
.get('width'))
2545 height
= int_or_none(medium
.get('height'))
2546 proto
= medium
.get('proto')
2547 ext
= medium
.get('ext')
2548 src_ext
= determine_ext(src
)
2549 streamer
= medium
.get('streamer') or base
2551 if proto
== 'rtmp' or streamer
.startswith('rtmp'):
2557 'format_id': 'rtmp-%d' % (rtmp_count
if bitrate
is None else bitrate
),
2559 'filesize': filesize
,
2563 if transform_rtmp_url
:
2564 streamer
, src
= transform_rtmp_url(streamer
, src
)
2565 formats
[-1].update({
2571 src_url
= src
if src
.startswith('http') else urllib
.parse
.urljoin(base
, src
)
2572 src_url
= src_url
.strip()
2574 if proto
== 'm3u8' or src_ext
== 'm3u8':
2575 m3u8_formats
= self
._extract
_m
3u8_formats
(
2576 src_url
, video_id
, ext
or 'mp4', m3u8_id
='hls', fatal
=False)
2577 if len(m3u8_formats
) == 1:
2579 m3u8_formats
[0].update({
2580 'format_id': 'hls-%d' % (m3u8_count
if bitrate
is None else bitrate
),
2585 formats
.extend(m3u8_formats
)
2586 elif src_ext
== 'f4m':
2591 'plugin': 'flowplayer-3.2.0.1',
2593 f4m_url
+= '&' if '?' in f4m_url
else '?'
2594 f4m_url
+= urllib
.parse
.urlencode(f4m_params
)
2595 formats
.extend(self
._extract
_f
4m
_formats
(f4m_url
, video_id
, f4m_id
='hds', fatal
=False))
2596 elif src_ext
== 'mpd':
2597 formats
.extend(self
._extract
_mpd
_formats
(
2598 src_url
, video_id
, mpd_id
='dash', fatal
=False))
2599 elif re
.search(r
'\.ism/[Mm]anifest', src_url
):
2600 formats
.extend(self
._extract
_ism
_formats
(
2601 src_url
, video_id
, ism_id
='mss', fatal
=False))
2602 elif src_url
.startswith('http') and self
._is
_valid
_url
(src
, video_id
):
2606 'ext': ext
or src_ext
or 'flv',
2607 'format_id': 'http-%d' % (bitrate
or http_count
),
2609 'filesize': filesize
,
2614 for medium
in smil
.findall(self
._xpath
_ns
('.//imagestream', namespace
)):
2615 src
= medium
.get('src')
2616 if not src
or src
in srcs
:
2622 'format_id': 'imagestream-%d' % (imgs_count
),
2624 'ext': mimetype2ext(medium
.get('type')),
2627 'width': int_or_none(medium
.get('width')),
2628 'height': int_or_none(medium
.get('height')),
2629 'format_note': 'SMIL storyboards',
2634 def _parse_smil_subtitles(self
, smil
, namespace
=None, subtitles_lang
='en'):
2637 for num
, textstream
in enumerate(smil
.findall(self
._xpath
_ns
('.//textstream', namespace
))):
2638 src
= textstream
.get('src')
2639 if not src
or src
in urls
:
2642 ext
= textstream
.get('ext') or mimetype2ext(textstream
.get('type')) or determine_ext(src
)
2643 lang
= textstream
.get('systemLanguage') or textstream
.get('systemLanguageName') or textstream
.get('lang') or subtitles_lang
2644 subtitles
.setdefault(lang
, []).append({
2650 def _extract_xspf_playlist(self
, xspf_url
, playlist_id
, fatal
=True):
2651 res
= self
._download
_xml
_handle
(
2652 xspf_url
, playlist_id
, 'Downloading xpsf playlist',
2653 'Unable to download xspf manifest', fatal
=fatal
)
2658 xspf_url
= urlh
.geturl()
2660 return self
._parse
_xspf
(
2661 xspf
, playlist_id
, xspf_url
=xspf_url
,
2662 xspf_base_url
=base_url(xspf_url
))
2664 def _parse_xspf(self
, xspf_doc
, playlist_id
, xspf_url
=None, xspf_base_url
=None):
2666 'xspf': 'http://xspf.org/ns/0/',
2667 's1': 'http://static.streamone.nl/player/ns/0',
2671 for track
in xspf_doc
.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP
)):
2673 track
, xpath_with_ns('./xspf:title', NS_MAP
), 'title', default
=playlist_id
)
2674 description
= xpath_text(
2675 track
, xpath_with_ns('./xspf:annotation', NS_MAP
), 'description')
2676 thumbnail
= xpath_text(
2677 track
, xpath_with_ns('./xspf:image', NS_MAP
), 'thumbnail')
2678 duration
= float_or_none(
2679 xpath_text(track
, xpath_with_ns('./xspf:duration', NS_MAP
), 'duration'), 1000)
2682 for location
in track
.findall(xpath_with_ns('./xspf:location', NS_MAP
)):
2683 format_url
= urljoin(xspf_base_url
, location
.text
)
2688 'manifest_url': xspf_url
,
2689 'format_id': location
.get(xpath_with_ns('s1:label', NS_MAP
)),
2690 'width': int_or_none(location
.get(xpath_with_ns('s1:width', NS_MAP
))),
2691 'height': int_or_none(location
.get(xpath_with_ns('s1:height', NS_MAP
))),
2693 self
._sort
_formats
(formats
)
2698 'description': description
,
2699 'thumbnail': thumbnail
,
2700 'duration': duration
,
2705 def _extract_mpd_formats(self
, *args
, **kwargs
):
2706 fmts
, subs
= self
._extract
_mpd
_formats
_and
_subtitles
(*args
, **kwargs
)
2708 self
._report
_ignoring
_subs
('DASH')
2711 def _extract_mpd_formats_and_subtitles(
2712 self
, mpd_url
, video_id
, mpd_id
=None, note
=None, errnote
=None,
2713 fatal
=True, data
=None, headers
={}, query={}
):
2714 res
= self
._download
_xml
_handle
(
2716 note
='Downloading MPD manifest' if note
is None else note
,
2717 errnote
='Failed to download MPD manifest' if errnote
is None else errnote
,
2718 fatal
=fatal
, data
=data
, headers
=headers
, query
=query
)
2725 # We could have been redirected to a new url when we retrieved our mpd file.
2726 mpd_url
= urlh
.geturl()
2727 mpd_base_url
= base_url(mpd_url
)
2729 return self
._parse
_mpd
_formats
_and
_subtitles
(
2730 mpd_doc
, mpd_id
, mpd_base_url
, mpd_url
)
2732 def _parse_mpd_formats(self
, *args
, **kwargs
):
2733 fmts
, subs
= self
._parse
_mpd
_formats
_and
_subtitles
(*args
, **kwargs
)
2735 self
._report
_ignoring
_subs
('DASH')
2738 def _parse_mpd_formats_and_subtitles(
2739 self
, mpd_doc
, mpd_id
=None, mpd_base_url
='', mpd_url
=None):
2741 Parse formats from MPD manifest.
2743 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2744 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2745 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2747 if not self
.get_param('dynamic_mpd', True):
2748 if mpd_doc
.get('type') == 'dynamic':
2751 namespace
= self
._search
_regex
(r
'(?i)^{([^}]+)?}MPD$', mpd_doc
.tag
, 'namespace', default
=None)
2754 return self
._xpath
_ns
(path
, namespace
)
2756 def is_drm_protected(element
):
2757 return element
.find(_add_ns('ContentProtection')) is not None
2759 def extract_multisegment_info(element
, ms_parent_info
):
2760 ms_info
= ms_parent_info
.copy()
2762 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2763 # common attributes and elements. We will only extract relevant
2765 def extract_common(source
):
2766 segment_timeline
= source
.find(_add_ns('SegmentTimeline'))
2767 if segment_timeline
is not None:
2768 s_e
= segment_timeline
.findall(_add_ns('S'))
2770 ms_info
['total_number'] = 0
2773 r
= int(s
.get('r', 0))
2774 ms_info
['total_number'] += 1 + r
2775 ms_info
['s'].append({
2776 't': int(s
.get('t', 0)),
2777 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2778 'd': int(s
.attrib
['d']),
2781 start_number
= source
.get('startNumber')
2783 ms_info
['start_number'] = int(start_number
)
2784 timescale
= source
.get('timescale')
2786 ms_info
['timescale'] = int(timescale
)
2787 segment_duration
= source
.get('duration')
2788 if segment_duration
:
2789 ms_info
['segment_duration'] = float(segment_duration
)
2791 def extract_Initialization(source
):
2792 initialization
= source
.find(_add_ns('Initialization'))
2793 if initialization
is not None:
2794 ms_info
['initialization_url'] = initialization
.attrib
['sourceURL']
2796 segment_list
= element
.find(_add_ns('SegmentList'))
2797 if segment_list
is not None:
2798 extract_common(segment_list
)
2799 extract_Initialization(segment_list
)
2800 segment_urls_e
= segment_list
.findall(_add_ns('SegmentURL'))
2802 ms_info
['segment_urls'] = [segment
.attrib
['media'] for segment
in segment_urls_e
]
2804 segment_template
= element
.find(_add_ns('SegmentTemplate'))
2805 if segment_template
is not None:
2806 extract_common(segment_template
)
2807 media
= segment_template
.get('media')
2809 ms_info
['media'] = media
2810 initialization
= segment_template
.get('initialization')
2812 ms_info
['initialization'] = initialization
2814 extract_Initialization(segment_template
)
2817 mpd_duration
= parse_duration(mpd_doc
.get('mediaPresentationDuration'))
2818 formats
, subtitles
= [], {}
2819 stream_numbers
= collections
.defaultdict(int)
2820 for period
in mpd_doc
.findall(_add_ns('Period')):
2821 period_duration
= parse_duration(period
.get('duration')) or mpd_duration
2822 period_ms_info
= extract_multisegment_info(period
, {
2826 for adaptation_set
in period
.findall(_add_ns('AdaptationSet')):
2827 adaption_set_ms_info
= extract_multisegment_info(adaptation_set
, period_ms_info
)
2828 for representation
in adaptation_set
.findall(_add_ns('Representation')):
2829 representation_attrib
= adaptation_set
.attrib
.copy()
2830 representation_attrib
.update(representation
.attrib
)
2831 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2832 mime_type
= representation_attrib
['mimeType']
2833 content_type
= representation_attrib
.get('contentType', mime_type
.split('/')[0])
2835 codec_str
= representation_attrib
.get('codecs', '')
2836 # Some kind of binary subtitle found in some youtube livestreams
2837 if mime_type
== 'application/x-rawcc':
2838 codecs
= {'scodec': codec_str}
2840 codecs
= parse_codecs(codec_str
)
2841 if content_type
not in ('video', 'audio', 'text'):
2842 if mime_type
== 'image/jpeg':
2843 content_type
= mime_type
2844 elif codecs
.get('vcodec', 'none') != 'none':
2845 content_type
= 'video'
2846 elif codecs
.get('acodec', 'none') != 'none':
2847 content_type
= 'audio'
2848 elif codecs
.get('scodec', 'none') != 'none':
2849 content_type
= 'text'
2850 elif mimetype2ext(mime_type
) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2851 content_type
= 'text'
2853 self
.report_warning('Unknown MIME type %s in DASH manifest' % mime_type
)
2857 for element
in (representation
, adaptation_set
, period
, mpd_doc
):
2858 base_url_e
= element
.find(_add_ns('BaseURL'))
2859 if try_call(lambda: base_url_e
.text
) is not None:
2860 base_url
= base_url_e
.text
+ base_url
2861 if re
.match(r
'^https?://', base_url
):
2863 if mpd_base_url
and base_url
.startswith('/'):
2864 base_url
= urllib
.parse
.urljoin(mpd_base_url
, base_url
)
2865 elif mpd_base_url
and not re
.match(r
'^https?://', base_url
):
2866 if not mpd_base_url
.endswith('/'):
2868 base_url
= mpd_base_url
+ base_url
2869 representation_id
= representation_attrib
.get('id')
2870 lang
= representation_attrib
.get('lang')
2871 url_el
= representation
.find(_add_ns('BaseURL'))
2872 filesize
= int_or_none(url_el
.attrib
.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el
is not None else None)
2873 bandwidth
= int_or_none(representation_attrib
.get('bandwidth'))
2874 if representation_id
is not None:
2875 format_id
= representation_id
2877 format_id
= content_type
2879 format_id
= mpd_id
+ '-' + format_id
2880 if content_type
in ('video', 'audio'):
2882 'format_id': format_id
,
2883 'manifest_url': mpd_url
,
2884 'ext': mimetype2ext(mime_type
),
2885 'width': int_or_none(representation_attrib
.get('width')),
2886 'height': int_or_none(representation_attrib
.get('height')),
2887 'tbr': float_or_none(bandwidth
, 1000),
2888 'asr': int_or_none(representation_attrib
.get('audioSamplingRate')),
2889 'fps': int_or_none(representation_attrib
.get('frameRate')),
2890 'language': lang
if lang
not in ('mul', 'und', 'zxx', 'mis') else None,
2891 'format_note': 'DASH %s' % content_type
,
2892 'filesize': filesize
,
2893 'container': mimetype2ext(mime_type
) + '_dash',
2896 elif content_type
== 'text':
2898 'ext': mimetype2ext(mime_type
),
2899 'manifest_url': mpd_url
,
2900 'filesize': filesize
,
2902 elif content_type
== 'image/jpeg':
2903 # See test case in VikiIE
2904 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2906 'format_id': format_id
,
2908 'manifest_url': mpd_url
,
2909 'format_note': 'DASH storyboards (jpeg)',
2913 if is_drm_protected(adaptation_set
) or is_drm_protected(representation
):
2915 representation_ms_info
= extract_multisegment_info(representation
, adaption_set_ms_info
)
2917 def prepare_template(template_name
, identifiers
):
2918 tmpl
= representation_ms_info
[template_name
]
2919 if representation_id
is not None:
2920 tmpl
= tmpl
.replace('$RepresentationID$', representation_id
)
2921 # First of, % characters outside $...$ templates
2922 # must be escaped by doubling for proper processing
2923 # by % operator string formatting used further (see
2924 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2930 in_template
= not in_template
2931 elif c
== '%' and not in_template
:
2933 # Next, $...$ templates are translated to their
2934 # %(...) counterparts to be used with % operator
2935 t
= re
.sub(r
'\$(%s)\$' % '|'.join(identifiers
), r
'%(\1)d', t
)
2936 t
= re
.sub(r
'\$(%s)%%([^$]+)\$' % '|'.join(identifiers
), r
'%(\1)\2', t
)
2937 t
.replace('$$', '$')
2940 # @initialization is a regular template like @media one
2941 # so it should be handled just the same way (see
2942 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2943 if 'initialization' in representation_ms_info
:
2944 initialization_template
= prepare_template(
2946 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2947 # $Time$ shall not be included for @initialization thus
2948 # only $Bandwidth$ remains
2950 representation_ms_info
['initialization_url'] = initialization_template
% {
2951 'Bandwidth': bandwidth
,
2954 def location_key(location
):
2955 return 'url' if re
.match(r
'^https?://', location
) else 'path'
2957 if 'segment_urls' not in representation_ms_info
and 'media' in representation_ms_info
:
2959 media_template
= prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2960 media_location_key
= location_key(media_template
)
2962 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2963 # can't be used at the same time
2964 if '%(Number' in media_template
and 's' not in representation_ms_info
:
2965 segment_duration
= None
2966 if 'total_number' not in representation_ms_info
and 'segment_duration' in representation_ms_info
:
2967 segment_duration
= float_or_none(representation_ms_info
['segment_duration'], representation_ms_info
['timescale'])
2968 representation_ms_info
['total_number'] = int(math
.ceil(
2969 float_or_none(period_duration
, segment_duration
, default
=0)))
2970 representation_ms_info
['fragments'] = [{
2971 media_location_key
: media_template
% {
2972 'Number': segment_number
,
2973 'Bandwidth': bandwidth
,
2975 'duration': segment_duration
,
2976 } for segment_number
in range(
2977 representation_ms_info
['start_number'],
2978 representation_ms_info
['total_number'] + representation_ms_info
['start_number'])]
2980 # $Number*$ or $Time$ in media template with S list available
2981 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2982 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2983 representation_ms_info
['fragments'] = []
2986 segment_number
= representation_ms_info
['start_number']
2988 def add_segment_url():
2989 segment_url
= media_template
% {
2990 'Time': segment_time
,
2991 'Bandwidth': bandwidth
,
2992 'Number': segment_number
,
2994 representation_ms_info
['fragments'].append({
2995 media_location_key
: segment_url
,
2996 'duration': float_or_none(segment_d
, representation_ms_info
['timescale']),
2999 for num
, s
in enumerate(representation_ms_info
['s']):
3000 segment_time
= s
.get('t') or segment_time
3004 for r
in range(s
.get('r', 0)):
3005 segment_time
+= segment_d
3008 segment_time
+= segment_d
3009 elif 'segment_urls' in representation_ms_info
and 's' in representation_ms_info
:
3010 # No media template,
3011 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
3012 # or any YouTube dashsegments video
3015 timescale
= representation_ms_info
['timescale']
3016 for s
in representation_ms_info
['s']:
3017 duration
= float_or_none(s
['d'], timescale
)
3018 for r
in range(s
.get('r', 0) + 1):
3019 segment_uri
= representation_ms_info
['segment_urls'][segment_index
]
3021 location_key(segment_uri
): segment_uri
,
3022 'duration': duration
,
3025 representation_ms_info
['fragments'] = fragments
3026 elif 'segment_urls' in representation_ms_info
:
3027 # Segment URLs with no SegmentTimeline
3028 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
3029 # https://github.com/ytdl-org/youtube-dl/pull/14844
3031 segment_duration
= float_or_none(
3032 representation_ms_info
['segment_duration'],
3033 representation_ms_info
['timescale']) if 'segment_duration' in representation_ms_info
else None
3034 for segment_url
in representation_ms_info
['segment_urls']:
3036 location_key(segment_url
): segment_url
,
3038 if segment_duration
:
3039 fragment
['duration'] = segment_duration
3040 fragments
.append(fragment
)
3041 representation_ms_info
['fragments'] = fragments
3042 # If there is a fragments key available then we correctly recognized fragmented media.
3043 # Otherwise we will assume unfragmented media with direct access. Technically, such
3044 # assumption is not necessarily correct since we may simply have no support for
3045 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
3046 if 'fragments' in representation_ms_info
:
3048 # NB: mpd_url may be empty when MPD manifest is parsed from a string
3049 'url': mpd_url
or base_url
,
3050 'fragment_base_url': base_url
,
3052 'protocol': 'http_dash_segments' if mime_type
!= 'image/jpeg' else 'mhtml',
3054 if 'initialization_url' in representation_ms_info
:
3055 initialization_url
= representation_ms_info
['initialization_url']
3056 if not f
.get('url'):
3057 f
['url'] = initialization_url
3058 f
['fragments'].append({location_key(initialization_url): initialization_url}
)
3059 f
['fragments'].extend(representation_ms_info
['fragments'])
3060 if not period_duration
:
3061 period_duration
= try_get(
3062 representation_ms_info
,
3063 lambda r
: sum(frag
['duration'] for frag
in r
['fragments']), float)
3065 # Assuming direct URL to unfragmented media.
3067 if content_type
in ('video', 'audio', 'image/jpeg'):
3068 f
['manifest_stream_number'] = stream_numbers
[f
['url']]
3069 stream_numbers
[f
['url']] += 1
3071 elif content_type
== 'text':
3072 subtitles
.setdefault(lang
or 'und', []).append(f
)
3074 return formats
, subtitles
3076 def _extract_ism_formats(self
, *args
, **kwargs
):
3077 fmts
, subs
= self
._extract
_ism
_formats
_and
_subtitles
(*args
, **kwargs
)
3079 self
._report
_ignoring
_subs
('ISM')
3082 def _extract_ism_formats_and_subtitles(self
, ism_url
, video_id
, ism_id
=None, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query={}
):
3083 res
= self
._download
_xml
_handle
(
3085 note
='Downloading ISM manifest' if note
is None else note
,
3086 errnote
='Failed to download ISM manifest' if errnote
is None else errnote
,
3087 fatal
=fatal
, data
=data
, headers
=headers
, query
=query
)
3094 return self
._parse
_ism
_formats
_and
_subtitles
(ism_doc
, urlh
.geturl(), ism_id
)
    def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        if ism_doc.get('IsLive') == 'TRUE':
            return [], {}

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        subtitles = {}
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio', 'text'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            stream_language = stream.get('Language', 'und')
            for track in stream.findall('QualityLevel'):
                fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        try:
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                if stream_type == 'text':
                    subtitles.setdefault(stream_language, []).append({
                        'ext': 'ismt',
                        'protocol': 'ism',
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'fragments': fragments,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                        },
                    })
                elif stream_type in ('video', 'audio'):
                    formats.append({
                        'format_id': join_nonempty(ism_id, stream_name, tbr),
                        'url': ism_url,
                        'manifest_url': ism_url,
                        'ext': 'ismv' if stream_type == 'video' else 'isma',
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'asr': sampling_rate,
                        'vcodec': 'none' if stream_type == 'audio' else fourcc,
                        'acodec': 'none' if stream_type == 'video' else fourcc,
                        'protocol': 'ism',
                        'fragments': fragments,
                        'has_drm': ism_doc.find('Protection') is not None,
                        '_download_params': {
                            'stream_type': stream_type,
                            'duration': duration,
                            'timescale': stream_timescale,
                            'width': width or 0,
                            'height': height or 0,
                            'fourcc': fourcc,
                            'language': stream_language,
                            'codec_private_data': track.get('CodecPrivateData'),
                            'sampling_rate': sampling_rate,
                            'channels': int_or_none(track.get('Channels', 2)),
                            'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                            'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                        },
                    })
        return formats, subtitles
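    # Illustrative note: each <c> element in a Smooth Streaming manifest carries
    # t (start time), d (duration) and r (repeat count) in stream timescale units, so
    # e.g. <c t="0" d="20000000" r="3"/> with TimeScale="10000000" expands to three
    # 2-second fragments starting at 0s, 2s and 4s - which is what the fragment loop
    # above computes.
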
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
        def absolute_url(item_url):
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        def _media_formats(src, cur_media_type, type_info=None):
            type_info = type_info or {}
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, quality=quality, fatal=False)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                    'ext': ext,
                }]
            return is_plain_url, formats

        entries = []
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979,
            # e.g. http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
            if src:
                f = parse_content_type(media_attributes.get('type'))
                _, formats = _media_formats(src, media_type, f)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    s_attr = extract_attributes(source_tag)
                    # data-video-src and data-src are non standard but seen
                    # several times in the wild
                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                    if not src:
                        continue
                    f = parse_content_type(s_attr.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                    if is_plain_url:
                        # width, height, res, label and title attributes are
                        # all not standard but seen several times in the wild
                        labels = [
                            s_attr.get(lbl)
                            for lbl in ('label', 'title')
                            if str_or_none(s_attr.get(lbl))
                        ]
                        width = int_or_none(s_attr.get('width'))
                        height = (int_or_none(s_attr.get('height'))
                                  or int_or_none(s_attr.get('res')))
                        if not width or not height:
                            for lbl in labels:
                                resolution = parse_resolution(lbl)
                                if not resolution:
                                    continue
                                width = width or resolution.get('width')
                                height = height or resolution.get('height')
                        for lbl in labels:
                            tbr = parse_bitrate(lbl)
                            if tbr:
                                break
                        else:
                            tbr = None
                        f.update({
                            'width': width,
                            'height': height,
                            'tbr': tbr,
                            'format_id': s_attr.get('label') or s_attr.get('title'),
                        })
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = strip_or_none(track_attributes.get('src'))
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries
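    # Illustrative note: the parser above handles markup along the lines of
    #   <video poster="/thumb.jpg">
    #     <source src="video-720.mp4" type="video/mp4" label="720p">
    #     <track kind="subtitles" srclang="en" src="/subs/en.vtt">
    #   </video>
    # (a hypothetical snippet), as well as the amp-video/dl8-video variants matched
    # by _MEDIA_TAG_NAME_RE.
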
    def _extract_akamai_formats(self, *args, **kwargs):
        fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
        if subs:
            self._report_ignoring_subs('akamai')
        return fmts
    def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
        signed = 'hdnea=' in manifest_url
        if not signed:
            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
            manifest_url = re.sub(
                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
                '', manifest_url).strip('?')

        formats = []
        subtitles = {}

        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)

        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                i = 0
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            http_f = f.copy()
                            del http_f['manifest_url']
                            url = re.sub(
                                REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
                            http_f.update({
                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                'url': url,
                                'protocol': protocol,
                            })
                            formats.append(http_f)
                        i += 1

        return formats, subtitles
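    # Illustrative note: the HDS and HLS manifest URLs are derived from each other by
    # swapping the /i/ and /z/ path components and the manifest name, e.g. (hypothetical URL)
    #   https://example-vh.akamaihd.net/i/path/video_,500,1000,.mp4.csmil/master.m3u8
    #   -> https://example-vh.akamaihd.net/z/path/video_,500,1000,.mp4.csmil/manifest.f4m
    # Progressive HTTP URLs are rebuilt from the comma-separated quality list in the
    # .csmil segment when an 'http' host is supplied and the URL is not signed (hdnea=).
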
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        query = urllib.parse.urlparse(url).query
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        mobj = re.search(
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        formats = []

        def manifest_url(manifest):
            m_url = f'{http_base_url}/{manifest}'
            if query:
                m_url += '?%s' % query
            return m_url

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': f'{protocol}:{url_base}',
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
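    # Illustrative note: for a Wowza base URL such as (hypothetical)
    #   https://example.com/vod/mp4:sample.mp4
    # the helper above probes playlist.m3u8, manifest.f4m and manifest.mpd under the
    # http(s) base, and otherwise falls back to rtmp://... / rtsp://... variants of the
    # same path unless those protocols are listed in skip_protocols.
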
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        mobj = re.search(
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            webpage)
        if mobj:
            try:
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 video_id=video_id,
                                                 transform_source=transform_source)
            except ExtractorError:
                pass
            else:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
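    # Illustrative note: the regex above targets embeds of the form (hypothetical snippet)
    #   jwplayer("player").setup({"playlist": [{"sources": [{"file": "video.m3u8"}]}]});
    # and hands the setup(...) argument to _parse_json with js_to_json, since the options
    # object is usually JavaScript rather than strict JSON.
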
    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
        jwplayer_data = self._find_jwplayer_data(
            webpage, video_id, transform_source=js_to_json)
        return self._parse_jwplayer_data(
            jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                        continue
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, str):
                        continue
                    if track_kind.lower() not in ('captions', 'subtitles'):
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entry = {
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            }
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(
                    r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                entry.update({
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                })
            else:
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        urls = []
        formats = []
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
                continue
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            if not source_url or source_url in urls:
                continue
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats
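    # Illustrative note: a JWPlayer 'sources' list such as (hypothetical)
    #   [{'file': 'video.m3u8', 'type': 'hls'},
    #    {'file': 'video-720.mp4', 'label': '720p', 'bitrate': 2000}]
    # yields the expanded HLS formats for the first entry and a single progressive
    # format, with height/tbr derived from 'label' and 'bitrate', for the second.
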
    def _live_title(self, name):
        self._downloader.deprecation_warning(
            'yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
        return name
    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = f'Failed to extract {name}: Could not parse value {v!r}'
            if fatal:
                raise ExtractorError(msg)
            else:
                self.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = f'Failed to extract {name}: Could not parse value {v!r}'
            if fatal:
                raise ExtractorError(msg)
            else:
                self.report_warning(msg)
        return res
    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                    path='/', secure=False, discard=False, rest={}, **kwargs):
        cookie = http.cookiejar.Cookie(
            0, name, value, port, port is not None, domain, True,
            domain.startswith('.'), path, True, secure, expire_time,
            discard, None, None, rest)
        self.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a http.cookies.SimpleCookie with the cookies for the url """
        return LenientSimpleCookie(self._downloader._calc_cookies(url))
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than the second (new). However, per RFC 6265 the newer cookie should
        be the one stored in the cookie jar, and that is what actually happens.
        We work around this issue by manually resetting the cookie to the
        first one.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
                continue
            cookies = cookies.encode('iso-8859-1').decode('utf-8')
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
            if cookie_value:
                value, domain = cookie_value.groups()
                self._set_cookie(domain, cookie, value)
                break
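    # Illustrative note: this handles responses that send two headers for the same name,
    # e.g. (hypothetical)
    #   Set-Cookie: remixlang=0; Domain=.vk.com; ...
    #   Set-Cookie: remixlang=3; Domain=.vk.com; ...
    # where the site expects the first value to win, so the cookie is re-set manually
    # from the first matching header.
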
    @classmethod
    def get_testcases(cls, include_onlymatching=False):
        t = getattr(cls, '_TEST', None)
        if t:
            assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
            tests = [t]
        else:
            tests = getattr(cls, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = cls.ie_key()
            yield t

    @classmethod
    def get_webpage_testcases(cls):
        tests = getattr(cls, '_WEBPAGE_TESTS', [])
        for t in tests:
            t['name'] = cls.ie_key()
        return tests

    @classproperty
    def age_limit(cls):
        """Get age limit from the testcases"""
        return max(traverse_obj(
            (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
            (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
    @classmethod
    def is_suitable(cls, age_limit):
        """Test whether the extractor is generally suitable for the given age limit"""
        return not age_restricted(cls.age_limit, age_limit)

    @classmethod
    def description(cls, *, markdown=True, search_examples=None):
        """Description of the extractor"""
        desc = ''
        if cls._NETRC_MACHINE:
            if markdown:
                desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
            else:
                desc += f' [{cls._NETRC_MACHINE}]'
        if cls.IE_DESC is False:
            desc += ' [HIDDEN]'
        elif cls.IE_DESC:
            desc += f' {cls.IE_DESC}'
        if cls.SEARCH_KEY:
            desc += f'; "{cls.SEARCH_KEY}:" prefix'
            if search_examples:
                _COUNTS = ('', '5', '10', 'all')
                desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
        if not cls.working():
            desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'

        name = f' - **{cls.IE_NAME}**' if markdown else cls.IE_NAME
        return f'{name}:{desc}' if desc else name
    def extract_subtitles(self, *args, **kwargs):
        if (self.get_param('writesubtitles', False)
                or self.get_param('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def extract_comments(self, *args, **kwargs):
        if not self.get_param('getcomments'):
            return None
        generator = self._get_comments(*args, **kwargs)

        def extractor():
            comments = []
            interrupted = True
            try:
                while True:
                    comments.append(next(generator))
            except StopIteration:
                interrupted = False
            except KeyboardInterrupt:
                self.to_screen('Interrupted by user')
            except Exception as e:
                if self.get_param('ignoreerrors') is not True:
                    raise
                self._downloader.report_error(e)
            comment_count = len(comments)
            self.to_screen(f'Extracted {comment_count} comments')
            return {
                'comments': comments,
                'comment_count': None if interrupted else comment_count,
            }
        return extractor

    def _get_comments(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')
    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs/data
        will be dropped. """
        list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
        ret = list(subtitle_list1)
        ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
        return ret

    @classmethod
    def _merge_subtitles(cls, *dicts, target=None):
        """ Merge subtitle dictionaries, language by language. """
        if target is None:
            target = {}
        for d in dicts:
            for lang, subs in d.items():
                target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
        return target

    def extract_automatic_captions(self, *args, **kwargs):
        if (self.get_param('writeautomaticsub', False)
                or self.get_param('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')
    @functools.cached_property
    def _cookies_passed(self):
        """Whether cookies have been passed to YoutubeDL"""
        return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None

    def mark_watched(self, *args, **kwargs):
        if not self.get_param('mark_watched', False):
            return
        if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        headers = {}
        geo_verification_proxy = self.get_param('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
        return headers

    @staticmethod
    def _generic_id(url):
        return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

    @staticmethod
    def _generic_title(url):
        return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
    @staticmethod
    def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
        all_known = all(map(
            lambda x: x is not None,
            (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
        return (
            'private' if is_private
            else 'premium_only' if needs_premium
            else 'subscriber_only' if needs_subscription
            else 'needs_auth' if needs_auth
            else 'unlisted' if is_unlisted
            else 'public' if all_known
            else None)
    def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
        """
        @returns            A list of values for the extractor argument given by "key"
                            or "default" if no such key is present
        @param default      The default value to return when the key is not present (default: [])
        @param casesense    When false, the values are converted to lower case
        """
        val = traverse_obj(
            self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
        if val is None:
            return [] if default is NO_DEFAULT else default
        return list(val) if casesense else [x.lower() for x in val]
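    # Illustrative note: extractor arguments are supplied on the command line roughly as
    #   --extractor-args "youtube:player_client=android,web"
    # so a hypothetical self._configuration_arg('player_client') inside the youtube
    # extractor would return ['android', 'web'], and the documented default ([]) when
    # the key is absent.
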
    def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
        if not playlist_id or not video_id:
            return not video_id

        no_playlist = (smuggled_data or {}).get('force_noplaylist')
        if no_playlist is not None:
            return not no_playlist

        video_id = '' if video_id is True else f' {video_id}'
        playlist_id = '' if playlist_id is True else f' {playlist_id}'
        if self.get_param('noplaylist'):
            self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
            return False
        self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
        return True
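    # Illustrative note: a typical call site looks roughly like (hypothetical)
    #   if self._yes_playlist(playlist_id, video_id, smuggled_data):
    #       return self._extract_playlist(playlist_id)
    #   return self._extract_video(video_id)
    # i.e. --no-playlist (or smuggled force_noplaylist) selects the single-video branch.
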
    def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
        RetryManager.report_retry(
            err, _count or int(fatal), _retries,
            info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
            sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))

    def RetryManager(self, **kwargs):
        return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
    @classmethod
    def extract_from_webpage(cls, ydl, url, webpage):
        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
              else ydl.get_info_extractor(cls.ie_key()))
        for info in ie._extract_from_webpage(url, webpage) or []:
            # url = None since we do not want to set (webpage/original)_url
            ydl.add_default_extra_info(info, ie, None)
            yield info

    @classmethod
    def _extract_from_webpage(cls, url, webpage):
        for embed_url in orderedSet(
                cls._extract_embed_urls(url, webpage) or [], lazy=True):
            yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        """@returns all the embed urls on the webpage"""
        if '_EMBED_URL_RE' not in cls.__dict__:
            assert isinstance(cls._EMBED_REGEX, (list, tuple))
            for idx, regex in enumerate(cls._EMBED_REGEX):
                assert regex.count('(?P<url>') == 1, \
                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))

        for regex in cls._EMBED_URL_RE:
            for mobj in regex.finditer(webpage):
                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
                if cls._VALID_URL is False or cls.suitable(embed_url):
                    yield embed_url

    class StopExtraction(Exception):
        pass

    @classmethod
    def _extract_url(cls, webpage):  # TODO: Remove
        """Only for compatibility with some older extractors"""
        return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
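    # Illustrative note: a conforming _EMBED_REGEX entry must expose exactly one named
    # 'url' group, e.g. (hypothetical)
    #   _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/[^"\']+)']
    # which _extract_embed_urls compiles once per class and matches against the webpage.
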
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        if plugin_name:
            mro = inspect.getmro(cls)
            super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
            cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
            while getattr(super_class, '__wrapped__', None):
                super_class = super_class.__wrapped__
            setattr(sys.modules[super_class.__module__], super_class.__name__, cls)

        return super().__init_subclass__(**kwargs)

class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
    """

    _MAX_RESULTS = float('inf')

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(f'invalid download number {n} for query "{query}"')
            elif n > self._MAX_RESULTS:
                self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        return self.playlist_result(
            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
            query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
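    # Illustrative note: a minimal subclass (hypothetical) only needs the search key and
    # a result generator:
    #   class ExampleSearchIE(SearchInfoExtractor):
    #       _SEARCH_KEY = 'examplesearch'
    #
    #       def _search_results(self, query):
    #           yield from (self.url_result(url) for url in self._fetch_result_urls(query))
    # making queries like "examplesearch5:kittens" or "examplesearchall:kittens" downloadable.

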
class UnsupportedURLIE(InfoExtractor):
    _VALID_URL = '.*'
    _ENABLED = False
    IE_DESC = False

    def _real_extract(self, url):
        raise UnsupportedError(url)