]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/common.py
Add new field `aspect_ratio`
[yt-dlp.git] / yt_dlp / extractor / common.py
1 import base64
2 import collections
3 import getpass
4 import hashlib
5 import http.client
6 import http.cookiejar
7 import http.cookies
8 import inspect
9 import itertools
10 import json
11 import math
12 import netrc
13 import os
14 import random
15 import re
16 import sys
17 import time
18 import types
19 import urllib.parse
20 import urllib.request
21 import xml.etree.ElementTree
22
23 from ..compat import functools # isort: split
24 from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
25 from ..cookies import LenientSimpleCookie
26 from ..downloader import FileDownloader
27 from ..downloader.f4m import get_base_url, remove_encrypted_media
28 from ..utils import (
29 IDENTITY,
30 JSON_LD_RE,
31 NO_DEFAULT,
32 ExtractorError,
33 GeoRestrictedError,
34 GeoUtils,
35 LenientJSONDecoder,
36 RegexNotFoundError,
37 RetryManager,
38 UnsupportedError,
39 age_restricted,
40 base_url,
41 bug_reports_message,
42 classproperty,
43 clean_html,
44 determine_ext,
45 determine_protocol,
46 dict_get,
47 encode_data_uri,
48 error_to_compat_str,
49 extract_attributes,
50 filter_dict,
51 fix_xml_ampersands,
52 float_or_none,
53 format_field,
54 int_or_none,
55 join_nonempty,
56 js_to_json,
57 mimetype2ext,
58 network_exceptions,
59 orderedSet,
60 parse_bitrate,
61 parse_codecs,
62 parse_duration,
63 parse_iso8601,
64 parse_m3u8_attributes,
65 parse_resolution,
66 sanitize_filename,
67 sanitize_url,
68 sanitized_Request,
69 smuggle_url,
70 str_or_none,
71 str_to_int,
72 strip_or_none,
73 traverse_obj,
74 try_call,
75 try_get,
76 unescapeHTML,
77 unified_strdate,
78 unified_timestamp,
79 update_Request,
80 update_url_query,
81 url_basename,
82 url_or_none,
83 urljoin,
84 variadic,
85 xpath_element,
86 xpath_text,
87 xpath_with_ns,
88 )
89
90
91 class InfoExtractor:
92 """Information Extractor class.
93
94 Information extractors are the classes that, given a URL, extract
95 information about the video (or videos) the URL refers to. This
96 information includes the real video URL, the video title, author and
97 others. The information is stored in a dictionary which is then
98 passed to the YoutubeDL. The YoutubeDL processes this
99 information possibly downloading the video to the file system, among
100 other possible outcomes.
101
102 The type field determines the type of the result.
103 By far the most common value (and the default if _type is missing) is
104 "video", which indicates a single video.
105
106 For a video, the dictionaries must include the following fields:
107
108 id: Video identifier.
109 title: Video title, unescaped. Set to an empty string if video has
110 no title as opposed to "None" which signifies that the
111 extractor failed to obtain a title
112
113 Additionally, it must contain either a formats entry or a url one:
114
115 formats: A list of dictionaries for each format available, ordered
116 from worst to best quality.
117
118 Potential fields:
119 * url The mandatory URL representing the media:
120 for plain file media - HTTP URL of this file,
121 for RTMP - RTMP URL,
122 for HLS - URL of the M3U8 media playlist,
123 for HDS - URL of the F4M manifest,
124 for DASH
125 - HTTP URL to plain file media (in case of
126 unfragmented media)
127 - URL of the MPD manifest or base URL
128 representing the media if MPD manifest
129 is parsed from a string (in case of
130 fragmented media)
131 for MSS - URL of the ISM manifest.
132 * manifest_url
133 The URL of the manifest file in case of
134 fragmented media:
135 for HLS - URL of the M3U8 master playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH - URL of the MPD manifest,
138 for MSS - URL of the ISM manifest.
139 * manifest_stream_number (For internal use only)
140 The index of the stream in the manifest file
141 * ext Will be calculated from URL if missing
142 * format A human-readable description of the format
143 ("mp4 container with h264/opus").
144 Calculated from the format_id, width, height,
145 and format_note fields if missing.
146 * format_id A short description of the format
147 ("mp4_h264_opus" or "19").
148 Technically optional, but strongly recommended.
149 * format_note Additional info about the format
150 ("3D" or "DASH video")
151 * width Width of the video, if known
152 * height Height of the video, if known
153 * aspect_ratio Aspect ratio of the video, if known
154 Automatically calculated from width and height
155 * resolution Textual description of width and height
156 Automatically calculated from width and height
157 * dynamic_range The dynamic range of the video. One of:
158 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
159 * tbr Average bitrate of audio and video in KBit/s
160 * abr Average audio bitrate in KBit/s
161 * acodec Name of the audio codec in use
162 * asr Audio sampling rate in Hertz
163 * audio_channels Number of audio channels
164 * vbr Average video bitrate in KBit/s
165 * fps Frame rate
166 * vcodec Name of the video codec in use
167 * container Name of the container format
168 * filesize The number of bytes, if known in advance
169 * filesize_approx An estimate for the number of bytes
170 * player_url SWF Player URL (used for rtmpdump).
171 * protocol The protocol that will be used for the actual
172 download, lower-case. One of "http", "https" or
173 one of the protocols defined in downloader.PROTOCOL_MAP
174 * fragment_base_url
175 Base URL for fragments. Each fragment's path
176 value (if present) will be relative to
177 this URL.
178 * fragments A list of fragments of a fragmented media.
179 Each fragment entry must contain either an url
180 or a path. If an url is present it should be
181 considered by a client. Otherwise both path and
182 fragment_base_url must be present. Here is
183 the list of all potential fields:
184 * "url" - fragment's URL
185 * "path" - fragment's path relative to
186 fragment_base_url
187 * "duration" (optional, int or float)
188 * "filesize" (optional, int)
189 * is_from_start Is a live format that can be downloaded
190 from the start. Boolean
191 * preference Order number of this format. If this field is
192 present and not None, the formats get sorted
193 by this field, regardless of all other values.
194 -1 for default (order by other properties),
195 -2 or smaller for less than default.
196 < -1000 to hide the format (if there is
197 another one which is strictly better)
198 * language Language code, e.g. "de" or "en-US".
199 * language_preference Is this in the language mentioned in
200 the URL?
201 10 if it's what the URL is about,
202 -1 for default (don't know),
203 -10 otherwise, other values reserved for now.
204 * quality Order number of the video quality of this
205 format, irrespective of the file format.
206 -1 for default (order by other properties),
207 -2 or smaller for less than default.
208 * source_preference Order number for this video source
209 (quality takes higher priority)
210 -1 for default (order by other properties),
211 -2 or smaller for less than default.
212 * http_headers A dictionary of additional HTTP headers
213 to add to the request.
214 * stretched_ratio If given and not 1, indicates that the
215 video's pixels are not square.
216 width : height ratio as float.
217 * no_resume The server does not support resuming the
218 (HTTP or RTMP) download. Boolean.
219 * has_drm The format has DRM and cannot be downloaded. Boolean
220 * downloader_options A dictionary of downloader options
221 (For internal use only)
222 * http_chunk_size Chunk size for HTTP downloads
223 * ffmpeg_args Extra arguments for ffmpeg downloader
224 RTMP formats can also have the additional fields: page_url,
225 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
226 rtmp_protocol, rtmp_real_time
227
228 url: Final video URL.
229 ext: Video filename extension.
230 format: The video format, defaults to ext (used for --get-format)
231 player_url: SWF Player URL (used for rtmpdump).
232
233 The following fields are optional:
234
235 direct: True if a direct video file was given (must only be set by GenericIE)
236 alt_title: A secondary title of the video.
237 display_id An alternative identifier for the video, not necessarily
238 unique, but available before title. Typically, id is
239 something like "4234987", title "Dancing naked mole rats",
240 and display_id "dancing-naked-mole-rats"
241 thumbnails: A list of dictionaries, with the following entries:
242 * "id" (optional, string) - Thumbnail format ID
243 * "url"
244 * "preference" (optional, int) - quality of the image
245 * "width" (optional, int)
246 * "height" (optional, int)
247 * "resolution" (optional, string "{width}x{height}",
248 deprecated)
249 * "filesize" (optional, int)
250 * "http_headers" (dict) - HTTP headers for the request
251 thumbnail: Full URL to a video thumbnail image.
252 description: Full video description.
253 uploader: Full name of the video uploader.
254 license: License name the video is licensed under.
255 creator: The creator of the video.
256 timestamp: UNIX timestamp of the moment the video was uploaded
257 upload_date: Video upload date in UTC (YYYYMMDD).
258 If not explicitly set, calculated from timestamp
259 release_timestamp: UNIX timestamp of the moment the video was released.
260 If it is not clear whether to use timestamp or this, use the former
261 release_date: The date (YYYYMMDD) when the video was released in UTC.
262 If not explicitly set, calculated from release_timestamp
263 modified_timestamp: UNIX timestamp of the moment the video was last modified.
264 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
265 If not explicitly set, calculated from modified_timestamp
266 uploader_id: Nickname or id of the video uploader.
267 uploader_url: Full URL to a personal webpage of the video uploader.
268 channel: Full name of the channel the video is uploaded on.
269 Note that channel fields may or may not repeat uploader
270 fields. This depends on a particular extractor.
271 channel_id: Id of the channel.
272 channel_url: Full URL to a channel webpage.
273 channel_follower_count: Number of followers of the channel.
274 location: Physical location where the video was filmed.
275 subtitles: The available subtitles as a dictionary in the format
276 {tag: subformats}. "tag" is usually a language code, and
277 "subformats" is a list sorted from lower to higher
278 preference, each element is a dictionary with the "ext"
279 entry and one of:
280 * "data": The subtitles file contents
281 * "url": A URL pointing to the subtitles file
282 It can optionally also have:
283 * "name": Name or description of the subtitles
284 * "http_headers": A dictionary of additional HTTP headers
285 to add to the request.
286 "ext" will be calculated from URL if missing
287 automatic_captions: Like 'subtitles'; contains automatically generated
288 captions instead of normal subtitles
289 duration: Length of the video in seconds, as an integer or float.
290 view_count: How many users have watched the video on the platform.
291 concurrent_view_count: How many users are currently watching the video on the platform.
292 like_count: Number of positive ratings of the video
293 dislike_count: Number of negative ratings of the video
294 repost_count: Number of reposts of the video
295 average_rating: Average rating given by users, the scale used depends on the webpage
296 comment_count: Number of comments on the video
297 comments: A list of comments, each with one or more of the following
298 properties (all but one of text or html optional):
299 * "author" - human-readable name of the comment author
300 * "author_id" - user ID of the comment author
301 * "author_thumbnail" - The thumbnail of the comment author
302 * "id" - Comment ID
303 * "html" - Comment as HTML
304 * "text" - Plain text of the comment
305 * "timestamp" - UNIX timestamp of comment
306 * "parent" - ID of the comment this one is replying to.
307 Set to "root" to indicate that this is a
308 comment to the original video.
309 * "like_count" - Number of positive ratings of the comment
310 * "dislike_count" - Number of negative ratings of the comment
311 * "is_favorited" - Whether the comment is marked as
312 favorite by the video uploader
313 * "author_is_uploader" - Whether the comment is made by
314 the video uploader
315 age_limit: Age restriction for the video, as an integer (years)
316 webpage_url: The URL to the video webpage, if given to yt-dlp it
317 should allow to get the same result again. (It will be set
318 by YoutubeDL if it's missing)
319 categories: A list of categories that the video falls in, for example
320 ["Sports", "Berlin"]
321 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
322 cast: A list of the video cast
323 is_live: True, False, or None (=unknown). Whether this video is a
324 live stream that goes on instead of a fixed-length video.
325 was_live: True, False, or None (=unknown). Whether this video was
326 originally a live stream.
327 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
328 or 'post_live' (was live, but VOD is not yet processed)
329 If absent, automatically set from is_live, was_live
330 start_time: Time in seconds where the reproduction should start, as
331 specified in the URL.
332 end_time: Time in seconds where the reproduction should end, as
333 specified in the URL.
334 chapters: A list of dictionaries, with the following entries:
335 * "start_time" - The start time of the chapter in seconds
336 * "end_time" - The end time of the chapter in seconds
337 * "title" (optional, string)
338 playable_in_embed: Whether this video is allowed to play in embedded
339 players on other sites. Can be True (=always allowed),
340 False (=never allowed), None (=unknown), or a string
341 specifying the criteria for embedability; e.g. 'whitelist'
342 availability: Under what condition the video is available. One of
343 'private', 'premium_only', 'subscriber_only', 'needs_auth',
344 'unlisted' or 'public'. Use 'InfoExtractor._availability'
345 to set it
346 _old_archive_ids: A list of old archive ids needed for backward compatibility
347 __post_extractor: A function to be called just before the metadata is
348 written to either disk, logger or console. The function
349 must return a dict which will be added to the info_dict.
350 This is useful for additional information that is
351 time-consuming to extract. Note that the fields thus
352 extracted will not be available to output template and
353 match_filter. So, only "comments" and "comment_count" are
354 currently allowed to be extracted via this method.
355
356 The following fields should only be used when the video belongs to some logical
357 chapter or section:
358
359 chapter: Name or title of the chapter the video belongs to.
360 chapter_number: Number of the chapter the video belongs to, as an integer.
361 chapter_id: Id of the chapter the video belongs to, as a unicode string.
362
363 The following fields should only be used when the video is an episode of some
364 series, programme or podcast:
365
366 series: Title of the series or programme the video episode belongs to.
367 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
368 season: Title of the season the video episode belongs to.
369 season_number: Number of the season the video episode belongs to, as an integer.
370 season_id: Id of the season the video episode belongs to, as a unicode string.
371 episode: Title of the video episode. Unlike mandatory video title field,
372 this field should denote the exact title of the video episode
373 without any kind of decoration.
374 episode_number: Number of the video episode within a season, as an integer.
375 episode_id: Id of the video episode, as a unicode string.
376
377 The following fields should only be used when the media is a track or a part of
378 a music album:
379
380 track: Title of the track.
381 track_number: Number of the track within an album or a disc, as an integer.
382 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
383 as a unicode string.
384 artist: Artist(s) of the track.
385 genre: Genre(s) of the track.
386 album: Title of the album the track belongs to.
387 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
388 album_artist: List of all artists appeared on the album (e.g.
389 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
390 and compilations).
391 disc_number: Number of the disc or other physical medium the track belongs to,
392 as an integer.
393 release_year: Year (YYYY) when the album was released.
394 composer: Composer of the piece
395
396 The following fields should only be set for clips that should be cut from the original video:
397
398 section_start: Start time of the section in seconds
399 section_end: End time of the section in seconds
400
401 The following fields should only be set for storyboards:
402 rows: Number of rows in each storyboard fragment, as an integer
403 columns: Number of columns in each storyboard fragment, as an integer
404
405 Unless mentioned otherwise, the fields should be Unicode strings.
406
407 Unless mentioned otherwise, None is equivalent to absence of information.
408
409
410 _type "playlist" indicates multiple videos.
411 There must be a key "entries", which is a list, an iterable, or a PagedList
412 object, each element of which is a valid dictionary by this specification.
413
414 Additionally, playlists can have "id", "title", and any other relevant
415 attributes with the same semantics as videos (see above).
416
417 It can also have the following optional fields:
418
419 playlist_count: The total number of videos in a playlist. If not given,
420 YoutubeDL tries to calculate it from "entries"
421
422
423 _type "multi_video" indicates that there are multiple videos that
424 form a single show, for examples multiple acts of an opera or TV episode.
425 It must have an entries key like a playlist and contain all the keys
426 required for a video at the same time.
427
428
429 _type "url" indicates that the video must be extracted from another
430 location, possibly by a different extractor. Its only required key is:
431 "url" - the next URL to extract.
432 The key "ie_key" can be set to the class name (minus the trailing "IE",
433 e.g. "Youtube") if the extractor class is known in advance.
434 Additionally, the dictionary may have any properties of the resolved entity
435 known in advance, for example "title" if the title of the referred video is
436 known ahead of time.
437
438
439 _type "url_transparent" entities have the same specification as "url", but
440 indicate that the given additional information is more precise than the one
441 associated with the resolved URL.
442 This is useful when a site employs a video service that hosts the video and
443 its technical metadata, but that video service does not embed a useful
444 title, description etc.
445
446
447 Subclasses of this should also be added to the list of extractors and
448 should define a _VALID_URL regexp and, re-define the _real_extract() and
449 (optionally) _real_initialize() methods.
450
451 Subclasses may also override suitable() if necessary, but ensure the function
452 signature is preserved and that this function imports everything it needs
453 (except other extractors), so that lazy_extractors works correctly.
454
455 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
456 the HTML of Generic webpages. It may also override _extract_embed_urls
457 or _extract_from_webpage as necessary. While these are normally classmethods,
458 _extract_from_webpage is allowed to be an instance method.
459
460 _extract_from_webpage may raise self.StopExtraction() to stop further
461 processing of the webpage and obtain exclusive rights to it. This is useful
462 when the extractor cannot reliably be matched using just the URL,
463 e.g. invidious/peertube instances
464
465 Embed-only extractors can be defined by setting _VALID_URL = False.
466
467 To support username + password (or netrc) login, the extractor must define a
468 _NETRC_MACHINE and re-define _perform_login(username, password) and
469 (optionally) _initialize_pre_login() methods. The _perform_login method will
470 be called between _initialize_pre_login and _real_initialize if credentials
471 are passed by the user. In cases where it is necessary to have the login
472 process as part of the extraction rather than initialization, _perform_login
473 can be left undefined.
474
475 _GEO_BYPASS attribute may be set to False in order to disable
476 geo restriction bypass mechanisms for a particular extractor.
477 Though it won't disable explicit geo restriction bypass based on
478 country code provided with geo_bypass_country.
479
480 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
481 countries for this extractor. One of these countries will be used by
482 geo restriction bypass mechanism right away in order to bypass
483 geo restriction, of course, if the mechanism is not disabled.
484
485 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
486 IP blocks in CIDR notation for this extractor. One of these IP blocks
487 will be used by geo restriction bypass mechanism similarly
488 to _GEO_COUNTRIES.
489
490 The _ENABLED attribute should be set to False for IEs that
491 are disabled by default and must be explicitly enabled.
492
493 The _WORKING attribute should be set to False for broken IEs
494 in order to warn the users and skip the tests.
495 """
496
# Lifecycle state. __init__ resets _ready and _x_forwarded_for_ip on every
# instance; _downloader is assigned through set_downloader().
_downloader = None
_ready = False
_x_forwarded_for_ip = None

# Geo restriction bypass settings (described in the class docstring).
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None
_GEO_BYPASS = True

# Extractor status flags (described in the class docstring).
_ENABLED = True
_WORKING = True

# Login / URL matching configuration.
# _NETRC_MACHINE: truthy value means supports_login() returns True.
# _VALID_URL: regex compiled by _match_valid_url(); False disables URL matching.
# _EMBED_REGEX: list of regexes searched for in Generic webpages.
_NETRC_MACHINE = None
_VALID_URL = None
_EMBED_REGEX = []

# NOTE(review): purpose of IE_DESC / SEARCH_KEY is not visible in this part of
# the file; initialize() only checks IE_DESC against False.
IE_DESC = None
SEARCH_KEY = None
510
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a user-facing hint on how to provide authentication.

    @param method   One of None (empty hint), 'any', 'password' or 'cookies'.
                    When left as NO_DEFAULT, 'any' is used if supports_login()
                    is true and 'cookies' otherwise.
    @param netrc    Machine name shown in the --netrc part of the hint;
                    falls back to _NETRC_MACHINE.
    Raises KeyError for any other value of method.
    """
    machine = netrc or self._NETRC_MACHINE
    password_hint = f'--username and --password, or --netrc ({machine}) to provide account credentials'
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': (
            'Use --cookies-from-browser or --cookies for the authentication. '
            'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
    }
    if method is NO_DEFAULT:
        # No explicit choice: pick the hint from the extractor's capabilities
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]
521
522 def __init__(self, downloader=None):
523 """Constructor. Receives an optional downloader (a YoutubeDL instance).
524 If a downloader is not passed during initialization,
525 it must be set using "set_downloader()" before "extract()" is called"""
526 self._ready = False
527 self._x_forwarded_for_ip = None
528 self._printed_messages = set()
529 self.set_downloader(downloader)
530
531 @classmethod
532 def _match_valid_url(cls, url):
533 if cls._VALID_URL is False:
534 return None
535 # This does not use has/getattr intentionally - we want to know whether
536 # we have cached the regexp for *this* class, whereas getattr would also
537 # match the superclass
538 if '_VALID_URL_RE' not in cls.__dict__:
539 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
540 return cls._VALID_URL_RE.match(url)
541
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True/False)."""
    # This function must import everything it needs (except other extractors),
    # so that lazy_extractors works correctly
    match = cls._match_valid_url(url)
    return match is not None
548
549 @classmethod
550 def _match_id(cls, url):
551 return cls._match_valid_url(url).group('id')
552
@classmethod
def get_temp_id(cls, url):
    """Best-effort video id taken from the URL; None when it cannot be matched."""
    temp_id = None
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        # No match (AttributeError) or no "id" group (IndexError)
        pass
    return temp_id
559
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
564
@classmethod
def supports_login(cls):
    """Return True when the class defines a truthy _NETRC_MACHINE."""
    return True if cls._NETRC_MACHINE else False
568
def initialize(self):
    """Prepare the instance for extraction (geo bypass, login, etc).

    The printed-message set and the geo bypass are (re)initialized on every
    call; the login and _real_initialize() steps run only until the instance
    has been marked ready.
    """
    self._printed_messages = set()
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)

    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # A username was given but this extractor has no password login
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True
586
587 def _initialize_geo_bypass(self, geo_bypass_context):
588 """
589 Initialize geo restriction bypass mechanism.
590
591 This method is used to initialize geo bypass mechanism based on faking
592 X-Forwarded-For HTTP header. A random country from provided country list
593 is selected and a random IP belonging to this country is generated. This
594 IP will be passed as X-Forwarded-For HTTP header in all subsequent
595 HTTP requests.
596
597 This method will be used for initial geo bypass mechanism initialization
598 during the instance initialization with _GEO_COUNTRIES and
599 _GEO_IP_BLOCKS.
600
601 You may also manually call it from extractor's code if geo bypass
602 information is not available beforehand (e.g. obtained during
603 extraction) or due to some other reason. In this case you should pass
604 this information in geo bypass context passed as first argument. It may
605 contain following fields:
606
607 countries: List of geo unrestricted countries (similar
608 to _GEO_COUNTRIES)
609 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
610 (similar to _GEO_IP_BLOCKS)
611
612 """
613 if not self._x_forwarded_for_ip:
614
615 # Geo bypass mechanism is explicitly disabled by user
616 if not self.get_param('geo_bypass', True):
617 return
618
619 if not geo_bypass_context:
620 geo_bypass_context = {}
621
622 # Backward compatibility: previously _initialize_geo_bypass
623 # expected a list of countries, some 3rd party code may still use
624 # it this way
625 if isinstance(geo_bypass_context, (list, tuple)):
626 geo_bypass_context = {
627 'countries': geo_bypass_context,
628 }
629
630 # The whole point of geo bypass mechanism is to fake IP
631 # as X-Forwarded-For HTTP header based on some IP block or
632 # country code.
633
634 # Path 1: bypassing based on IP block in CIDR notation
635
636 # Explicit IP block specified by user, use it right away
637 # regardless of whether extractor is geo bypassable or not
638 ip_block = self.get_param('geo_bypass_ip_block', None)
639
640 # Otherwise use random IP block from geo bypass context but only
641 # if extractor is known as geo bypassable
642 if not ip_block:
643 ip_blocks = geo_bypass_context.get('ip_blocks')
644 if self._GEO_BYPASS and ip_blocks:
645 ip_block = random.choice(ip_blocks)
646
647 if ip_block:
648 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
649 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
650 return
651
652 # Path 2: bypassing based on country code
653
654 # Explicit country code specified by user, use it right away
655 # regardless of whether extractor is geo bypassable or not
656 country = self.get_param('geo_bypass_country', None)
657
658 # Otherwise use random country code from geo bypass context but
659 # only if extractor is known as geo bypassable
660 if not country:
661 countries = geo_bypass_context.get('countries')
662 if self._GEO_BYPASS and countries:
663 country = random.choice(countries)
664
665 if country:
666 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
667 self._downloader.write_debug(
668 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
669
def extract(self, url):
    """Extract information for url and return the resulting info dict.

    Returns None when _real_extract() returns None. Extraction is attempted
    at most twice: the second attempt happens only after a
    GeoRestrictedError for which a fake X-Forwarded-For IP could be chosen.

    ExtractorError is re-raised as the same type with video id, IE name and
    traceback filled in; IncompleteRead, KeyError and StopIteration are
    wrapped in ExtractorError. UnsupportedError passes through untouched.
    """
    try:
        for _attempt in range(2):
            try:
                self.initialize()
                self.write_debug('Extracting URL: %s' % url)
                result = self._real_extract(url)
                if result is None:
                    return None
                if self._x_forwarded_for_ip:
                    result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subs = result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    # Drop the pseudo-subtitle tracks in place
                    for pseudo_lang in ('live_chat', 'comments', 'danmaku'):
                        subs.pop(pseudo_lang, None)
                return result
            except GeoRestrictedError as geo_err:
                # Retry only when a fake IP could be set up; otherwise give up
                if not self.__maybe_fake_ip_and_retry(geo_err.countries):
                    raise
    except UnsupportedError:
        raise
    except ExtractorError as err:
        details = dict(
            video_id=err.video_id or self.get_temp_id(url),
            ie=self.IE_NAME,
            tb=err.traceback or sys.exc_info()[2],
            expected=err.expected,
            cause=err.cause,
        )
        if hasattr(err, 'countries'):
            details['countries'] = err.countries
        raise type(err)(err.orig_msg, **details)
    except http.client.IncompleteRead as err:
        raise ExtractorError('A network error has occurred.', cause=err, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as err:
        raise ExtractorError('An extractor error has occurred.', cause=err, video_id=self.get_temp_id(url))
708
709 def __maybe_fake_ip_and_retry(self, countries):
710 if (not self.get_param('geo_bypass_country', None)
711 and self._GEO_BYPASS
712 and self.get_param('geo_bypass', True)
713 and not self._x_forwarded_for_ip
714 and countries):
715 country_code = random.choice(countries)
716 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
717 if self._x_forwarded_for_ip:
718 self.report_warning(
719 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
720 % (self._x_forwarded_for_ip, country_code.upper()))
721 return True
722 return False
723
def set_downloader(self, downloader):
    """Attach downloader (a YoutubeDL instance) to this IE as self._downloader."""
    self._downloader = downloader
727
@property
def cache(self):
    """The "cache" attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cache
731
@property
def cookiejar(self):
    """The "cookiejar" attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cookiejar
735
736 def _initialize_pre_login(self):
737 """ Initialization before login. Redefine in subclasses."""
738 pass
739
740 def _perform_login(self, username, password):
741 """ Login with username and password. Redefine in subclasses."""
742 pass
743
744 def _real_initialize(self):
745 """Real initialization process. Redefine in subclasses."""
746 pass
747
748 def _real_extract(self, url):
749 """Real extraction process. Redefine in subclasses."""
750 raise NotImplementedError('This method must be implemented by subclasses')
751
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor.

    It is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
756
@classproperty
def IE_NAME(cls):
    """Default extractor name: the class name minus its last two characters."""
    class_name = cls.__name__
    return class_name[:-2]
760
761 @staticmethod
762 def __can_accept_status_code(err, expected_status):
763 assert isinstance(err, urllib.error.HTTPError)
764 if expected_status is None:
765 return False
766 elif callable(expected_status):
767 return expected_status(err.code) is True
768 else:
769 return err.code in variadic(expected_status)
770
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Build the request object for a URL string or an existing Request.

    An existing urllib.request.Request is passed to update_Request() with
    the given data, headers and query. Otherwise the query (if any) is
    merged into the URL and the result is passed to sanitized_Request().
    """
    if not isinstance(url_or_request, urllib.request.Request):
        target = update_url_query(url_or_request, query) if query else url_or_request
        return sanitized_Request(target, data, headers or {})
    return update_Request(url_or_request, data=data, headers=headers, query=query)
777
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
    """
    Open the URL (or request) and return the response handle.

    Returns False when the request fails and the failure is not fatal
    (or errnote is False).

    See _download_webpage_handle docstring for arguments specification.
    """
    # Every request except the very first one is delayed by the
    # 'sleep_interval_requests' param, when it is set
    if self._downloader._first_webpage_request:
        self._downloader._first_webpage_request = False
    else:
        delay = self.get_param('sleep_interval_requests') or 0
        if delay > 0:
            self.to_screen(f'Sleeping {delay} seconds ...')
            time.sleep(delay)

    # note=None prints the default message, note=False prints nothing
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(str(note) if video_id is None else f'{video_id}: {note}')

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    try:
        request = self._create_request(url_or_request, data, headers, query)
        return self._downloader.urlopen(request)
    except network_exceptions as err:
        if (isinstance(err, urllib.error.HTTPError)
                and self.__can_accept_status_code(err, expected_status)):
            # Retain reference to error to prevent file object from
            # being closed before it can be read. Works around the
            # effects of <https://bugs.python.org/issue15002>
            # introduced in Python 3.4.1.
            err.fp._error = err
            return err.fp

        if errnote is False:
            return False
        prefix = 'Unable to download webpage' if errnote is None else errnote
        errmsg = f'{prefix}: {error_to_compat_str(err)}'
        if fatal:
            raise ExtractorError(errmsg, cause=err)
        self.report_warning(errmsg)
        return False
832
833 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
834 encoding=None, data=None, headers={}, query={}, expected_status=None):
835 """
836 Return a tuple (page content as string, URL handle).
837
838 Arguments:
839 url_or_request -- plain text URL as a string or
840 a urllib.request.Request object
841 video_id -- Video/playlist/item identifier (string)
842
843 Keyword arguments:
844 note -- note printed before downloading (string)
845 errnote -- note printed in case of an error (string)
846 fatal -- flag denoting whether error should be considered fatal,
847 i.e. whether it should cause ExtractionError to be raised,
848 otherwise a warning will be reported and extraction continued
849 encoding -- encoding for a page content decoding, guessed automatically
850 when not explicitly specified
851 data -- POST data (bytes)
852 headers -- HTTP headers (dict)
853 query -- URL query (dict)
854 expected_status -- allows to accept failed HTTP requests (non 2xx
855 status code) by explicitly specifying a set of accepted status
856 codes. Can be any of the following entities:
857 - an integer type specifying an exact failed status code to
858 accept
859 - a list or a tuple of integer types specifying a list of
860 failed status codes to accept
861 - a callable accepting an actual failed status code and
862 returning True if it should be accepted
863 Note that this argument does not affect success status codes (2xx)
864 which are always accepted.
865 """
866
867 # Strip hashes from the URL (#1038)
868 if isinstance(url_or_request, str):
869 url_or_request = url_or_request.partition('#')[0]
870
871 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
872 if urlh is False:
873 assert not fatal
874 return False
875 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
876 return (content, urlh)
877
878 @staticmethod
879 def _guess_encoding_from_content(content_type, webpage_bytes):
880 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
881 if m:
882 encoding = m.group(1)
883 else:
884 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
885 webpage_bytes[:1024])
886 if m:
887 encoding = m.group(1).decode('ascii')
888 elif webpage_bytes.startswith(b'\xff\xfe'):
889 encoding = 'utf-16'
890 else:
891 encoding = 'utf-8'
892
893 return encoding
894
def __check_blocked(self, content):
    """Raise an expected ExtractorError if `content` is a known block page.

    Recognizes block pages from Websense filtering, Indian censorship and
    the Russian government blocklist; returns None for anything else.
    """
    head = content[:512]

    if 'Websense' in head and '<title>Access to this site is blocked</title>' in content:
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        info_url = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if info_url:
            msg = f'{msg} Visit {info_url} for more details'
        raise ExtractorError(msg, expected=True)

    if '<title>The URL you requested has been blocked</title>' in head:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        details = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if details:
            flattened = details.replace('\n', ' ')
            msg = f'{msg} (Message: "{flattened}")'
        raise ExtractorError(msg, expected=True)

    if ('blocklist.rkn.gov.ru' in content
            and '<title>TTK :: Доступ к ресурсу ограничен</title>' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
922
def _request_dump_filename(self, url, video_id):
    """Build the name of the dump file used for the page at `url`.

    The base name is '<video_id>_<url>'. When it exceeds the
    'trim_file_name' param (240 if unset) it is cut and suffixed with
    '___' plus the MD5 hex digest of the full base name.
    """
    basen = f'{video_id}_{url}'
    limit = self.get_param('trim_file_name') or 240
    if len(basen) > limit:
        digest = hashlib.md5(basen.encode('utf-8')).hexdigest()
        suffix = f'___{digest}'
        basen = basen[:limit - len(suffix)] + suffix
    filename = sanitize_filename(f'{basen}.dump', restricted=True)
    if compat_os_name != 'nt':
        return filename
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absfilepath = os.path.abspath(filename)
    if len(absfilepath) > 259:
        filename = fR'\\?\{absfilepath}'
    return filename
937
938 def __decode_webpage(self, webpage_bytes, encoding, headers):
939 if not encoding:
940 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
941 try:
942 return webpage_bytes.decode(encoding, 'replace')
943 except LookupError:
944 return webpage_bytes.decode('utf-8', 'replace')
945
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the response body from `urlh` and return it decoded as a string.

    `prefix` (bytes), when given, is prepended to the body. The raw bytes
    are printed base64-encoded if 'dump_intermediate_pages' is set and
    written to a dump file if 'write_pages' is set. The decoded content is
    checked against known block pages before it is returned.
    """
    body = urlh.read()
    webpage_bytes = body if prefix is None else prefix + body

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        encoded = base64.b64encode(webpage_bytes)
        self._downloader.to_screen(encoded.decode('ascii'))

    if self.get_param('write_pages'):
        filename = self._request_dump_filename(urlh.geturl(), video_id)
        self.to_screen(f'Saving request to {filename}')
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
    self.__check_blocked(content)
    return content
964
def __print_error(self, errnote, fatal, video_id, err):
    """Raise an ExtractorError if `fatal`; otherwise warn, provided `errnote` is truthy."""
    if fatal:
        raise ExtractorError(f'{video_id}: {errnote}', cause=err)
    if errnote:
        self.report_warning(f'{video_id}: {errnote}: {err}')
970
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """Parse `xml_string` into an element tree.

    `transform_source`, if given, is applied to the string first. A parse
    error is reported as 'Failed to parse XML' (or `errnote`): it is raised
    when `fatal`, otherwise warned about and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = errnote if errnote is not None else 'Failed to parse XML'
        self.__print_error(message, fatal, video_id, ve)
978
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """Parse `json_string` with the lenient JSON decoder.

    `transform_source` and `parser_kwargs` are handed to the decoder. A
    ValueError is reported as 'Failed to parse JSON' (or `errnote`): it is
    raised when `fatal`, otherwise warned about and None is returned.
    """
    decoder_options = {
        'cls': LenientJSONDecoder,
        'strict': False,
        'transform_source': transform_source,
    }
    try:
        return json.loads(json_string, **decoder_options, **parser_kwargs)
    except ValueError as ve:
        message = errnote if errnote is not None else 'Failed to parse JSON'
        self.__print_error(message, fatal, video_id, ve)
985
986 def _parse_socket_response_as_json(self, data, *args, **kwargs):
987 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
988
def __create_download_methods(name, parser, note, errnote, return_value):
    # Factory that builds a pair of methods, `_download_{name}_handle` and
    # `_download_{name}`, which download a page and parse it with the method
    # named by `parser` (a method name; None means "return the raw content").
    # `note`/`errnote` become the default messages of the generated methods and
    # `return_value` is a description that is only used in their docstrings.

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        if parser is None:
            return content
        # errnote is only forwarded to the parser when it is False
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        # False means that the download failed non-fatally
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With the 'load_pages' param, the content is first looked up in the dump file
        # for this request; if the file cannot be read, a warning is issued
        # and the request is downloaded as usual
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.full_url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        # Without a parser the handle method is the plain _download_webpage_handle,
        # which takes no transform_source argument
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    def impersonate(func, name, return_value):
        # Give the generated function its public name and a docstring
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
            @param transform_source Apply this transformation before parsing
            @returns {return_value}

            See _download_webpage_handle docstring for other arguments specification
        '''

    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content
1052
# Generate the (`_download_*_handle`, `_download_*`) method pairs for each parsed content type
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# For plain webpages only the content-returning function (index 1) is kept; no parser is applied
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1060
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not given)

    The download is only retried after http.client.IncompleteRead; once
    `tries` attempts have failed that way, the last error is re-raised.

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries and timeout are barely used; they may get deprecated.
    # Until then the sentinel must be resolved to a real interval, or it
    # would be handed to self._sleep as-is whenever tries > 1
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except http.client.IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
1092
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """Pass a warning prefixed with '[ie_name]' (and the video id, if any) to the downloader.

    With only_once=True, a message already recorded in
    self._printed_messages is not reported again.
    """
    idstr = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {idstr}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)
1101
def to_screen(self, msg, *args, **kwargs):
    """Print msg to screen through the downloader, prefixing it with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.to_screen(prefixed, *args, **kwargs)
1105
def write_debug(self, msg, *args, **kwargs):
    """Write a debug message through the downloader, prefixing it with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.write_debug(prefixed, *args, **kwargs)
1108
def get_param(self, name, default=None, *args, **kwargs):
    """Look up `name` in the downloader's params; `default` if missing or if no downloader is set."""
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
1113
def report_drm(self, video_id, partial=NO_DEFAULT):
    """Report through raise_no_formats that the video is DRM protected.

    `partial` is no longer accepted; passing it only triggers a
    deprecation warning.
    """
    partial_given = partial is not NO_DEFAULT
    if partial_given:
        self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1118
def report_extraction(self, id_or_name):
    """Announce on screen that information is being extracted."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
1122
def report_download_webpage(self, video_id):
    """Announce on screen that a webpage is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
1126
def report_age_confirmation(self):
    """Announce on screen the attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
1130
def report_login(self):
    """Announce on screen the attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
1134
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """Raise an expected ExtractorError telling that login is required.

    If metadata is available and either 'ignore_no_formats_error' or
    'wait_for_video' is set, only a warning is reported. Otherwise the
    login hint for `method` is appended to the message before raising.
    """
    downgrade = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if downgrade:
        self.report_warning(msg)
        return
    hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + hint, expected=True)
1144
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """Raise GeoRestrictedError for a geo-restricted video.

    If metadata is available and either 'ignore_no_formats_error' or
    'wait_for_video' is set, only a warning is reported.
    """
    downgrade = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if not downgrade:
        raise GeoRestrictedError(msg, countries=countries)
    self.report_warning(msg)
1153
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Report that no formats could be extracted.

    An expected failure is downgraded to a warning if either
    'ignore_no_formats_error' or 'wait_for_video' is set. Otherwise `msg`
    is raised: as-is if it already is an ExtractorError, else wrapped
    in a new one.
    """
    downgrade = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if downgrade:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
1162
1163 # Methods for following #608
1164 @staticmethod
1165 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1166 """Returns a URL that points to a page that should be processed"""
1167 if ie is not None:
1168 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1169 if video_id is not None:
1170 kwargs['id'] = video_id
1171 if video_title is not None:
1172 kwargs['title'] = video_title
1173 return {
1174 **kwargs,
1175 '_type': 'url_transparent' if url_transparent else 'url',
1176 'url': url,
1177 }
1178
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """Build a playlist result whose entries are URL results made from `matches`.

    `getter` is applied to each match, the values are de-duplicated with
    orderedSet, and each one is turned into a url_result for `ie` (with
    `video_kwargs` as extra fields). The entries are a lazy generator.
    """
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (
        cls.url_result(match_url, ie, **(video_kwargs or {}))
        for match_url in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
1185
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
    """Build a playlist (or multi_video) result dict around `entries`.

    The id and title are only set when truthy; the description is set
    whenever it is not None. Extra keyword arguments are copied into the
    result.
    """
    result = dict(kwargs)
    if playlist_id:
        result['id'] = playlist_id
    if playlist_title:
        result['title'] = playlist_title
    if playlist_description is not None:
        result['description'] = playlist_description
    result['_type'] = 'multi_video' if multi_video else 'playlist'
    result['entries'] = entries
    return result
1200
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    `group` selects what is returned from the match: None for the first
    group that participated in the match, a list/tuple of groups for a
    tuple of their values, or a single group name/index.
    """
    # Start out without a match so that an empty list of patterns is
    # handled like any other failed search
    mobj = None
    if string is not None:
        if isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)

    if default is not NO_DEFAULT:
        return default

    # The styled field name is only needed for the failure messages
    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    if fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    self.report_warning('unable to extract %s' % _name + bug_reports_message())
    return None
1235
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """Searches string for the JSON object specified by start_pattern

    Passing `default` makes the search non-fatal and silent; without it an
    empty dict is returned on non-fatal failure. Extra keyword arguments
    are handed to _parse_json.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    json_string = self._search_regex(
        rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
1262
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html and stripped.
    Falsy results are returned unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return clean_html(found).strip() if found else found
1272
1273 def _get_netrc_login_info(self, netrc_machine=None):
1274 username = None
1275 password = None
1276 netrc_machine = netrc_machine or self._NETRC_MACHINE
1277
1278 if self.get_param('usenetrc', False):
1279 try:
1280 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1281 if os.path.isdir(netrc_file):
1282 netrc_file = os.path.join(netrc_file, '.netrc')
1283 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1284 if info is not None:
1285 username = info[0]
1286 password = info[2]
1287 else:
1288 raise netrc.NetrcParseError(
1289 'No authenticators for %s' % netrc_machine)
1290 except (OSError, netrc.NetrcParseError) as err:
1291 self.report_warning(
1292 'parsing .netrc: %s' % error_to_compat_str(err))
1293
1294 return username, password
1295
1296 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1297 """
1298 Get the login info as (username, password)
1299 First look for the manually specified credentials using username_option
1300 and password_option as keys in params dictionary. If no such credentials
1301 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1302 value.
1303 If there's no info available, return (None, None)
1304 """
1305
1306 # Attempt to use provided username and password or .netrc data
1307 username = self.get_param(username_option)
1308 if username is not None:
1309 password = self.get_param(password_option)
1310 else:
1311 username, password = self._get_netrc_login_info(netrc_machine)
1312
1313 return username, password
1314
1315 def _get_tfa_info(self, note='two-factor verification code'):
1316 """
1317 Get the two-factor authentication info
1318 TODO - asking the user will be required for sms/phone verify
1319 currently just uses the command line option
1320 If there's no info available, return None
1321 """
1322
1323 tfa = self.get_param('twofactor')
1324 if tfa is not None:
1325 return tfa
1326
1327 return getpass.getpass('Type %s and press [Return]: ' % note)
1328
1329 # Helper functions for extracting OpenGraph info
1330 @staticmethod
1331 def _og_regexes(prop):
1332 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1333 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1334 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1335 template = r'<meta[^>]+?%s[^>]+?%s'
1336 return [
1337 template % (property_re, content_re),
1338 template % (content_re, property_re),
1339 ]
1340
1341 @staticmethod
1342 def _meta_regex(prop):
1343 return r'''(?isx)<meta
1344 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1345 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1346
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search `html` for one or more OpenGraph properties and return the HTML-unescaped value.

    `prop` may be a single property or several alternatives. `name` is
    the label used in messages and defaults to 'OpenGraph <first prop>'.
    Remaining keyword arguments are handed to _search_regex.
    """
    props = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % props[0]
    og_regexes = [regex for p in props for regex in self._og_regexes(p)]
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
1358
1359 def _og_search_thumbnail(self, html, **kargs):
1360 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1361
1362 def _og_search_description(self, html, **kargs):
1363 return self._og_search_property('description', html, fatal=False, **kargs)
1364
1365 def _og_search_title(self, html, *, fatal=False, **kargs):
1366 return self._og_search_property('title', html, fatal=fatal, **kargs)
1367
1368 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1369 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1370 if secure:
1371 regexes = self._og_regexes('video:secure_url') + regexes
1372 return self._html_search_regex(regexes, html, name, **kargs)
1373
1374 def _og_search_url(self, html, **kargs):
1375 return self._og_search_property('url', html, **kargs)
1376
1377 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1378 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1379
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Search `html` for the content of a <meta> tag identified by `name`.

    `name` may be a single name or several alternatives. `display_name`
    is the label used in messages and defaults to the first name.
    """
    names = variadic(name)
    if display_name is None:
        display_name = names[0]
    patterns = [self._meta_regex(candidate) for candidate in names]
    return self._html_search_regex(
        patterns, html, display_name, fatal=fatal, group='content', **kwargs)
1387
1388 def _dc_search_uploader(self, html):
1389 return self._html_search_meta('dc.creator', html, 'uploader')
1390
1391 @staticmethod
1392 def _rta_search(html):
1393 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1394 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1395 r' content="RTA-5042-1996-1400-1577-RTA"',
1396 html):
1397 return 18
1398
1399 # And then there are the jokers who advertise that they use RTA, but actually don't.
1400 AGE_LIMIT_MARKERS = [
1401 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1402 ]
1403 if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
1404 return 18
1405 return 0
1406
1407 def _media_rating_search(self, html):
1408 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1409 rating = self._html_search_meta('rating', html)
1410
1411 if not rating:
1412 return None
1413
1414 RATING_TABLE = {
1415 'safe for kids': 0,
1416 'general': 8,
1417 '14 years': 14,
1418 'mature': 17,
1419 'restricted': 19,
1420 }
1421 return RATING_TABLE.get(rating.lower())
1422
1423 def _family_friendly_search(self, html):
1424 # See http://schema.org/VideoObject
1425 family_friendly = self._html_search_meta(
1426 'isFamilyFriendly', html, default=None)
1427
1428 if not family_friendly:
1429 return None
1430
1431 RATING_TABLE = {
1432 '1': 0,
1433 'true': 0,
1434 '0': 18,
1435 'false': 18,
1436 }
1437 return RATING_TABLE.get(family_friendly.lower())
1438
1439 def _twitter_search_player(self, html):
1440 return self._html_search_meta('twitter:player', html,
1441 'twitter card player')
1442
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html

    Only dict items are yielded. Passing `default` makes parsing non-fatal;
    its value is otherwise unused here.
    """
    if default is not NO_DEFAULT:
        fatal = False
    for mobj in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1452
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html

    When nothing is extracted: `default` is returned if one was given,
    else RegexNotFoundError is raised if `fatal`, else a warning is
    reported and an empty dict is returned.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    candidates = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(candidates, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info
    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
1469
1470 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1471 if isinstance(json_ld, str):
1472 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1473 if not json_ld:
1474 return {}
1475 info = {}
1476
1477 INTERACTION_TYPE_MAP = {
1478 'CommentAction': 'comment',
1479 'AgreeAction': 'like',
1480 'DisagreeAction': 'dislike',
1481 'LikeAction': 'like',
1482 'DislikeAction': 'dislike',
1483 'ListenAction': 'view',
1484 'WatchAction': 'view',
1485 'ViewAction': 'view',
1486 }
1487
1488 def is_type(e, *expected_types):
1489 type = variadic(traverse_obj(e, '@type'))
1490 return any(x in type for x in expected_types)
1491
1492 def extract_interaction_type(e):
1493 interaction_type = e.get('interactionType')
1494 if isinstance(interaction_type, dict):
1495 interaction_type = interaction_type.get('@type')
1496 return str_or_none(interaction_type)
1497
1498 def extract_interaction_statistic(e):
1499 interaction_statistic = e.get('interactionStatistic')
1500 if isinstance(interaction_statistic, dict):
1501 interaction_statistic = [interaction_statistic]
1502 if not isinstance(interaction_statistic, list):
1503 return
1504 for is_e in interaction_statistic:
1505 if not is_type(is_e, 'InteractionCounter'):
1506 continue
1507 interaction_type = extract_interaction_type(is_e)
1508 if not interaction_type:
1509 continue
1510 # For interaction count some sites provide string instead of
1511 # an integer (as per spec) with non digit characters (e.g. ",")
1512 # so extracting count with more relaxed str_to_int
1513 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1514 if interaction_count is None:
1515 continue
1516 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1517 if not count_kind:
1518 continue
1519 count_key = '%s_count' % count_kind
1520 if info.get(count_key) is not None:
1521 continue
1522 info[count_key] = interaction_count
1523
1524 def extract_chapter_information(e):
1525 chapters = [{
1526 'title': part.get('name'),
1527 'start_time': part.get('startOffset'),
1528 'end_time': part.get('endOffset'),
1529 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1530 for idx, (last_c, current_c, next_c) in enumerate(zip(
1531 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1532 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1533 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1534 if None in current_c.values():
1535 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1536 return
1537 if chapters:
1538 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1539 info['chapters'] = chapters
1540
1541 def extract_video_object(e):
1542 author = e.get('author')
1543 info.update({
1544 'url': url_or_none(e.get('contentUrl')),
1545 'ext': mimetype2ext(e.get('encodingFormat')),
1546 'title': unescapeHTML(e.get('name')),
1547 'description': unescapeHTML(e.get('description')),
1548 'thumbnails': [{'url': unescapeHTML(url)}
1549 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1550 if url_or_none(url)],
1551 'duration': parse_duration(e.get('duration')),
1552 'timestamp': unified_timestamp(e.get('uploadDate')),
1553 # author can be an instance of 'Organization' or 'Person' types.
1554 # both types can have 'name' property(inherited from 'Thing' type). [1]
1555 # however some websites are using 'Text' type instead.
1556 # 1. https://schema.org/VideoObject
1557 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1558 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1559 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1560 'tbr': int_or_none(e.get('bitrate')),
1561 'width': int_or_none(e.get('width')),
1562 'height': int_or_none(e.get('height')),
1563 'view_count': int_or_none(e.get('interactionCount')),
1564 'tags': try_call(lambda: e.get('keywords').split(',')),
1565 })
1566 if is_type(e, 'AudioObject'):
1567 info.update({
1568 'vcodec': 'none',
1569 'abr': int_or_none(e.get('bitrate')),
1570 })
1571 extract_interaction_statistic(e)
1572 extract_chapter_information(e)
1573
1574 def traverse_json_ld(json_ld, at_top_level=True):
1575 for e in variadic(json_ld):
1576 if not isinstance(e, dict):
1577 continue
1578 if at_top_level and '@context' not in e:
1579 continue
1580 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1581 traverse_json_ld(e['@graph'], at_top_level=False)
1582 continue
1583 if expected_type is not None and not is_type(e, expected_type):
1584 continue
1585 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1586 if rating is not None:
1587 info['average_rating'] = rating
1588 if is_type(e, 'TVEpisode', 'Episode'):
1589 episode_name = unescapeHTML(e.get('name'))
1590 info.update({
1591 'episode': episode_name,
1592 'episode_number': int_or_none(e.get('episodeNumber')),
1593 'description': unescapeHTML(e.get('description')),
1594 })
1595 if not info.get('title') and episode_name:
1596 info['title'] = episode_name
1597 part_of_season = e.get('partOfSeason')
1598 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1599 info.update({
1600 'season': unescapeHTML(part_of_season.get('name')),
1601 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1602 })
1603 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1604 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1605 info['series'] = unescapeHTML(part_of_series.get('name'))
1606 elif is_type(e, 'Movie'):
1607 info.update({
1608 'title': unescapeHTML(e.get('name')),
1609 'description': unescapeHTML(e.get('description')),
1610 'duration': parse_duration(e.get('duration')),
1611 'timestamp': unified_timestamp(e.get('dateCreated')),
1612 })
1613 elif is_type(e, 'Article', 'NewsArticle'):
1614 info.update({
1615 'timestamp': parse_iso8601(e.get('datePublished')),
1616 'title': unescapeHTML(e.get('headline')),
1617 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1618 })
1619 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1620 extract_video_object(e['video'][0])
1621 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1622 extract_video_object(e['subjectOf'][0])
1623 elif is_type(e, 'VideoObject', 'AudioObject'):
1624 extract_video_object(e)
1625 if expected_type is None:
1626 continue
1627 else:
1628 break
1629 video = e.get('video')
1630 if is_type(video, 'VideoObject'):
1631 extract_video_object(video)
1632 if expected_type is None:
1633 continue
1634 else:
1635 break
1636
1637 traverse_json_ld(json_ld)
1638 return filter_dict(info)
1639
1640 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1641 return self._parse_json(
1642 self._search_regex(
1643 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1644 webpage, 'next.js data', fatal=fatal, **kw),
1645 video_id, transform_source=transform_source, fatal=fatal)
1646
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    The payload looks like `(function(a,b,...){return {...}}(val_a,val_b,...))`.
    The returned object literal is converted to JSON with every parameter name
    replaced by the call argument at the same position.

    @param context_name  Name of the JS variable/function that holds the data
    @param fatal         Passed on to `_parse_json`; when false, a payload that
                         cannot be found yields {}
    @param traverse      `traverse_obj` path applied to the parsed data
    @returns             The traversed data, or {} if it is empty/missing
    """
    def split_arguments(text):
        # Split on top-level commas only. A plain `text.split(',')` would also cut
        # through string literals and nested arrays/objects that contain commas,
        # misaligning every following parameter
        pieces, start, depth, quote, escaped = [], 0, 0, None, False
        for pos, char in enumerate(text):
            if quote:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == quote:
                    quote = None
            elif char in '"\'`':
                quote = char
            elif char in '([{':
                depth += 1
            elif char in ')]}':
                depth = max(depth - 1, 0)
            elif char == ',' and not depth:
                pieces.append(text[start:pos])
                start = pos + 1
        pieces.append(text[start:])
        return pieces

    rectx = re.escape(context_name)
    FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
    js, arg_keys, arg_vals = self._search_regex(
        (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
        default=NO_DEFAULT if fatal else (None, None, None))
    if js is None:
        return {}

    args = {}
    for key, val in zip(arg_keys.split(','), split_arguments(arg_vals)):
        key, val = key.strip(), val.strip()
        # JS `undefined` has no JSON equivalent
        args[key] = 'null' if val in ('undefined', 'void 0') else val

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
    return traverse_obj(ret, traverse) or {}
1666
@staticmethod
def _hidden_inputs(html):
    """Collect the values of all <input> elements of type "hidden" or "submit" in `html`

    Inputs are keyed by their `name` attribute, falling back to `id`.
    Inputs without a name/id or without a `value` attribute are skipped.

    @returns  dict mapping input name => value
    """
    # Drop HTML comments so that commented-out inputs are not picked up.
    # NB: "." does not match newlines here, so only single-line comments are removed
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
1682
1683 def _form_hidden_inputs(self, form_id, html):
1684 form = self._search_regex(
1685 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1686 html, '%s form' % form_id, group='form')
1687 return self._hidden_inputs(form)
1688
1689 class FormatSort:
1690 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
1691
1692 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
1693 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
1694 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
1695 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
1696 'height', 'width', 'proto', 'vext', 'abr', 'aext',
1697 'fps', 'fs_approx', 'source', 'id')
1698
1699 settings = {
1700 'vcodec': {'type': 'ordered', 'regex': True,
1701 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
1702 'acodec': {'type': 'ordered', 'regex': True,
1703 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
1704 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
1705 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
1706 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
1707 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
1708 'vext': {'type': 'ordered', 'field': 'video_ext',
1709 'order': ('mp4', 'webm', 'flv', '', 'none'),
1710 'order_free': ('webm', 'mp4', 'flv', '', 'none')},
1711 'aext': {'type': 'ordered', 'field': 'audio_ext',
1712 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
1713 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
1714 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1715 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
1716 'field': ('vcodec', 'acodec'),
1717 'function': lambda it: int(any(v != 'none' for v in it))},
1718 'ie_pref': {'priority': True, 'type': 'extractor'},
1719 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1720 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1721 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
1722 'quality': {'convert': 'float', 'default': -1},
1723 'filesize': {'convert': 'bytes'},
1724 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
1725 'id': {'convert': 'string', 'field': 'format_id'},
1726 'height': {'convert': 'float_none'},
1727 'width': {'convert': 'float_none'},
1728 'fps': {'convert': 'float_none'},
1729 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
1730 'tbr': {'convert': 'float_none'},
1731 'vbr': {'convert': 'float_none'},
1732 'abr': {'convert': 'float_none'},
1733 'asr': {'convert': 'float_none'},
1734 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
1735
1736 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1737 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1738 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1739 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
1740 'res': {'type': 'multiple', 'field': ('height', 'width'),
1741 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
1742
1743 # Actual field names
1744 'format_id': {'type': 'alias', 'field': 'id'},
1745 'preference': {'type': 'alias', 'field': 'ie_pref'},
1746 'language_preference': {'type': 'alias', 'field': 'lang'},
1747 'source_preference': {'type': 'alias', 'field': 'source'},
1748 'protocol': {'type': 'alias', 'field': 'proto'},
1749 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
1750 'audio_channels': {'type': 'alias', 'field': 'channels'},
1751
1752 # Deprecated
1753 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
1754 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
1755 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
1756 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
1757 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
1758 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
1759 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
1760 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
1761 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
1762 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
1763 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
1764 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
1765 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
1766 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
1767 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1768 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
1769 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1770 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
1771 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1772 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
1773 }
1774
1775 def __init__(self, ie, field_preference):
1776 self._order = []
1777 self.ydl = ie._downloader
1778 self.evaluate_params(self.ydl.params, field_preference)
1779 if ie.get_param('verbose'):
1780 self.print_verbose_info(self.ydl.write_debug)
1781
1782 def _get_field_setting(self, field, key):
1783 if field not in self.settings:
1784 if key in ('forced', 'priority'):
1785 return False
1786 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
1787 'deprecated and may be removed in a future version')
1788 self.settings[field] = {}
1789 propObj = self.settings[field]
1790 if key not in propObj:
1791 type = propObj.get('type')
1792 if key == 'field':
1793 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
1794 elif key == 'convert':
1795 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
1796 else:
1797 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
1798 propObj[key] = default
1799 return propObj[key]
1800
1801 def _resolve_field_value(self, field, value, convertNone=False):
1802 if value is None:
1803 if not convertNone:
1804 return None
1805 else:
1806 value = value.lower()
1807 conversion = self._get_field_setting(field, 'convert')
1808 if conversion == 'ignore':
1809 return None
1810 if conversion == 'string':
1811 return value
1812 elif conversion == 'float_none':
1813 return float_or_none(value)
1814 elif conversion == 'bytes':
1815 return FileDownloader.parse_bytes(value)
1816 elif conversion == 'order':
1817 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
1818 use_regex = self._get_field_setting(field, 'regex')
1819 list_length = len(order_list)
1820 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1821 if use_regex and value is not None:
1822 for i, regex in enumerate(order_list):
1823 if regex and re.match(regex, value):
1824 return list_length - i
1825 return list_length - empty_pos # not in list
1826 else: # not regex or value = None
1827 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1828 else:
1829 if value.isnumeric():
1830 return float(value)
1831 else:
1832 self.settings[field]['convert'] = 'string'
1833 return value
1834
def evaluate_params(self, params, sort_extractor):
    """Build `self._order` and the per-field sort settings

    Sort items are processed in this order: forced default fields, priority
    default fields (unless "format_sort_force" is set), the user's
    "format_sort" items, `sort_extractor` and finally all default fields.
    The first item that mentions a field decides its settings.

    @param params          Downloader parameters
    @param sort_extractor  Sort items requested by the extractor
    @raises ExtractorError if an item does not match `self.regex`
    """
    self._use_free_order = params.get('prefer_free_formats', False)
    self._sort_user = params.get('format_sort', [])
    self._sort_extractor = sort_extractor

    def register(name, reverse, closest, limit_text):
        name = name.lower()
        if name in self._order:
            return  # already configured by an earlier item
        self._order.append(name)
        limit = self._resolve_field_value(name, limit_text)
        self.settings.setdefault(name, {}).update({
            'reverse': reverse,
            # "closest" is meaningless without a limit to be close to
            'closest': closest if limit is not None else False,
            'limit_text': limit_text,
            'limit': limit,
        })

    forced = tuple(name for name in self.default if self._get_field_setting(name, 'forced'))
    if params.get('format_sort_force', False):
        priority = ()
    else:
        priority = tuple(name for name in self.default if self._get_field_setting(name, 'priority'))
    sort_items = (*forced, *priority, *self._sort_user, *sort_extractor, *self.default)

    for item in sort_items:
        parsed = re.match(self.regex, item)
        if parsed is None:
            raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
        field = parsed.group('field')
        if field is None:
            continue
        if self._get_field_setting(field, 'type') == 'alias':
            alias = field
            field = self._get_field_setting(alias, 'field')
            if self._get_field_setting(alias, 'deprecated'):
                self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                            f'be removed in a future version. Please use {field} instead')
        reverse = parsed.group('reverse') is not None
        closest = parsed.group('separator') == '~'
        limit_text = parsed.group('limit')

        is_combined = self._get_field_setting(field, 'type') == 'combined'
        # A combined field takes one ":"-separated limit per sub-field,
        # unless all of its sub-fields share the same limit
        split_limits = (limit_text is not None and is_combined
                        and not self._get_field_setting(field, 'same_limit'))
        targets = self._get_field_setting(field, 'field') if is_combined else (field,)
        limits = limit_text.split(':') if split_limits else None

        for idx, target in enumerate(targets):
            if not split_limits:
                own_limit = limit_text  # None when the item has no limit
            elif idx < len(limits):
                own_limit = limits[idx]
            else:
                own_limit = None
            register(target, reverse, closest, own_limit)
1890
def print_verbose_info(self, write_debug):
    """Report the sort orders given by the user and the extractor, and the resulting order

    @param write_debug  Function that is called with each message
    """
    def describe(field):
        sign = '+' if self._get_field_setting(field, 'reverse') else ''
        limit_part = ''
        if self._get_field_setting(field, 'limit_text') is not None:
            limit_part = '%s%s(%s)' % (
                '~' if self._get_field_setting(field, 'closest') else ':',
                self._get_field_setting(field, 'limit_text'),
                self._get_field_setting(field, 'limit'))
        return '%s%s%s' % (sign, field, limit_part)

    if self._sort_user:
        write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
    if self._sort_extractor:
        write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))

    shown = [field for field in self._order if self._get_field_setting(field, 'visible')]
    write_debug('Formats sorted by: %s' % ', '.join(map(describe, shown)))
1903
def _calculate_field_preference_from_value(self, format, field, type, value):
    """Turn the raw `value` of `field` into a tuple usable as (part of) a sort key

    The first element of the result groups the values:
        -10 for a missing value, 1 for a non-numeric value,
        0 for a numeric value within the limit (or when no limit applies)
        and -1 for a numeric value on the wrong side of the limit
    """
    reverse, closest, limit = (
        self._get_field_setting(field, key) for key in ('reverse', 'closest', 'limit'))

    if type == 'extractor':
        maximum = self._get_field_setting(field, 'max')
        if value is None or (maximum is not None and value >= maximum):
            value = -1
    elif type == 'boolean':
        in_list = self._get_field_setting(field, 'in_list')
        not_in_list = self._get_field_setting(field, 'not_in_list')
        rejected = ((in_list is not None and value not in in_list)
                    or (not_in_list is not None and value in not_in_list))
        value = -1 if rejected else 0
    elif type == 'ordered':
        value = self._resolve_field_value(field, value, True)

    # try to convert to number
    val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
    is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
    if is_num:
        value = val_num

    if value is None:
        return (-10, 0)
    if not is_num:
        # if a field has mixed strings and numbers, strings are sorted higher
        return (1, value, 0)
    if closest:
        return (0, -abs(value - limit), value - limit if reverse else limit - value)
    if not reverse and (limit is None or value <= limit):
        return (0, value, 0)
    if limit is None or (reverse and value == limit) or value > limit:
        return (0, -value, 0)
    return (-1, value, 0)
1932
1933 def _calculate_field_preference(self, format, field):
1934 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1935 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
1936 if type == 'multiple':
1937 type = 'field' # Only 'field' is allowed in multiple for now
1938 actual_fields = self._get_field_setting(field, 'field')
1939
1940 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
1941 else:
1942 value = get_value(field)
1943 return self._calculate_field_preference_from_value(format, field, type, value)
1944
def calculate_preference(self, format):
    """Return the sort key of `format`: one entry per field in `self._order`

    As a side-effect, missing "protocol", "ext", "video_ext"/"audio_ext"
    and bitrate ("tbr"/"vbr"/"abr") values are filled in on `format`.
    """
    # Determine missing protocol
    if not format.get('protocol'):
        format['protocol'] = determine_protocol(format)

    # Determine missing ext
    if not format.get('ext') and 'url' in format:
        format['ext'] = determine_ext(format['url'])
    if format.get('vcodec') == 'none':
        format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
        format['video_ext'] = 'none'
    else:
        format['video_ext'] = format['ext']
        format['audio_ext'] = 'none'
    # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
    #     format['preference'] = -1000

    # Determine missing bitrates
    tbr = format.get('tbr')
    if tbr is None:
        vbr, abr = format.get('vbr'), format.get('abr')
        if vbr is not None and abr is not None:
            format['tbr'] = vbr + abr
    else:
        # NB: `format.get(key, 0)` is not enough here since extractors commonly
        # set these keys explicitly to None, which cannot be subtracted
        if format.get('vcodec') != 'none' and format.get('vbr') is None:
            format['vbr'] = tbr - (format.get('abr') or 0)
        if format.get('acodec') != 'none' and format.get('abr') is None:
            format['abr'] = tbr - (format.get('vbr') or 0)

    return tuple(self._calculate_field_preference(format, field) for field in self._order)
1973
1974 def _sort_formats(self, formats, field_preference=[]):
1975 if not formats:
1976 return
1977 formats.sort(key=self.FormatSort(self, field_preference).calculate_preference)
1978
1979 def _check_formats(self, formats, video_id):
1980 if formats:
1981 formats[:] = filter(
1982 lambda f: self._is_valid_url(
1983 f['url'], video_id,
1984 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1985 formats)
1986
1987 @staticmethod
1988 def _remove_duplicate_formats(formats):
1989 format_urls = set()
1990 unique_formats = []
1991 for f in formats:
1992 if f['url'] not in format_urls:
1993 format_urls.add(f['url'])
1994 unique_formats.append(f)
1995 formats[:] = unique_formats
1996
1997 def _is_valid_url(self, url, video_id, item='video', headers={}):
1998 url = self._proto_relative_url(url, scheme='http:')
1999 # For now assume non HTTP(S) URLs always valid
2000 if not (url.startswith('http://') or url.startswith('https://')):
2001 return True
2002 try:
2003 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
2004 return True
2005 except ExtractorError as e:
2006 self.to_screen(
2007 '%s: %s URL is invalid, skipping: %s'
2008 % (video_id, item, error_to_compat_str(e.cause)))
2009 return False
2010
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
2017
def _proto_relative_url(self, url, scheme=None):
    """Sanitize `url` with `sanitize_url`, using `scheme` as the scheme to apply

    @param scheme  Scheme including the trailing colon, e.g. "http:".
                   Defaults to `http_scheme()`
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    # sanitize_url is given the scheme without the trailing colon
    return sanitize_url(url, scheme=scheme[:-1])
2022
2023 def _sleep(self, timeout, video_id, msg_template=None):
2024 if msg_template is None:
2025 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
2026 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
2027 self.to_screen(msg)
2028 time.sleep(timeout)
2029
2030 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
2031 transform_source=lambda s: fix_xml_ampersands(s).strip(),
2032 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
2033 res = self._download_xml_handle(
2034 manifest_url, video_id, 'Downloading f4m manifest',
2035 'Unable to download f4m manifest',
2036 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
2037 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
2038 transform_source=transform_source,
2039 fatal=fatal, data=data, headers=headers, query=query)
2040 if res is False:
2041 return []
2042
2043 manifest, urlh = res
2044 manifest_url = urlh.geturl()
2045
2046 return self._parse_f4m_formats(
2047 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
2048 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
2049
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """Extract the formats described by the parsed f4m `manifest`

    Media entries that point to another f4m or to an m3u8 manifest are
    extracted recursively through `_extract_f4m_formats`/`_extract_m3u8_formats`.

    @param manifest      The manifest's root element
    @param manifest_url  URL of the manifest; relative media URLs are resolved
                         against it when the manifest declares no base URL
    @returns             List of formats. Empty if the manifest requires
                         Akamai player verification or has no usable media
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # The text of an empty <pv-2.0/> element is None
    if akamai_pv is not None and akamai_pv.text and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # Kept separate from `manifest_url` so that the relative URLs of all
        # media entries are resolved against the manifest itself, and not
        # against the URL computed for the previous entry
        media_manifest_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            media_manifest_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(media_manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    media_manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    media_manifest_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': media_manifest_url,
            'manifest_url': media_manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
2151
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """Build the format that represents the m3u8 manifest at `m3u8_url` as a whole

    Its preference is 100 lower than `preference` (or -100 if that is unset).
    """
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100

    meta_format = {'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext}
    meta_format.update({
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    })
    return meta_format
2163
def _report_ignoring_subs(self, name):
    """Warn that the subtitle tracks of the `name` manifest are being ignored

    `report_warning` is called with only_once=True.
    """
    message = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(message, only_once=True)
2169
2170 def _extract_m3u8_formats(self, *args, **kwargs):
2171 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
2172 if subs:
2173 self._report_ignoring_subs('HLS')
2174 return fmts
2175
2176 def _extract_m3u8_formats_and_subtitles(
2177 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
2178 preference=None, quality=None, m3u8_id=None, note=None,
2179 errnote=None, fatal=True, live=False, data=None, headers={},
2180 query={}):
2181
2182 res = self._download_webpage_handle(
2183 m3u8_url, video_id,
2184 note='Downloading m3u8 information' if note is None else note,
2185 errnote='Failed to download m3u8 information' if errnote is None else errnote,
2186 fatal=fatal, data=data, headers=headers, query=query)
2187
2188 if res is False:
2189 return [], {}
2190
2191 m3u8_doc, urlh = res
2192 m3u8_url = urlh.geturl()
2193
2194 return self._parse_m3u8_formats_and_subtitles(
2195 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
2196 preference=preference, quality=quality, m3u8_id=m3u8_id,
2197 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2198 headers=headers, query=query, video_id=video_id)
2199
2200 def _parse_m3u8_formats_and_subtitles(
2201 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2202 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2203 errnote=None, fatal=True, data=None, headers={}, query={},
2204 video_id=None):
2205 formats, subtitles = [], {}
2206
2207 has_drm = re.search('|'.join([
2208 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
2209 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
2210 ]), m3u8_doc)
2211
2212 def format_url(url):
2213 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2214
2215 if self.get_param('hls_split_discontinuity', False):
2216 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2217 if not m3u8_doc:
2218 if not manifest_url:
2219 return []
2220 m3u8_doc = self._download_webpage(
2221 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2222 note=False, errnote='Failed to download m3u8 playlist information')
2223 if m3u8_doc is False:
2224 return []
2225 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2226
2227 else:
2228 def _extract_m3u8_playlist_indices(*args, **kwargs):
2229 return [None]
2230
2231 # References:
2232 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2233 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2234 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2235
2236 # We should try extracting formats only from master playlists [1, 4.3.4],
2237 # i.e. playlists that describe available qualities. On the other hand
2238 # media playlists [1, 4.3.3] should be returned as is since they contain
2239 # just the media without qualities renditions.
2240 # Fortunately, master playlist can be easily distinguished from media
2241 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
2242 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2243 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
2244 # media playlist and MUST NOT appear in master playlist thus we can
2245 # clearly detect media playlist with this criterion.
2246
2247 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2248 formats = [{
2249 'format_id': join_nonempty(m3u8_id, idx),
2250 'format_index': idx,
2251 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2252 'ext': ext,
2253 'protocol': entry_protocol,
2254 'preference': preference,
2255 'quality': quality,
2256 'has_drm': has_drm,
2257 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2258
2259 return formats, subtitles
2260
2261 groups = {}
2262 last_stream_inf = {}
2263
2264 def extract_media(x_media_line):
2265 media = parse_m3u8_attributes(x_media_line)
2266 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2267 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2268 if not (media_type and group_id and name):
2269 return
2270 groups.setdefault(group_id, []).append(media)
2271 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2272 if media_type == 'SUBTITLES':
2273 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2274 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2275 # However, lack of URI has been spotted in the wild.
2276 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2277 if not media.get('URI'):
2278 return
2279 url = format_url(media['URI'])
2280 sub_info = {
2281 'url': url,
2282 'ext': determine_ext(url),
2283 }
2284 if sub_info['ext'] == 'm3u8':
2285 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2286 # files may contain is WebVTT:
2287 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2288 sub_info['ext'] = 'vtt'
2289 sub_info['protocol'] = 'm3u8_native'
2290 lang = media.get('LANGUAGE') or 'und'
2291 subtitles.setdefault(lang, []).append(sub_info)
2292 if media_type not in ('VIDEO', 'AUDIO'):
2293 return
2294 media_url = media.get('URI')
2295 if media_url:
2296 manifest_url = format_url(media_url)
2297 formats.extend({
2298 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2299 'format_note': name,
2300 'format_index': idx,
2301 'url': manifest_url,
2302 'manifest_url': m3u8_url,
2303 'language': media.get('LANGUAGE'),
2304 'ext': ext,
2305 'protocol': entry_protocol,
2306 'preference': preference,
2307 'quality': quality,
2308 'vcodec': 'none' if media_type == 'AUDIO' else None,
2309 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2310
2311 def build_stream_name():
2312 # Despite specification does not mention NAME attribute for
2313 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2314 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2315 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2316 stream_name = last_stream_inf.get('NAME')
2317 if stream_name:
2318 return stream_name
2319 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2320 # from corresponding rendition group
2321 stream_group_id = last_stream_inf.get('VIDEO')
2322 if not stream_group_id:
2323 return
2324 stream_group = groups.get(stream_group_id)
2325 if not stream_group:
2326 return stream_group_id
2327 rendition = stream_group[0]
2328 return rendition.get('NAME') or stream_group_id
2329
2330 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2331 # chance to detect video only formats when EXT-X-STREAM-INF tags
2332 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2333 for line in m3u8_doc.splitlines():
2334 if line.startswith('#EXT-X-MEDIA:'):
2335 extract_media(line)
2336
2337 for line in m3u8_doc.splitlines():
2338 if line.startswith('#EXT-X-STREAM-INF:'):
2339 last_stream_inf = parse_m3u8_attributes(line)
2340 elif line.startswith('#') or not line.strip():
2341 continue
2342 else:
2343 tbr = float_or_none(
2344 last_stream_inf.get('AVERAGE-BANDWIDTH')
2345 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2346 manifest_url = format_url(line.strip())
2347
2348 for idx in _extract_m3u8_playlist_indices(manifest_url):
2349 format_id = [m3u8_id, None, idx]
2350 # Bandwidth of live streams may differ over time thus making
2351 # format_id unpredictable. So it's better to keep provided
2352 # format_id intact.
2353 if not live:
2354 stream_name = build_stream_name()
2355 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2356 f = {
2357 'format_id': join_nonempty(*format_id),
2358 'format_index': idx,
2359 'url': manifest_url,
2360 'manifest_url': m3u8_url,
2361 'tbr': tbr,
2362 'ext': ext,
2363 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2364 'protocol': entry_protocol,
2365 'preference': preference,
2366 'quality': quality,
2367 }
2368 resolution = last_stream_inf.get('RESOLUTION')
2369 if resolution:
2370 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2371 if mobj:
2372 f['width'] = int(mobj.group('width'))
2373 f['height'] = int(mobj.group('height'))
2374 # Unified Streaming Platform
2375 mobj = re.search(
2376 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2377 if mobj:
2378 abr, vbr = mobj.groups()
2379 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2380 f.update({
2381 'vbr': vbr,
2382 'abr': abr,
2383 })
2384 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2385 f.update(codecs)
2386 audio_group_id = last_stream_inf.get('AUDIO')
2387 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2388 # references a rendition group MUST have a CODECS attribute.
2389 # However, this is not always respected. E.g. [2]
2390 # contains EXT-X-STREAM-INF tag which references AUDIO
2391 # rendition group but does not have CODECS and despite
2392 # referencing an audio group it represents a complete
2393 # (with audio and video) format. So, for such cases we will
2394 # ignore references to rendition groups and treat them
2395 # as complete formats.
2396 if audio_group_id and codecs and f.get('vcodec') != 'none':
2397 audio_group = groups.get(audio_group_id)
2398 if audio_group and audio_group[0].get('URI'):
2399 # TODO: update acodec for audio only formats with
2400 # the same GROUP-ID
2401 f['acodec'] = 'none'
2402 if not f.get('ext'):
2403 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2404 formats.append(f)
2405
2406 # for DailyMotion
2407 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2408 if progressive_uri:
2409 http_f = f.copy()
2410 del http_f['manifest_url']
2411 http_f.update({
2412 'format_id': f['format_id'].replace('hls-', 'http-'),
2413 'protocol': 'http',
2414 'url': progressive_uri,
2415 })
2416 formats.append(http_f)
2417
2418 last_stream_inf = {}
2419 return formats, subtitles
2420
def _extract_m3u8_vod_duration(
        self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
    """Download an m3u8 playlist and return the duration computed from it.

    The playlist is fetched with ``fatal=False``; a falsy download result is
    replaced by an empty string before being handed to
    ``_parse_m3u8_vod_duration``, whose return value is passed through.
    ``note`` / ``errnote`` fall back to default messages only when they are
    ``None``.
    """
    if note is None:
        note = 'Downloading m3u8 VOD manifest'
    if errnote is None:
        errnote = 'Failed to download VOD manifest'

    playlist = self._download_webpage(
        m3u8_vod_url, video_id, note=note, errnote=errnote,
        fatal=False, data=data, headers=headers, query=query)
    if not playlist:
        playlist = ''

    return self._parse_m3u8_vod_duration(playlist, video_id)
2431
2432 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2433 if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
2434 return None
2435
2436 return int(sum(
2437 float(line[len('#EXTINF:'):].split(',')[0])
2438 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2439
2440 @staticmethod
2441 def _xpath_ns(path, namespace=None):
2442 if not namespace:
2443 return path
2444 out = []
2445 for c in path.split('/'):
2446 if not c or c == '.':
2447 out.append(c)
2448 else:
2449 out.append('{%s}%s' % (namespace, c))
2450 return '/'.join(out)
2451
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """Download a SMIL document and parse it into ``(formats, subtitles)``.

    Returns ``([], {})`` when ``_download_smil`` returns False. After the
    download, the URL reported by the response handle replaces ``smil_url``
    for parsing.
    """
    download = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if download is False:
        assert not fatal
        return [], {}

    smil_doc, handle = download
    final_url = handle.geturl()
    ns = self._parse_smil_namespace(smil_doc)

    return (
        self._parse_smil_formats(
            smil_doc, final_url, video_id, namespace=ns, f4m_params=f4m_params),
        self._parse_smil_subtitles(smil_doc, namespace=ns),
    )
2469
def _extract_smil_formats(self, *args, **kwargs):
    """Formats-only variant of ``_extract_smil_formats_and_subtitles``.

    All arguments are forwarded unchanged. If any subtitles were found,
    ``_report_ignoring_subs('SMIL')`` is called and they are dropped.
    """
    formats, dropped_subtitles = self._extract_smil_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('SMIL')
    return formats
2475
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download a SMIL document and return the info dict built by ``_parse_smil``.

    Returns an empty dict when ``_download_smil`` returns False. The URL
    reported by the response handle is the one passed on to the parser.
    """
    download = self._download_smil(smil_url, video_id, fatal=fatal)
    if download is False:
        return {}

    smil_doc, handle = download
    return self._parse_smil(
        smil_doc, handle.geturl(), video_id, f4m_params=f4m_params)
2485
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch the SMIL file via ``_download_xml_handle`` with SMIL-specific messages.

    The result of ``_download_xml_handle`` is returned as is.
    """
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml_handle(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
2490
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    Formats and subtitles are parsed using the passed ``video_id``; the
    'id' of the returned dict is instead derived from the basename of
    ``smil_url`` with its extension removed. Title, description and upload
    date come from head/meta elements (the first usable value wins), and
    the title falls back to the derived id.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    # From here on the id is taken from the manifest file name
    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        attrs = meta.attrib
        name, content = attrs.get('name'), attrs.get('content')
        if not (name and content):
            continue
        if name == 'title':
            if not title:
                title = content
        elif name in ('description', 'abstract'):
            if not description:
                description = content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2530
def _parse_smil_namespace(self, smil):
    """Return the namespace captured from a ``{namespace}smil`` root tag.

    The match is case-insensitive; ``default=None`` is passed to
    ``_search_regex`` for the no-match case.
    """
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
2534
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Collect format dicts from the video, audio and imagestream elements of a SMIL document.

    Each distinct ``src`` is handled once. RTMP sources become a single
    format; m3u8, f4m, mpd and ism sources are delegated to the matching
    ``_extract_*_formats`` method (non-fatal); other http(s) sources are
    added directly when ``_is_valid_url`` accepts them. Imagestream
    elements are appended last as storyboard formats.

    ``transform_rtmp_url``, when given, is called with ``(streamer, src)``
    and must return the replacement pair used for 'url' and 'play_path'.
    """
    def ns(path):
        return self._xpath_ns(path, namespace)

    # The first head/meta element carrying a base/httpBase attribute overrides
    # the manifest URL as the base for relative sources
    base = next(filter(None, (
        meta.get('base') or meta.get('httpBase')
        for meta in smil.findall(ns('./head/meta')))), smil_url)

    formats = []
    seen_srcs = set()
    rtmp_count = http_count = m3u8_count = imgs_count = 0

    for medium in smil.findall(ns('.//video')) + smil.findall(ns('.//audio')):
        src = medium.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            rtmp_format = {
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            }
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                rtmp_format.update({
                    'url': streamer,
                    'play_path': src,
                })
            formats.append(rtmp_format)
            continue

        src_url = (src if src.startswith('http') else urllib.parse.urljoin(base, src)).strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A lone variant takes the bitrate and dimensions from the SMIL element
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            # Defaults are installed once and then reused for later f4m sources
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            separator = '&' if '?' in src_url else '?'
            f4m_url = src_url + separator + urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                src_url, video_id, mpd_id='dash', fatal=False))
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            formats.extend(self._extract_ism_formats(
                src_url, video_id, ism_id='mss', fatal=False))
        # NOTE(review): the raw `src`, not the resolved `src_url`, is what gets
        # validated here - confirm this is intended
        elif src_url.startswith('http') and self._is_valid_url(src, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for imagestream in smil.findall(ns('.//imagestream')):
        src = imagestream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(imagestream.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(imagestream.get('width')),
            'height': int_or_none(imagestream.get('height')),
            'format_note': 'SMIL storyboards',
        })

    return formats
2648
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitles from the textstream elements of a SMIL document.

    Returns a dict mapping language to a list of ``{'url', 'ext'}`` dicts.
    Each distinct ``src`` is used once. The language is the first non-empty
    of the systemLanguage, systemLanguageName and lang attributes, falling
    back to ``subtitles_lang``.
    """
    seen_srcs = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        ext = textstream.get('ext')
        if not ext:
            ext = mimetype2ext(textstream.get('type')) or determine_ext(src)
        lang = next(filter(None, (
            textstream.get(attr)
            for attr in ('systemLanguage', 'systemLanguageName', 'lang'))), subtitles_lang)

        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
2664
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return the entries built by ``_parse_xspf``.

    Returns an empty list when the download helper returns False. The URL
    reported by the response handle is used both as the manifest URL and,
    through ``base_url``, as the base for track locations.
    """
    download = self._download_xml_handle(
        xspf_url, playlist_id, 'Downloading xpsf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if download is False:
        return []

    xspf_doc, handle = download
    final_url = handle.geturl()

    return self._parse_xspf(
        xspf_doc, playlist_id, xspf_url=final_url,
        xspf_base_url=base_url(final_url))
2678
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn the tracks of a parsed XSPF document into a list of entry dicts.

    Every entry uses ``playlist_id`` as its 'id'; the title also falls back
    to ``playlist_id``. Locations are joined with ``xspf_base_url`` and
    skipped when the join yields nothing. The track duration is divided by
    1000 via ``float_or_none``.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    def track_formats(track):
        found = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if not format_url:
                continue
            found.append({
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            })
        self._sort_formats(found)
        return found

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        entries.append({
            'id': playlist_id,
            'title': xpath_text(
                track, ns('./xspf:title'), 'title', default=playlist_id),
            'description': xpath_text(
                track, ns('./xspf:annotation'), 'description'),
            'thumbnail': xpath_text(
                track, ns('./xspf:image'), 'thumbnail'),
            'duration': float_or_none(
                xpath_text(track, ns('./xspf:duration'), 'duration'), 1000),
            'formats': track_formats(track),
        })
    return entries
2719
def _extract_mpd_formats(self, *args, **kwargs):
    """Formats-only variant of ``_extract_mpd_formats_and_subtitles``.

    All arguments are forwarded unchanged. If any subtitles were found,
    ``_report_ignoring_subs('DASH')`` is called and they are dropped.
    """
    formats, dropped_subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2725
def _extract_mpd_formats_and_subtitles(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """Download an MPD manifest and parse it into ``(formats, subtitles)``.

    Returns ``([], {})`` when the download helper returns False or yields
    no document. ``note`` / ``errnote`` fall back to default messages only
    when they are ``None``.
    """
    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    download = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if download is False:
        return [], {}
    mpd_doc, handle = download
    if mpd_doc is None:
        return [], {}

    # A redirect may have happened while fetching the manifest, so the URL
    # reported by the response handle is the one used from here on.
    final_url = handle.geturl()

    return self._parse_mpd_formats_and_subtitles(
        mpd_doc, mpd_id, base_url(final_url), final_url)
2746
def _parse_mpd_formats(self, *args, **kwargs):
    """Formats-only variant of ``_parse_mpd_formats_and_subtitles``.

    All arguments are forwarded unchanged. If any subtitles were found,
    ``_report_ignoring_subs('DASH')`` is called and they are dropped.
    """
    formats, dropped_subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2752
def _parse_mpd_formats_and_subtitles(
        self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.

    Walks every Period / AdaptationSet / Representation of the manifest and
    builds one format (video, audio, jpeg storyboard) or one subtitle entry
    (text) per Representation. Segment information is inherited from
    Period to AdaptationSet to Representation.

    @param mpd_doc       Root element of the parsed MPD document
    @param mpd_id        Optional prefix prepended to every format id
    @param mpd_base_url  Base used to resolve BaseURLs that are not
                         absolute http(s) URLs
    @param mpd_url       URL of the manifest; stored as 'manifest_url' and
                         used as the format URL of fragmented formats
    @returns             (formats, subtitles); both are empty when the
                         'dynamic_mpd' param is disabled and the manifest
                         type is 'dynamic'

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            return [], {}

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from a copy of the parent's info so that values found on
        # this element override the inherited ones without touching them
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats, subtitles = [], {}
    stream_numbers = collections.defaultdict(int)
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Representation attributes override those of the AdaptationSet
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Concatenate BaseURLs from the innermost element outwards
                # until an absolute http(s) URL is obtained
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First of, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    t = ''
                    in_template = False
                    for c in tmpl:
                        t += c
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            t += c
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally, unescape $$ into a literal $. str.replace returns
                    # a new string, so the result has to be assigned back
                    t = t.replace('$$', '$')
                    return t

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            # Reads the current segment_time / segment_d /
                            # segment_number of the enclosing scope
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for s in representation_ms_info['s']:
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for r in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for r in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    # Number the streams that share one URL in order of appearance
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    formats.append(f)
                elif content_type == 'text':
                    subtitles.setdefault(lang or 'und', []).append(f)

    return formats, subtitles
3090
def _extract_ism_formats(self, *args, **kwargs):
    """Formats-only variant of ``_extract_ism_formats_and_subtitles``.

    All arguments are forwarded unchanged. If any subtitles were found,
    ``_report_ignoring_subs('ISM')`` is called and they are dropped.
    """
    formats, dropped_subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if dropped_subtitles:
        self._report_ignoring_subs('ISM')
    return formats
3096
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and parse it into ``(formats, subtitles)``.

    Returns ``([], {})`` when the download helper returns False or yields
    no document. The URL reported by the response handle is the one passed
    to the parser. ``note`` / ``errnote`` fall back to default messages
    only when they are ``None``.
    """
    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    download = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if download is False:
        return [], {}
    ism_doc, handle = download
    if ism_doc is None:
        return [], {}

    return self._parse_ism_formats_and_subtitles(ism_doc, handle.geturl(), ism_id)
3110
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats and subtitles from an ISM (Smooth Streaming) manifest.

    @param ism_doc  Root element of the parsed manifest XML
    @param ism_url  URL of the manifest; track URL patterns are joined to it
    @param ism_id   Optional prefix for the generated format ids
    @returns        A (formats, subtitles) tuple; both are empty for
                    manifests with IsLive="TRUE"

    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        # The <c> chunk list belongs to the stream, so look it up only once
        # instead of once per quality level
        stream_fragments = stream.findall('c')
        for track in stream.findall('QualityLevel'):
            KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # No explicit duration: derive it from the start time of
                    # the NEXT chunk in the list (index the list of chunks,
                    # not the children of the current chunk). Fall back to
                    # the manifest duration for the last chunk or when the
                    # next chunk carries no usable 't'.
                    try:
                        next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                    except (IndexError, KeyError, ValueError):
                        next_fragment_time = duration
                    # NOTE(review): true division may produce a float here,
                    # which then ends up in the fragment URL - confirm
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    # A <Protection> element in the manifest marks DRM
                    'has_drm': ism_doc.find('Protection') is not None,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
3221
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """Collect formats and subtitles from <video>/<audio>-like tags in a page.

    @param base_url  URL that relative media URLs are joined to; also set
                     as the Referer header of every format found
    @param webpage   HTML text to scan
    @param video_id  Passed through to the m3u8/mpd manifest extractors
    @returns         A list of dicts with 'formats', 'subtitles' and
                     'thumbnail' keys, one per media tag that yielded at
                     least one format or subtitle
    """
    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Split a "type" attribute into extension and codec info
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats): manifests (m3u8/mpd) are expanded
        # through the respective extractor, anything else becomes a single
        # direct-URL format
        type_info = type_info or {}
        full_url = absolute_url(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
                'ext': ext,
            }]
        return is_plain_url, formats

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags first; they have no inner content, hence the ''
    media_tags = [(media_tag, media_tag_name, media_type, '')
                  for media_tag, media_tag_name, media_type
                  in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        # Source given directly on the media tag itself
        src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if is_plain_url:
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        s_attr.get(lbl)
                        for lbl in ('label', 'title')
                        if str_or_none(s_attr.get(lbl))
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = (int_or_none(s_attr.get('height'))
                              or int_or_none(s_attr.get('res')))
                    if not width or not height:
                        # Fill missing dimensions from the label/title text
                        for lbl in labels:
                            resolution = parse_resolution(lbl)
                            if not resolution:
                                continue
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                    # First label that parses as a bitrate wins; None otherwise
                    for lbl in labels:
                        tbr = parse_bitrate(lbl)
                        if tbr:
                            break
                    else:
                        tbr = None
                    f.update({
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                    })
                    # Values from the direct-URL format (url, vcodec, ext)
                    # override what was derived from the attributes
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are treated like subtitles/captions
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        for f in media_info['formats']:
            f.setdefault('http_headers', {})['Referer'] = base_url
        # Tags that produced nothing usable are dropped
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
3347
3348 def _extract_akamai_formats(self, *args, **kwargs):
3349 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3350 if subs:
3351 self._report_ignoring_subs('akamai')
3352 return fmts
3353
3354 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3355 signed = 'hdnea=' in manifest_url
3356 if not signed:
3357 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3358 manifest_url = re.sub(
3359 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3360 '', manifest_url).strip('?')
3361
3362 formats = []
3363 subtitles = {}
3364
3365 hdcore_sign = 'hdcore=3.7.0'
3366 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3367 hds_host = hosts.get('hds')
3368 if hds_host:
3369 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3370 if 'hdcore=' not in f4m_url:
3371 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3372 f4m_formats = self._extract_f4m_formats(
3373 f4m_url, video_id, f4m_id='hds', fatal=False)
3374 for entry in f4m_formats:
3375 entry.update({'extra_param_to_segment_url': hdcore_sign})
3376 formats.extend(f4m_formats)
3377
3378 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3379 hls_host = hosts.get('hls')
3380 if hls_host:
3381 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3382 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3383 m3u8_url, video_id, 'mp4', 'm3u8_native',
3384 m3u8_id='hls', fatal=False)
3385 formats.extend(m3u8_formats)
3386 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3387
3388 http_host = hosts.get('http')
3389 if http_host and m3u8_formats and not signed:
3390 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3391 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3392 qualities_length = len(qualities)
3393 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3394 i = 0
3395 for f in m3u8_formats:
3396 if f['vcodec'] != 'none':
3397 for protocol in ('http', 'https'):
3398 http_f = f.copy()
3399 del http_f['manifest_url']
3400 http_url = re.sub(
3401 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3402 http_f.update({
3403 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3404 'url': http_url,
3405 'protocol': protocol,
3406 })
3407 formats.append(http_f)
3408 i += 1
3409
3410 return formats, subtitles
3411
3412 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3413 query = urllib.parse.urlparse(url).query
3414 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3415 mobj = re.search(
3416 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3417 url_base = mobj.group('url')
3418 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3419 formats = []
3420
3421 def manifest_url(manifest):
3422 m_url = f'{http_base_url}/{manifest}'
3423 if query:
3424 m_url += '?%s' % query
3425 return m_url
3426
3427 if 'm3u8' not in skip_protocols:
3428 formats.extend(self._extract_m3u8_formats(
3429 manifest_url('playlist.m3u8'), video_id, 'mp4',
3430 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3431 if 'f4m' not in skip_protocols:
3432 formats.extend(self._extract_f4m_formats(
3433 manifest_url('manifest.f4m'),
3434 video_id, f4m_id='hds', fatal=False))
3435 if 'dash' not in skip_protocols:
3436 formats.extend(self._extract_mpd_formats(
3437 manifest_url('manifest.mpd'),
3438 video_id, mpd_id='dash', fatal=False))
3439 if re.search(r'(?:/smil:|\.smil)', url_base):
3440 if 'smil' not in skip_protocols:
3441 rtmp_formats = self._extract_smil_formats(
3442 manifest_url('jwplayer.smil'),
3443 video_id, fatal=False)
3444 for rtmp_format in rtmp_formats:
3445 rtsp_format = rtmp_format.copy()
3446 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3447 del rtsp_format['play_path']
3448 del rtsp_format['ext']
3449 rtsp_format.update({
3450 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3451 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3452 'protocol': 'rtsp',
3453 })
3454 formats.extend([rtmp_format, rtsp_format])
3455 else:
3456 for protocol in ('rtmp', 'rtsp'):
3457 if protocol not in skip_protocols:
3458 formats.append({
3459 'url': f'{protocol}:{url_base}',
3460 'format_id': protocol,
3461 'protocol': protocol,
3462 })
3463 return formats
3464
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate and parse the options passed to `jwplayer(...).setup(...)`.

    @returns The parsed options if they form a dict; None when no setup
             call is found, parsing fails or the result is not a dict
    """
    setup_match = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if not setup_match:
        return None
    try:
        options = self._parse_json(
            setup_match.group('options'), video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        # Unparsable options are treated like a missing player
        return None
    return options if isinstance(options, dict) else None
3479
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in `webpage` and parse it into info dict(s).

    Extra positional and keyword arguments are forwarded to
    `_parse_jwplayer_data`.
    """
    player_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(player_data, video_id, *args, **kwargs)
3485
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert JWPlayer setup data into an info dict or a playlist result.

    @param jwplayer_data  Player options; legacy "flattened" layouts
                          (no 'playlist' / no 'sources') are normalised
    @param require_title  When true, a playlist item without 'title'
                          raises KeyError
    @returns              The single entry if exactly one was found,
                          otherwise `self.playlist_result(entries)`
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    playlist = jwplayer_data['playlist']
    if not isinstance(playlist, list):
        playlist = jwplayer_data['playlist'] = [playlist]

    entries = []
    for video_data in playlist:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        for track in (tracks if isinstance(tracks, list) else ()):
            if not isinstance(track, dict):
                continue
            kind = track.get('kind')
            is_text_track = (
                bool(kind) and isinstance(kind, str)
                and kind.lower() in ('captions', 'subtitles'))
            if not is_text_track:
                continue
            track_url = urljoin(base_url, track.get('file'))
            if not track_url:
                continue
            # Tracks without a label are filed under 'en'
            language = track.get('label') or 'en'
            subtitles.setdefault(language, []).append({
                'url': self._proto_relative_url(track_url)
            })

        title = video_data['title'] if require_title else video_data.get('title')
        entry = {
            'id': this_video_id,
            'title': unescapeHTML(title),
            'description': clean_html(video_data.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube:
            # Hand a lone YouTube source over to the matching extractor
            entry['_type'] = 'url_transparent'
            entry['url'] = formats[0]['url']
        else:
            self._sort_formats(formats)
            entry['formats'] = formats
        entries.append(entry)

    if len(entries) == 1:
        return entries[0]
    return self.playlist_result(entries)
3553
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn a JWPlayer 'sources' list into a list of format dicts.

    Non-dict sources, sources without a usable URL and repeated URLs are
    skipped. HLS, DASH and SMIL sources are expanded through the matching
    manifest extractor; everything else becomes a single format.
    """
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in seen_urls:
            continue
        seen_urls.add(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)

        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue
        if source_type == 'dash' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue
        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        height = int_or_none(source.get('height'))
        if height is None:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            height = int_or_none(self._search_regex(
                r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
                'height', default=None))
        a_format = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate'), scale=1000),
            'filesize': int_or_none(source.get('filesize')),
            'ext': ext,
        }
        if source_url.startswith('rtmp'):
            a_format['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            rtmp_url_parts = re.split(
                r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
            if len(rtmp_url_parts) == 3:
                rtmp_url, prefix, play_path = rtmp_url_parts
                a_format['url'] = rtmp_url
                a_format['play_path'] = prefix + play_path
            if rtmp_params:
                a_format.update(rtmp_params)
        formats.append(a_format)
    return formats
3618
3619 def _live_title(self, name):
3620 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3621 return name
3622
def _int(self, v, name, fatal=False, **kwargs):
    """Parse `v` with `int_or_none`, reporting a failure to parse.

    @param name    Field name used in the failure message
    @param fatal   Raise ExtractorError instead of only warning
    @param kwargs  Forwarded to `int_or_none`
    @returns       The parsed value, or None after a non-fatal failure
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3632
def _float(self, v, name, fatal=False, **kwargs):
    """Parse `v` with `float_or_none`, reporting a failure to parse.

    @param name    Field name used in the failure message
    @param fatal   Raise ExtractorError instead of only warning
    @param kwargs  Forwarded to `float_or_none`
    @returns       The parsed value, or None after a non-fatal failure
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3642
3643 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3644 path='/', secure=False, discard=False, rest={}, **kwargs):
3645 cookie = http.cookiejar.Cookie(
3646 0, name, value, port, port is not None, domain, True,
3647 domain.startswith('.'), path, True, secure, expire_time,
3648 discard, None, None, rest)
3649 self.cookiejar.set_cookie(cookie)
3650
def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    cookie_header = self._downloader._calc_cookies(url)
    return LenientSimpleCookie(cookie_header)
3654
3655 def _apply_first_set_cookie_header(self, url_handle, cookie):
3656 """
3657 Apply first Set-Cookie header instead of the last. Experimental.
3658
3659 Some sites (e.g. [1-3]) may serve two cookies under the same name
3660 in Set-Cookie header and expect the first (old) one to be set rather
3661 than second (new). However, as of RFC6265 the newer one cookie
3662 should be set into cookie store what actually happens.
3663 We will workaround this issue by resetting the cookie to
3664 the first one manually.
3665 1. https://new.vk.com/
3666 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3667 3. https://learning.oreilly.com/
3668 """
3669 for header, cookies in url_handle.headers.items():
3670 if header.lower() != 'set-cookie':
3671 continue
3672 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3673 cookie_value = re.search(
3674 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3675 if cookie_value:
3676 value, domain = cookie_value.groups()
3677 self._set_cookie(domain, cookie, value)
3678 break
3679
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the test cases declared directly on this class.

    Each yielded test dict gets its 'name' set to `cls.ie_key()`. Tests
    flagged 'only_matching' are skipped unless `include_onlymatching`.
    """
    # Do not look in super classes
    own_attributes = vars(cls)
    single_test = own_attributes.get('_TEST')
    if single_test:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        testcases = [single_test]
    else:
        testcases = own_attributes.get('_TESTS', [])

    for testcase in testcases:
        if not include_onlymatching and testcase.get('only_matching', False):
            continue
        testcase['name'] = cls.ie_key()
        yield testcase
3694
@classmethod
def get_webpage_testcases(cls):
    """Return the `_WEBPAGE_TESTS` declared directly on this class.

    Every test dict is tagged in place with 'name' set to `cls.ie_key()`.
    """
    webpage_tests = vars(cls).get('_WEBPAGE_TESTS', [])
    for testcase in webpage_tests:
        testcase['name'] = cls.ie_key()
    return webpage_tests
3701
@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases"""
    all_tests = (
        *cls.get_testcases(include_onlymatching=False),
        *cls.get_webpage_testcases(),
    )
    # Look at each test's own info_dict and at that of its first playlist item
    age_limit_path = (..., (('playlist', 0), None), 'info_dict', 'age_limit')
    found_limits = traverse_obj(all_tests, age_limit_path)
    return max(found_limits or [0])
3708
@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    def is_playlist_test(test):
        # A test describes a playlist if any of its keys starts with "playlist"
        return any(key.startswith('playlist') for key in test)

    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None
    if not any(is_playlist_test(test) for test in tests):
        return 'video'
    if all(is_playlist_test(test) for test in tests):
        return 'playlist'
    return 'any'
3720
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    assert cls.suitable(url), 'The URL must be suitable for the extractor'
    return_type = cls._RETURN_TYPE
    if return_type == 'video':
        return True
    if return_type == 'playlist':
        return False
    return None
3726
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
3731
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    parts = []
    if cls._NETRC_MACHINE:
        parts.append(
            f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' if markdown
            else f' [{cls._NETRC_MACHINE}]')
    if cls.IE_DESC is False:
        parts.append(' [HIDDEN]')
    elif cls.IE_DESC:
        parts.append(f' {cls.IE_DESC}')
    if cls.SEARCH_KEY:
        parts.append(f'; "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            # Show a randomly picked count prefix and example query
            _COUNTS = ('', '5', '10', 'all')
            parts.append(f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
    if not cls.working():
        parts.append(' (**Currently broken**)' if markdown else ' (Currently broken)')
    desc = ''.join(parts)

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME
    if not desc:
        return name
    return f'{name}:{desc}'
3756
def extract_subtitles(self, *args, **kwargs):
    """Return `_get_subtitles(...)` if subtitles were requested, else {}.

    Subtitles count as requested when the 'writesubtitles' or the
    'listsubtitles' param is set.
    """
    wanted = (self.get_param('writesubtitles', False)
              or self.get_param('listsubtitles'))
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
3762
3763 def _get_subtitles(self, *args, **kwargs):
3764 raise NotImplementedError('This method must be implemented by subclasses')
3765
class CommentsDisabled(Exception):
    """Raise in _get_comments if comments are disabled for the video

    extract_comments catches this and reports both 'comments' and
    'comment_count' as None.
    """
3768
def extract_comments(self, *args, **kwargs):
    """Return a callable that collects comments, or None if not requested.

    Nothing is fetched until the returned callable is invoked. It drains
    the generator returned by `_get_comments` and returns a dict with
    'comments' and 'comment_count'; the count is None when collection did
    not run to completion.
    """
    if not self.get_param('getcomments'):
        return None
    generator = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        finished = False
        try:
            while True:
                collected.append(next(generator))
        except StopIteration:
            finished = True
        except KeyboardInterrupt:
            # Keep whatever was collected so far
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        comment_count = len(collected)
        self.to_screen(f'Extracted {comment_count} comments')
        return {
            'comments': collected,
            'comment_count': comment_count if finished else None
        }
    return extractor
3797
3798 def _get_comments(self, *args, **kwargs):
3799 raise NotImplementedError('This method must be implemented by subclasses')
3800
3801 @staticmethod
3802 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3803 """ Merge subtitle items for one language. Items with duplicated URLs/data
3804 will be dropped. """
3805 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3806 ret = list(subtitle_list1)
3807 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3808 return ret
3809
3810 @classmethod
3811 def _merge_subtitles(cls, *dicts, target=None):
3812 """ Merge subtitle dictionaries, language by language. """
3813 if target is None:
3814 target = {}
3815 for d in dicts:
3816 for lang, subs in d.items():
3817 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3818 return target
3819
def extract_automatic_captions(self, *args, **kwargs):
    """Return `_get_automatic_captions(...)` if they were requested, else {}.

    Captions count as requested when the 'writeautomaticsub' or the
    'listsubtitles' param is set.
    """
    wanted = (self.get_param('writeautomaticsub', False)
              or self.get_param('listsubtitles'))
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
3825
3826 def _get_automatic_captions(self, *args, **kwargs):
3827 raise NotImplementedError('This method must be implemented by subclasses')
3828
3829 @functools.cached_property
3830 def _cookies_passed(self):
3831 """Whether cookies have been passed to YoutubeDL"""
3832 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3833
def mark_watched(self, *args, **kwargs):
    """Call `_mark_watched(...)` if requested and credentials are available.

    Requires the 'mark_watched' param, plus either login info (for
    extractors that support login) or cookies passed to YoutubeDL.
    """
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3839
3840 def _mark_watched(self, *args, **kwargs):
3841 raise NotImplementedError('This method must be implemented by subclasses')
3842
def geo_verification_headers(self):
    """Return a 'Ytdl-request-proxy' header dict for the geo verification proxy.

    The dict is empty when the 'geo_verification_proxy' param is not set.
    """
    proxy = self.get_param('geo_verification_proxy')
    if not proxy:
        return {}
    return {'Ytdl-request-proxy': proxy}
3849
3850 @staticmethod
3851 def _generic_id(url):
3852 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3853
3854 def _generic_title(self, url='', webpage='', *, default=None):
3855 return (self._og_search_title(webpage, default=None)
3856 or self._html_extract_title(webpage, default=None)
3857 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3858 or default)
3859
3860 @staticmethod
3861 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3862 all_known = all(map(
3863 lambda x: x is not None,
3864 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3865 return (
3866 'private' if is_private
3867 else 'premium_only' if needs_premium
3868 else 'subscriber_only' if needs_subscription
3869 else 'needs_auth' if needs_auth
3870 else 'unlisted' if is_unlisted
3871 else 'public' if all_known
3872 else None)
3873
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Extractor key to look the argument up under, given as a
                        string or as an object with an ie_key() method
                        (default: this extractor)
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(
        self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
3886
3887 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3888 if not playlist_id or not video_id:
3889 return not video_id
3890
3891 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3892 if no_playlist is not None:
3893 return not no_playlist
3894
3895 video_id = '' if video_id is True else f' {video_id}'
3896 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3897 if self.get_param('noplaylist'):
3898 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3899 return False
3900 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3901 return True
3902
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Report `err` through `RetryManager.report_retry`.

    Without an explicit `_count`, `int(fatal)` is passed as the count.
    For non-fatal errors `report_warning` is passed as the error callback.
    """
    sleep_func = self.get_param('retry_sleep_functions', {}).get('extractor')
    error_callback = None if fatal else self.report_warning
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen, warn=self.report_warning, error=error_callback,
        sleep_func=sleep_func)
3908
def RetryManager(self, **kwargs):
    """Create a RetryManager using the 'extractor_retries' param (default 3).

    Errors are reported through `_error_or_warning`; keyword arguments
    are forwarded to the RetryManager.
    """
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3911
def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Run the Generic extractor's embed detection for `url`.

    The URL is smuggled with this extractor's key under 'block_ies'.
    `note` is printed, prefixed with the display id or id from
    `info_dict` when available.
    """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    smuggled_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return generic_ie._extract_embeds(smuggled_url, *args, **kwargs)
3917
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of `_extract_from_webpage` with default extra info added.

    The class itself is used when its `_extract_from_webpage` is a bound
    method; otherwise an extractor instance is obtained from `ydl`.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
3926
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a url_result for every distinct embed URL found in `webpage`.

    The result is tied to this extractor unless `_VALID_URL` is False.
    """
    embed_urls = orderedSet(
        cls._extract_embed_urls(url, webpage) or [], lazy=True)
    for embed_url in embed_urls:
        target_ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, target_ie)
3932
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    if '_EMBED_URL_RE' not in cls.__dict__:
        # Compile this class's own patterns once and cache them on the class
        assert isinstance(cls._EMBED_REGEX, (list, tuple))
        for idx, regex in enumerate(cls._EMBED_REGEX):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(regex) for regex in cls._EMBED_REGEX)

    for compiled in cls._EMBED_URL_RE:
        for match in compiled.finditer(webpage):
            embed_url = urllib.parse.urljoin(url, unescapeHTML(match.group('url')))
            if cls._VALID_URL is not False and not cls.suitable(embed_url):
                continue
            yield embed_url
3948
class StopExtraction(Exception):
    """Exception type exposed on the extractor class.

    NOTE(review): presumably raised to abort an extraction early; where it
    is raised and caught is not visible in this part of the file.
    """
3951
@classmethod
def _extract_url(cls, webpage):  # TODO: Remove
    """Only for compatibility with some older extractors

    Returns the first URL produced by _extract_embed_urls (called with no
    base URL), or None when there is none.
    """
    embed_urls = cls._extract_embed_urls(None, webpage) or []
    for first_url in embed_urls:
        return first_url
    return None
3956
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Hook run on subclass creation; handles the `plugin_name` class keyword.

    When *plugin_name* is given, the new class:
      * records the next class in its MRO as `__wrapped__`,
      * takes the IE_NAME '<parent IE_NAME>+<plugin_name>' and the parent's ie_key,
      * replaces, in the defining module, the name of the class found at the
        end of the `__wrapped__` chain with itself.

    Remaining keyword arguments go to the parent implementation.
    """
    if plugin_name:
        mro = inspect.getmro(cls)
        parent = mro[mro.index(cls) + 1]
        cls.__wrapped__ = parent
        cls.IE_NAME = f'{parent.IE_NAME}+{plugin_name}'
        cls.ie_key = parent.ie_key

        # Follow the chain of wrapped classes down to the one at its end
        innermost = parent
        while getattr(innermost, '__wrapped__', None):
            innermost = innermost.__wrapped__
        defining_module = sys.modules[innermost.__module__]
        setattr(defining_module, innermost.__name__, cls)

    return super().__init_subclass__(**kwargs)
3968
3969
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.

    Matching "URLs" have the form `<_SEARCH_KEY><prefix>:<query>` where
    prefix is empty (one result), a positive integer N (N results) or
    `all` (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and optionally _MAX_RESULTS,
    and must override either _get_n_results or _search_results.
    """

    # Upper bound on the number of results; unbounded unless a subclass overrides it
    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        """Pattern built from _SEARCH_KEY, capturing `prefix` and `query`"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Parse the prefix of *query* and fetch the requested number of results"""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(f'invalid download number {n} for query "{query}"')

        max_results = self._MAX_RESULTS
        if n > max_results:
            # Warn and clamp instead of failing when more than the maximum is requested
            self.report_warning(
                '%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, max_results, n))
            n = max_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        results = self._search_results(query)
        # islice needs None, not infinity, to mean "no upper limit"
        stop = None if n == float('inf') else n
        return self.playlist_result(itertools.islice(results, 0, stop), query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        """Public read access to _SEARCH_KEY"""
        return cls._SEARCH_KEY
4013
4014
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose _real_extract always raises UnsupportedError for the URL."""

    # Pattern that accepts any URL
    _VALID_URL = '.*'
    # NOTE(review): presumably keeps this extractor out of the default set - confirm
    _ENABLED = False
    # NOTE(review): presumably hides it from extractor descriptions - confirm
    IE_DESC = False

    def _real_extract(self, url):
        """Unconditionally reject *url* by raising UnsupportedError"""
        raise UnsupportedError(url)