]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/common.py
[ie] Migrate commonly plural fields to lists (#8917)
[yt-dlp.git] / yt_dlp / extractor / common.py
1 import base64
2 import collections
3 import getpass
4 import hashlib
5 import http.client
6 import http.cookiejar
7 import http.cookies
8 import inspect
9 import itertools
10 import json
11 import math
12 import netrc
13 import os
14 import random
15 import re
16 import subprocess
17 import sys
18 import time
19 import types
20 import urllib.parse
21 import urllib.request
22 import xml.etree.ElementTree
23
24 from ..compat import functools # isort: split
25 from ..compat import (
26 compat_etree_fromstring,
27 compat_expanduser,
28 compat_os_name,
29 urllib_req_to_req,
30 )
31 from ..cookies import LenientSimpleCookie
32 from ..downloader.f4m import get_base_url, remove_encrypted_media
33 from ..downloader.hls import HlsFD
34 from ..networking import HEADRequest, Request
35 from ..networking.exceptions import (
36 HTTPError,
37 IncompleteRead,
38 network_exceptions,
39 )
40 from ..utils import (
41 IDENTITY,
42 JSON_LD_RE,
43 NO_DEFAULT,
44 ExtractorError,
45 FormatSorter,
46 GeoRestrictedError,
47 GeoUtils,
48 LenientJSONDecoder,
49 Popen,
50 RegexNotFoundError,
51 RetryManager,
52 UnsupportedError,
53 age_restricted,
54 base_url,
55 bug_reports_message,
56 classproperty,
57 clean_html,
58 deprecation_warning,
59 determine_ext,
60 dict_get,
61 encode_data_uri,
62 error_to_compat_str,
63 extract_attributes,
64 filter_dict,
65 fix_xml_ampersands,
66 float_or_none,
67 format_field,
68 int_or_none,
69 join_nonempty,
70 js_to_json,
71 mimetype2ext,
72 netrc_from_content,
73 orderedSet,
74 parse_bitrate,
75 parse_codecs,
76 parse_duration,
77 parse_iso8601,
78 parse_m3u8_attributes,
79 parse_resolution,
80 sanitize_filename,
81 sanitize_url,
82 smuggle_url,
83 str_or_none,
84 str_to_int,
85 strip_or_none,
86 traverse_obj,
87 truncate_string,
88 try_call,
89 try_get,
90 unescapeHTML,
91 unified_strdate,
92 unified_timestamp,
93 url_basename,
94 url_or_none,
95 urlhandle_detect_ext,
96 urljoin,
97 variadic,
98 xpath_element,
99 xpath_text,
100 xpath_with_ns,
101 )
102
103
104 class InfoExtractor:
105 """Information Extractor class.
106
107 Information extractors are the classes that, given a URL, extract
108 information about the video (or videos) the URL refers to. This
109 information includes the real video URL, the video title, author and
110 others. The information is stored in a dictionary which is then
111 passed to the YoutubeDL. The YoutubeDL processes this
112 information possibly downloading the video to the file system, among
113 other possible outcomes.
114
115 The type field determines the type of the result.
116 By far the most common value (and the default if _type is missing) is
117 "video", which indicates a single video.
118
119 For a video, the dictionaries must include the following fields:
120
121 id: Video identifier.
122 title: Video title, unescaped. Set to an empty string if video has
123 no title as opposed to "None" which signifies that the
124 extractor failed to obtain a title
125
126 Additionally, it must contain either a formats entry or a url one:
127
128 formats: A list of dictionaries for each format available, ordered
129 from worst to best quality.
130
131 Potential fields:
132 * url The mandatory URL representing the media:
133 for plain file media - HTTP URL of this file,
134 for RTMP - RTMP URL,
135 for HLS - URL of the M3U8 media playlist,
136 for HDS - URL of the F4M manifest,
137 for DASH
138 - HTTP URL to plain file media (in case of
139 unfragmented media)
140 - URL of the MPD manifest or base URL
141 representing the media if MPD manifest
142 is parsed from a string (in case of
143 fragmented media)
144 for MSS - URL of the ISM manifest.
145 * request_data Data to send in POST request to the URL
146 * manifest_url
147 The URL of the manifest file in case of
148 fragmented media:
149 for HLS - URL of the M3U8 master playlist,
150 for HDS - URL of the F4M manifest,
151 for DASH - URL of the MPD manifest,
152 for MSS - URL of the ISM manifest.
153 * manifest_stream_number (For internal use only)
154 The index of the stream in the manifest file
155 * ext Will be calculated from URL if missing
156 * format A human-readable description of the format
157 ("mp4 container with h264/opus").
158 Calculated from the format_id, width, height,
159 and format_note fields if missing.
160 * format_id A short description of the format
161 ("mp4_h264_opus" or "19").
162 Technically optional, but strongly recommended.
163 * format_note Additional info about the format
164 ("3D" or "DASH video")
165 * width Width of the video, if known
166 * height Height of the video, if known
167 * aspect_ratio Aspect ratio of the video, if known
168 Automatically calculated from width and height
169 * resolution Textual description of width and height
170 Automatically calculated from width and height
171 * dynamic_range The dynamic range of the video. One of:
172 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
173 * tbr Average bitrate of audio and video in KBit/s
174 * abr Average audio bitrate in KBit/s
175 * acodec Name of the audio codec in use
176 * asr Audio sampling rate in Hertz
177 * audio_channels Number of audio channels
178 * vbr Average video bitrate in KBit/s
179 * fps Frame rate
180 * vcodec Name of the video codec in use
181 * container Name of the container format
182 * filesize The number of bytes, if known in advance
183 * filesize_approx An estimate for the number of bytes
184 * player_url SWF Player URL (used for rtmpdump).
185 * protocol The protocol that will be used for the actual
186 download, lower-case. One of "http", "https" or
187 one of the protocols defined in downloader.PROTOCOL_MAP
188 * fragment_base_url
189 Base URL for fragments. Each fragment's path
190 value (if present) will be relative to
191 this URL.
192 * fragments A list of fragments of a fragmented media.
193 Each fragment entry must contain either an url
194 or a path. If an url is present it should be
195 considered by a client. Otherwise both path and
196 fragment_base_url must be present. Here is
197 the list of all potential fields:
198 * "url" - fragment's URL
199 * "path" - fragment's path relative to
200 fragment_base_url
201 * "duration" (optional, int or float)
202 * "filesize" (optional, int)
203 * is_from_start Is a live format that can be downloaded
204 from the start. Boolean
205 * preference Order number of this format. If this field is
206 present and not None, the formats get sorted
207 by this field, regardless of all other values.
208 -1 for default (order by other properties),
209 -2 or smaller for less than default.
210 < -1000 to hide the format (if there is
211 another one which is strictly better)
212 * language Language code, e.g. "de" or "en-US".
213 * language_preference Is this in the language mentioned in
214 the URL?
215 10 if it's what the URL is about,
216 -1 for default (don't know),
217 -10 otherwise, other values reserved for now.
218 * quality Order number of the video quality of this
219 format, irrespective of the file format.
220 -1 for default (order by other properties),
221 -2 or smaller for less than default.
222 * source_preference Order number for this video source
223 (quality takes higher priority)
224 -1 for default (order by other properties),
225 -2 or smaller for less than default.
226 * http_headers A dictionary of additional HTTP headers
227 to add to the request.
228 * stretched_ratio If given and not 1, indicates that the
229 video's pixels are not square.
230 width : height ratio as float.
231 * no_resume The server does not support resuming the
232 (HTTP or RTMP) download. Boolean.
233 * has_drm True if the format has DRM and cannot be downloaded.
234 'maybe' if the format may have DRM and has to be tested before download.
235 * extra_param_to_segment_url A query string to append to each
236 fragment's URL, or to update each existing query string
237 with. Only applied by the native HLS/DASH downloaders.
238 * hls_aes A dictionary of HLS AES-128 decryption information
239 used by the native HLS downloader to override the
240 values in the media playlist when an '#EXT-X-KEY' tag
241 is present in the playlist:
242 * uri The URI from which the key will be downloaded
243 * key The key (as hex) used to decrypt fragments.
244 If `key` is given, any key URI will be ignored
245 * iv The IV (as hex) used to decrypt fragments
246 * downloader_options A dictionary of downloader options
247 (For internal use only)
248 * http_chunk_size Chunk size for HTTP downloads
249 * ffmpeg_args Extra arguments for ffmpeg downloader
250 * is_dash_periods Whether the format is a result of merging
251 multiple DASH periods.
252 RTMP formats can also have the additional fields: page_url,
253 app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
254 rtmp_protocol, rtmp_real_time
255
256 url: Final video URL.
257 ext: Video filename extension.
258 format: The video format, defaults to ext (used for --get-format)
259 player_url: SWF Player URL (used for rtmpdump).
260
261 The following fields are optional:
262
263 direct: True if a direct video file was given (must only be set by GenericIE)
264 alt_title: A secondary title of the video.
265 display_id An alternative identifier for the video, not necessarily
266 unique, but available before title. Typically, id is
267 something like "4234987", title "Dancing naked mole rats",
268 and display_id "dancing-naked-mole-rats"
269 thumbnails: A list of dictionaries, with the following entries:
270 * "id" (optional, string) - Thumbnail format ID
271 * "url"
272 * "preference" (optional, int) - quality of the image
273 * "width" (optional, int)
274 * "height" (optional, int)
275 * "resolution" (optional, string "{width}x{height}",
276 deprecated)
277 * "filesize" (optional, int)
278 * "http_headers" (dict) - HTTP headers for the request
279 thumbnail: Full URL to a video thumbnail image.
280 description: Full video description.
281 uploader: Full name of the video uploader.
282 license: License name the video is licensed under.
283 creators: List of creators of the video.
284 timestamp: UNIX timestamp of the moment the video was uploaded
285 upload_date: Video upload date in UTC (YYYYMMDD).
286 If not explicitly set, calculated from timestamp
287 release_timestamp: UNIX timestamp of the moment the video was released.
288 If it is not clear whether to use timestamp or this, use the former
289 release_date: The date (YYYYMMDD) when the video was released in UTC.
290 If not explicitly set, calculated from release_timestamp
291 release_year: Year (YYYY) as integer when the video or album was released.
292 To be used if no exact release date is known.
293 If not explicitly set, calculated from release_date.
294 modified_timestamp: UNIX timestamp of the moment the video was last modified.
295 modified_date: The date (YYYYMMDD) when the video was last modified in UTC.
296 If not explicitly set, calculated from modified_timestamp
297 uploader_id: Nickname or id of the video uploader.
298 uploader_url: Full URL to a personal webpage of the video uploader.
299 channel: Full name of the channel the video is uploaded on.
300 Note that channel fields may or may not repeat uploader
301 fields. This depends on a particular extractor.
302 channel_id: Id of the channel.
303 channel_url: Full URL to a channel webpage.
304 channel_follower_count: Number of followers of the channel.
305 channel_is_verified: Whether the channel is verified on the platform.
306 location: Physical location where the video was filmed.
307 subtitles: The available subtitles as a dictionary in the format
308 {tag: subformats}. "tag" is usually a language code, and
309 "subformats" is a list sorted from lower to higher
310 preference, each element is a dictionary with the "ext"
311 entry and one of:
312 * "data": The subtitles file contents
313 * "url": A URL pointing to the subtitles file
314 It can optionally also have:
315 * "name": Name or description of the subtitles
316 * "http_headers": A dictionary of additional HTTP headers
317 to add to the request.
318 "ext" will be calculated from URL if missing
319 automatic_captions: Like 'subtitles'; contains automatically generated
320 captions instead of normal subtitles
321 duration: Length of the video in seconds, as an integer or float.
322 view_count: How many users have watched the video on the platform.
323 concurrent_view_count: How many users are currently watching the video on the platform.
324 like_count: Number of positive ratings of the video
325 dislike_count: Number of negative ratings of the video
326 repost_count: Number of reposts of the video
327 average_rating: Average rating given by users, the scale used depends on the webpage
328 comment_count: Number of comments on the video
329 comments: A list of comments, each with one or more of the following
330 properties (all but one of text or html optional):
331 * "author" - human-readable name of the comment author
332 * "author_id" - user ID of the comment author
333 * "author_thumbnail" - The thumbnail of the comment author
334 * "author_url" - The url to the comment author's page
335 * "author_is_verified" - Whether the author is verified
336 on the platform
337 * "author_is_uploader" - Whether the comment is made by
338 the video uploader
339 * "id" - Comment ID
340 * "html" - Comment as HTML
341 * "text" - Plain text of the comment
342 * "timestamp" - UNIX timestamp of comment
343 * "parent" - ID of the comment this one is replying to.
344 Set to "root" to indicate that this is a
345 comment to the original video.
346 * "like_count" - Number of positive ratings of the comment
347 * "dislike_count" - Number of negative ratings of the comment
348 * "is_favorited" - Whether the comment is marked as
349 favorite by the video uploader
350 * "is_pinned" - Whether the comment is pinned to
351 the top of the comments
352 age_limit: Age restriction for the video, as an integer (years)
353 webpage_url: The URL to the video webpage, if given to yt-dlp it
354 should allow to get the same result again. (It will be set
355 by YoutubeDL if it's missing)
356 categories: A list of categories that the video falls in, for example
357 ["Sports", "Berlin"]
358 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
359 cast: A list of the video cast
360 is_live: True, False, or None (=unknown). Whether this video is a
361 live stream that goes on instead of a fixed-length video.
362 was_live: True, False, or None (=unknown). Whether this video was
363 originally a live stream.
364 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
365 or 'post_live' (was live, but VOD is not yet processed)
366 If absent, automatically set from is_live, was_live
367 start_time: Time in seconds where the reproduction should start, as
368 specified in the URL.
369 end_time: Time in seconds where the reproduction should end, as
370 specified in the URL.
371 chapters: A list of dictionaries, with the following entries:
372 * "start_time" - The start time of the chapter in seconds
373 * "end_time" - The end time of the chapter in seconds
374 * "title" (optional, string)
375 heatmap: A list of dictionaries, with the following entries:
376 * "start_time" - The start time of the data point in seconds
377 * "end_time" - The end time of the data point in seconds
378 * "value" - The normalized value of the data point (float between 0 and 1)
379 playable_in_embed: Whether this video is allowed to play in embedded
380 players on other sites. Can be True (=always allowed),
381 False (=never allowed), None (=unknown), or a string
382 specifying the criteria for embedability; e.g. 'whitelist'
383 availability: Under what condition the video is available. One of
384 'private', 'premium_only', 'subscriber_only', 'needs_auth',
385 'unlisted' or 'public'. Use 'InfoExtractor._availability'
386 to set it
387 media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
388 _old_archive_ids: A list of old archive ids needed for backward compatibility
389 _format_sort_fields: A list of fields to use for sorting formats
390 __post_extractor: A function to be called just before the metadata is
391 written to either disk, logger or console. The function
392 must return a dict which will be added to the info_dict.
393 This is useful for additional information that is
394 time-consuming to extract. Note that the fields thus
395 extracted will not be available to output template and
396 match_filter. So, only "comments" and "comment_count" are
397 currently allowed to be extracted via this method.
398
399 The following fields should only be used when the video belongs to some logical
400 chapter or section:
401
402 chapter: Name or title of the chapter the video belongs to.
403 chapter_number: Number of the chapter the video belongs to, as an integer.
404 chapter_id: Id of the chapter the video belongs to, as a unicode string.
405
406 The following fields should only be used when the video is an episode of some
407 series, programme or podcast:
408
409 series: Title of the series or programme the video episode belongs to.
410 series_id: Id of the series or programme the video episode belongs to, as a unicode string.
411 season: Title of the season the video episode belongs to.
412 season_number: Number of the season the video episode belongs to, as an integer.
413 season_id: Id of the season the video episode belongs to, as a unicode string.
414 episode: Title of the video episode. Unlike mandatory video title field,
415 this field should denote the exact title of the video episode
416 without any kind of decoration.
417 episode_number: Number of the video episode within a season, as an integer.
418 episode_id: Id of the video episode, as a unicode string.
419
420 The following fields should only be used when the media is a track or a part of
421 a music album:
422
423 track: Title of the track.
424 track_number: Number of the track within an album or a disc, as an integer.
425 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
426 as a unicode string.
427 artists: List of artists of the track.
428 composers: List of composers of the piece.
429 genres: List of genres of the track.
430 album: Title of the album the track belongs to.
431 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
432 album_artists: List of all artists appeared on the album.
433 E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
434 Useful for splits and compilations.
435 disc_number: Number of the disc or other physical medium the track belongs to,
436 as an integer.
437
438 The following fields should only be set for clips that should be cut from the original video:
439
440 section_start: Start time of the section in seconds
441 section_end: End time of the section in seconds
442
443 The following fields should only be set for storyboards:
444 rows: Number of rows in each storyboard fragment, as an integer
445 columns: Number of columns in each storyboard fragment, as an integer
446
447 The following fields are deprecated and should not be set by new code:
448 composer: Use "composers" instead.
449 Composer(s) of the piece, comma-separated.
450 artist: Use "artists" instead.
451 Artist(s) of the track, comma-separated.
452 genre: Use "genres" instead.
453 Genre(s) of the track, comma-separated.
454 album_artist: Use "album_artists" instead.
455 All artists appeared on the album, comma-separated.
456 creator: Use "creators" instead.
457 The creator of the video.
458
459 Unless mentioned otherwise, the fields should be Unicode strings.
460
461 Unless mentioned otherwise, None is equivalent to absence of information.
462
463
464 _type "playlist" indicates multiple videos.
465 There must be a key "entries", which is a list, an iterable, or a PagedList
466 object, each element of which is a valid dictionary by this specification.
467
468 Additionally, playlists can have "id", "title", and any other relevant
469 attributes with the same semantics as videos (see above).
470
471 It can also have the following optional fields:
472
473 playlist_count: The total number of videos in a playlist. If not given,
474 YoutubeDL tries to calculate it from "entries"
475
476
477 _type "multi_video" indicates that there are multiple videos that
478 form a single show, for example multiple acts of an opera or TV episode.
479 It must have an entries key like a playlist and contain all the keys
480 required for a video at the same time.
481
482
483 _type "url" indicates that the video must be extracted from another
484 location, possibly by a different extractor. Its only required key is:
485 "url" - the next URL to extract.
486 The key "ie_key" can be set to the class name (minus the trailing "IE",
487 e.g. "Youtube") if the extractor class is known in advance.
488 Additionally, the dictionary may have any properties of the resolved entity
489 known in advance, for example "title" if the title of the referred video is
490 known ahead of time.
491
492
493 _type "url_transparent" entities have the same specification as "url", but
494 indicate that the given additional information is more precise than the one
495 associated with the resolved URL.
496 This is useful when a site employs a video service that hosts the video and
497 its technical metadata, but that video service does not embed a useful
498 title, description etc.
499
500
501 Subclasses of this should also be added to the list of extractors and
502 should define _VALID_URL as a regexp or a Sequence of regexps, and
503 re-define the _real_extract() and (optionally) _real_initialize() methods.
504
505 Subclasses may also override suitable() if necessary, but ensure the function
506 signature is preserved and that this function imports everything it needs
507 (except other extractors), so that lazy_extractors works correctly.
508
509 Subclasses can define a list of _EMBED_REGEX, which will be searched for in
510 the HTML of Generic webpages. It may also override _extract_embed_urls
511 or _extract_from_webpage as necessary. While these are normally classmethods,
512 _extract_from_webpage is allowed to be an instance method.
513
514 _extract_from_webpage may raise self.StopExtraction() to stop further
515 processing of the webpage and obtain exclusive rights to it. This is useful
516 when the extractor cannot reliably be matched using just the URL,
517 e.g. invidious/peertube instances
518
519 Embed-only extractors can be defined by setting _VALID_URL = False.
520
521 To support username + password (or netrc) login, the extractor must define a
522 _NETRC_MACHINE and re-define _perform_login(username, password) and
523 (optionally) _initialize_pre_login() methods. The _perform_login method will
524 be called between _initialize_pre_login and _real_initialize if credentials
525 are passed by the user. In cases where it is necessary to have the login
526 process as part of the extraction rather than initialization, _perform_login
527 can be left undefined.
528
529 _GEO_BYPASS attribute may be set to False in order to disable
530 geo restriction bypass mechanisms for a particular extractor.
531 Though it won't disable explicit geo restriction bypass based on
532 country code provided with geo_bypass_country.
533
534 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
535 countries for this extractor. One of these countries will be used by
536 geo restriction bypass mechanism right away in order to bypass
537 geo restriction, of course, if the mechanism is not disabled.
538
539 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
540 IP blocks in CIDR notation for this extractor. One of these IP blocks
541 will be used by geo restriction bypass mechanism similarly
542 to _GEO_COUNTRIES.
543
544 The _ENABLED attribute should be set to False for IEs that
545 are disabled by default and must be explicitly enabled.
546
547 The _WORKING attribute should be set to False for broken IEs
548 in order to warn the users and skip the tests.
549 """
550
# Per-instance state; these class-level values are only defaults
# (__init__ resets _ready and _x_forwarded_for_ip and sets the downloader)
_ready = False
_downloader = None
_x_forwarded_for_ip = None

# Geo restriction bypass configuration (see the class docstring)
_GEO_BYPASS = True
_GEO_COUNTRIES = None
_GEO_IP_BLOCKS = None

# Extractor status flags (see the class docstring)
_ENABLED = True
_WORKING = True

# Login and descriptive metadata
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None

# URL matching
_VALID_URL = None
_EMBED_REGEX = []
564
def _login_hint(self, method=NO_DEFAULT, netrc=None):
    """Return a hint telling the user how to provide authentication.

    method may be None (empty hint), 'any', 'password' or 'cookies'.
    When it is not given, 'any' is used if supports_login() is true and
    'cookies' otherwise. netrc is the machine name mentioned next to
    --netrc; it falls back to _NETRC_MACHINE when falsy.
    Any other value for method raises KeyError.
    """
    machine = netrc or self._NETRC_MACHINE
    password_hint = f'--username and --password, --netrc-cmd, or --netrc ({machine}) to provide account credentials'
    cookies_hint = (
        'Use --cookies-from-browser or --cookies for the authentication. '
        'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies')
    hints = {
        None: '',
        'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
        'password': f'Use {password_hint}',
        'cookies': cookies_hint,
    }
    if method is NO_DEFAULT:
        method = 'any' if self.supports_login() else 'cookies'
    return hints[method]
575
def __init__(self, downloader=None):
    """Create the extractor, optionally bound to a YoutubeDL instance.

    If no downloader is given here, one must be supplied through
    set_downloader() before extract() is called.
    """
    # Fresh per-instance state, shadowing the class-level defaults
    self._printed_messages = set()
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)
584
@classmethod
def _match_valid_url(cls, url):
    """Match url against _VALID_URL and return the first match object.

    Returns None when no pattern matches, or when the extractor is
    embed-only (_VALID_URL is False).
    """
    if cls._VALID_URL is False:
        return None
    # Deliberately inspect this class's own namespace instead of using
    # hasattr/getattr: the compiled patterns must be cached for *this*
    # class, and an attribute lookup would also find a superclass's cache
    if '_VALID_URL_RE' not in vars(cls):
        cls._VALID_URL_RE = tuple(re.compile(pattern) for pattern in variadic(cls._VALID_URL))
    for regex in cls._VALID_URL_RE:
        mobj = regex.match(url)
        if mobj:
            return mobj
    return None
595
@classmethod
def suitable(cls, url):
    """Tell whether this IE can handle the given URL (True/False)."""
    # NB: everything this function needs (except other extractors) must be
    # imported by the function itself so that lazy_extractors keeps working
    mobj = cls._match_valid_url(url)
    return mobj is not None
602
@classmethod
def _match_id(cls, url):
    """Return the 'id' group of the _VALID_URL match for url.

    Raises AttributeError if the URL does not match at all and
    IndexError if the matching pattern has no 'id' group.
    """
    mobj = cls._match_valid_url(url)
    return mobj.group('id')
606
@classmethod
def get_temp_id(cls, url):
    """Best-effort video id taken from the URL; None if it cannot be matched."""
    try:
        temp_id = cls._match_id(url)
    except (IndexError, AttributeError):
        # No match at all, or the pattern has no 'id' group
        temp_id = None
    return temp_id
613
@classmethod
def working(cls):
    """Return the value of the _WORKING flag of this extractor class."""
    is_working = cls._WORKING
    return is_working
618
@classmethod
def supports_login(cls):
    """Whether the extractor declares a (truthy) _NETRC_MACHINE."""
    return True if cls._NETRC_MACHINE else False
622
def initialize(self):
    """Prepare the instance for extraction (geo bypass, login, etc).

    The printed-message cache and the geo bypass are (re)initialized on
    every call; the pre-login hook, the login itself and
    _real_initialize() only run until the instance is marked ready.
    """
    self._printed_messages = set()
    geo_bypass_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_bypass_context)

    if self._ready:
        return

    self._initialize_pre_login()
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
        # Credentials were passed, but this extractor cannot use them
        self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
    self._real_initialize()
    self._ready = True
640
def _initialize_geo_bypass(self, geo_bypass_context):
    """
    Initialize geo restriction bypass mechanism.

    This method is used to initialize geo bypass mechanism based on faking
    X-Forwarded-For HTTP header. A random country from provided country list
    is selected and a random IP belonging to this country is generated. This
    IP will be passed as X-Forwarded-For HTTP header in all subsequent
    HTTP requests.

    This method will be used for initial geo bypass mechanism initialization
    during the instance initialization with _GEO_COUNTRIES and
    _GEO_IP_BLOCKS.

    You may also manually call it from extractor's code if geo bypass
    information is not available beforehand (e.g. obtained during
    extraction) or due to some other reason. In this case you should pass
    this information in geo bypass context passed as first argument. It may
    contain following fields:

    countries:  List of geo unrestricted countries (similar
                to _GEO_COUNTRIES)
    ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                (similar to _GEO_IP_BLOCKS)

    Nothing is done if a fake IP has already been chosen or if the user
    disabled geo bypass. An IP block takes precedence over a country.
    """
    # A fake IP was already selected earlier; keep using it
    if self._x_forwarded_for_ip:
        return

    # Geo bypass mechanism is explicitly disabled by user
    if not self.get_param('geo_bypass', True):
        return

    if not geo_bypass_context:
        geo_bypass_context = {}

    # Backward compatibility: previously _initialize_geo_bypass
    # expected a list of countries, some 3rd party code may still use
    # it this way
    if isinstance(geo_bypass_context, (list, tuple)):
        geo_bypass_context = {
            'countries': geo_bypass_context,
        }

    # The whole point of geo bypass mechanism is to fake IP
    # as X-Forwarded-For HTTP header based on some IP block or
    # country code.

    # Path 1: bypassing based on IP block in CIDR notation

    # Explicit IP block specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    ip_block = self.get_param('geo_bypass_ip_block', None)

    # Otherwise use random IP block from geo bypass context but only
    # if extractor is known as geo bypassable
    if not ip_block:
        ip_blocks = geo_bypass_context.get('ip_blocks')
        if self._GEO_BYPASS and ip_blocks:
            ip_block = random.choice(ip_blocks)

    if ip_block:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
        self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
        return

    # Path 2: bypassing based on country code

    # Explicit country code specified by user, use it right away
    # regardless of whether extractor is geo bypassable or not
    country = self.get_param('geo_bypass_country', None)

    # Otherwise use random country code from geo bypass context but
    # only if extractor is known as geo bypassable
    if not country:
        countries = geo_bypass_context.get('countries')
        if self._GEO_BYPASS and countries:
            country = random.choice(countries)

    if country:
        self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
        # Log through the extractor's own write_debug, exactly like the
        # IP block path above, instead of reaching into the downloader
        self.write_debug(
            f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
723
def extract(self, url):
    """Extract the information for url and return it as an info dict.

    Runs initialize() followed by _real_extract(). Returns the dict
    produced by _real_extract(), or None if it returned None. If a fake
    X-Forwarded-For IP is in use it is recorded in the result under
    '__x_forwarded_for_ip'. With the 'no-live-chat' compat option, the
    'live_chat', 'comments' and 'danmaku' entries are dropped from the
    result's subtitles.

    On GeoRestrictedError the extraction is retried once with a faked IP
    when that is possible. An ExtractorError gets its video id, extractor
    name and traceback filled in (when missing) before it is re-raised;
    UnsupportedError is re-raised untouched. IncompleteRead, KeyError and
    StopIteration are wrapped in an ExtractorError.
    """
    try:
        # At most two attempts: the second one runs with a faked IP
        for _ in range(2):
            try:
                self.initialize()
                self.to_screen('Extracting URL: %s' % (
                    url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                ie_result = self._real_extract(url)
                if ie_result is None:
                    return None
                if self._x_forwarded_for_ip:
                    ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                subtitles = ie_result.get('subtitles') or {}
                if 'no-live-chat' in self.get_param('compat_opts'):
                    for lang in ('live_chat', 'comments', 'danmaku'):
                        subtitles.pop(lang, None)
                return ie_result
            except GeoRestrictedError as e:
                if self.__maybe_fake_ip_and_retry(e.countries):
                    continue
                raise
    except UnsupportedError:
        raise
    except ExtractorError as e:
        e.video_id = e.video_id or self.get_temp_id(url)
        # No trailing comma here: it would turn the extractor name into
        # a 1-tuple instead of a string
        e.ie = e.ie or self.IE_NAME
        e.traceback = e.traceback or sys.exc_info()[2]
        raise
    except IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
757
def __maybe_fake_ip_and_retry(self, countries):
    """Pick a fake IP from one of countries so that extraction can be retried.

    Returns True if a fake X-Forwarded-For IP was set (the caller should
    retry), False otherwise. Nothing is done when the user forced a
    bypass country, geo bypass is disabled for the extractor or by the
    user, a fake IP is already in use, or countries is empty.
    """
    if (self.get_param('geo_bypass_country', None)
            or not self._GEO_BYPASS
            or not self.get_param('geo_bypass', True)
            or self._x_forwarded_for_ip
            or not countries):
        return False

    country_code = random.choice(countries)
    self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
    if not self._x_forwarded_for_ip:
        return False

    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (self._x_forwarded_for_ip, country_code.upper()))
    return True
772
def set_downloader(self, downloader):
    """Store downloader (a YoutubeDL instance) as this IE's downloader."""
    self._downloader = downloader
776
@property
def cache(self):
    """The `cache` attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cache
780
@property
def cookiejar(self):
    """The `cookiejar` attribute of the attached downloader."""
    downloader = self._downloader
    return downloader.cookiejar
784
785 def _initialize_pre_login(self):
786 """ Initialization before login. Redefine in subclasses."""
787 pass
788
789 def _perform_login(self, username, password):
790 """ Login with username and password. Redefine in subclasses."""
791 pass
792
793 def _real_initialize(self):
794 """Real initialization process. Redefine in subclasses."""
795 pass
796
797 def _real_extract(self, url):
798 """Real extraction process. Redefine in subclasses."""
799 raise NotImplementedError('This method must be implemented by subclasses')
800
@classmethod
def ie_key(cls):
    """Return the string used to look this extractor up with get_info_extractor.

    It is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
805
@classproperty
def IE_NAME(cls):
    """Extractor name: the class name with its last two characters removed."""
    class_name = cls.__name__
    return class_name[:-2]
809
@staticmethod
def __can_accept_status_code(err, expected_status):
    """Tell whether the status of the HTTPError err is acceptable.

    expected_status may be None (nothing is accepted), a callable that is
    given the status and must return exactly True to accept it, or one or
    several status codes (normalized with variadic) to test membership in.
    """
    assert isinstance(err, HTTPError)
    if expected_status is None:
        return False
    status = err.status
    if callable(expected_status):
        return expected_status(status) is True
    return status in variadic(expected_status)
819
def _create_request(self, url_or_request, data=None, headers=None, query=None):
    """Build a networking Request from a URL or an existing request object.

    A urllib.request.Request is converted (with a deprecation warning), any
    other value that is not already a Request is wrapped in one. The given
    data, headers and query are then applied with Request.update() and the
    resulting request is returned.
    """
    request = url_or_request
    if isinstance(request, urllib.request.Request):
        self._downloader.deprecation_warning(
            'Passing a urllib.request.Request to _create_request() is deprecated. '
            'Use yt_dlp.networking.common.Request instead.')
        request = urllib_req_to_req(request)
    elif not isinstance(request, Request):
        request = Request(request)

    request.update(data=data, headers=headers, query=query)
    return request
831
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
    """
    Return the response handle.

    On a network error the error response is returned if its status is
    accepted by expected_status; otherwise False is returned when errnote is
    False or fatal is false (with a warning in the latter case), and an
    ExtractorError is raised when fatal is true.

    See _download_webpage_handle docstring for arguments specification.
    """
    downloader = self._downloader
    if downloader._first_webpage_request:
        # The very first request is never delayed
        downloader._first_webpage_request = False
    else:
        sleep_interval = self.get_param('sleep_interval_requests') or 0
        if sleep_interval > 0:
            self.to_screen('Sleeping %s seconds ...' % sleep_interval)
            time.sleep(sleep_interval)

    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(str(note) if video_id is None else f'{video_id}: {note}')

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        # Work on a copy so the caller's headers are left untouched
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    try:
        request = self._create_request(url_or_request, data, headers, query)
        return self._downloader.urlopen(request)
    except network_exceptions as err:
        if isinstance(err, HTTPError) and self.__can_accept_status_code(err, expected_status):
            return err.response

        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = f'{errnote}: {error_to_compat_str(err)}'
        if fatal:
            raise ExtractorError(errmsg, cause=err)
        self.report_warning(errmsg)
        return False
881
882 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
883 encoding=None, data=None, headers={}, query={}, expected_status=None):
884 """
885 Return a tuple (page content as string, URL handle).
886
887 Arguments:
888 url_or_request -- plain text URL as a string or
889 a urllib.request.Request object
890 video_id -- Video/playlist/item identifier (string)
891
892 Keyword arguments:
893 note -- note printed before downloading (string)
894 errnote -- note printed in case of an error (string)
895 fatal -- flag denoting whether error should be considered fatal,
896 i.e. whether it should cause ExtractionError to be raised,
897 otherwise a warning will be reported and extraction continued
898 encoding -- encoding for a page content decoding, guessed automatically
899 when not explicitly specified
900 data -- POST data (bytes)
901 headers -- HTTP headers (dict)
902 query -- URL query (dict)
903 expected_status -- allows to accept failed HTTP requests (non 2xx
904 status code) by explicitly specifying a set of accepted status
905 codes. Can be any of the following entities:
906 - an integer type specifying an exact failed status code to
907 accept
908 - a list or a tuple of integer types specifying a list of
909 failed status codes to accept
910 - a callable accepting an actual failed status code and
911 returning True if it should be accepted
912 Note that this argument does not affect success status codes (2xx)
913 which are always accepted.
914 """
915
916 # Strip hashes from the URL (#1038)
917 if isinstance(url_or_request, str):
918 url_or_request = url_or_request.partition('#')[0]
919
920 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
921 if urlh is False:
922 assert not fatal
923 return False
924 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
925 return (content, urlh)
926
927 @staticmethod
928 def _guess_encoding_from_content(content_type, webpage_bytes):
929 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
930 if m:
931 encoding = m.group(1)
932 else:
933 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
934 webpage_bytes[:1024])
935 if m:
936 encoding = m.group(1).decode('ascii')
937 elif webpage_bytes.startswith(b'\xff\xfe'):
938 encoding = 'utf-16'
939 else:
940 encoding = 'utf-8'
941
942 return encoding
943
def __check_blocked(self, content):
    """Raise an expected ExtractorError if content is a known block page.

    Three kinds of pages are recognized by their <title>: Websense
    filtering, the Indian censorship notice and the Russian (TTK/RKN)
    block notice. Returns None for any other content.
    """
    head = content[:512]

    is_websense = (
        '<title>Access to this site is blocked</title>' in content
        and 'Websense' in head)
    if is_websense:
        message = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        info_url = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if info_url:
            message += ' Visit %s for more details' % info_url
        raise ExtractorError(message, expected=True)

    if '<title>The URL you requested has been blocked</title>' in head:
        message = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        details = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if details:
            message += ' (Message: "%s")' % details.replace('\n', ' ')
        raise ExtractorError(message, expected=True)

    is_rkn = (
        '<title>TTK :: Доступ к ресурсу ограничен</title>' in content
        and 'blocklist.rkn.gov.ru' in content)
    if is_rkn:
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
971
def _request_dump_filename(self, url, video_id):
    """Return the file name used to dump/load the page for url and video_id.

    The base name is '<video_id>_<url>'. If it is longer than the
    'trim_file_name' param (240 when unset), it is cut and suffixed with
    '___' plus the MD5 hex digest of the full base name. The result is
    sanitized in restricted mode and given a '.dump' extension.
    """
    base_name = f'{video_id}_{url}'
    max_length = self.get_param('trim_file_name') or 240
    if len(base_name) > max_length:
        digest_suffix = '___' + hashlib.md5(base_name.encode('utf-8')).hexdigest()
        keep = max_length - len(digest_suffix)
        base_name = base_name[:keep] + digest_suffix
    filename = sanitize_filename(f'{base_name}.dump', restricted=True)
    if compat_os_name != 'nt':
        return filename
    # Working around MAX_PATH limitation on Windows (see
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
    absfilepath = os.path.abspath(filename)
    if len(absfilepath) > 259:
        filename = fR'\\?\{absfilepath}'
    return filename
986
def __decode_webpage(self, webpage_bytes, encoding, headers):
    """Decode webpage_bytes to a string, replacing undecodable bytes.

    When encoding is empty it is guessed from the Content-Type header and
    the data itself. An encoding name unknown to Python falls back to UTF-8.
    """
    charset = encoding or self._guess_encoding_from_content(
        headers.get('Content-Type', ''), webpage_bytes)
    try:
        return webpage_bytes.decode(charset, 'replace')
    except LookupError:
        # Unknown codec name
        return webpage_bytes.decode('utf-8', 'replace')
994
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the response urlh and return its content decoded to a string.

    prefix (bytes), when given, is prepended to the data that was read.
    With the 'dump_intermediate_pages' param the data is printed base64
    encoded; with 'write_pages' it is saved to the file named by
    _request_dump_filename(). The decoded content is checked for known
    block pages, which raises ExtractorError.
    """
    body = urlh.read()
    webpage_bytes = body if prefix is None else prefix + body

    if self.get_param('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.url)
        self._downloader.to_screen(base64.b64encode(webpage_bytes).decode('ascii'))

    if self.get_param('write_pages'):
        dump_path = self._request_dump_filename(urlh.url, video_id)
        self.to_screen(f'Saving request to {dump_path}')
        with open(dump_path, 'wb') as outf:
            outf.write(webpage_bytes)

    content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
    self.__check_blocked(content)
    return content
1013
def __print_error(self, errnote, fatal, video_id, err):
    """Report a parsing error: raise ExtractorError if fatal, else warn.

    Nothing is reported for a non-fatal error with an empty errnote.
    """
    if fatal:
        raise ExtractorError(f'{video_id}: {errnote}', cause=err)
    if errnote:
        self.report_warning(f'{video_id}: {errnote}: {err}')
1019
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
    """Parse xml_string and return the parsed element.

    transform_source, if given, is applied to the string first. On a parse
    error an ExtractorError is raised if fatal; otherwise a warning is
    reported (unless errnote is empty) and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except xml.etree.ElementTree.ParseError as ve:
        message = 'Failed to parse XML' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, ve)
1027
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
    """Parse json_string with LenientJSONDecoder and return the result.

    transform_source and any extra parser_kwargs are handed to the decoder.
    On a ValueError an ExtractorError is raised if fatal; otherwise a
    warning is reported (unless errnote is empty) and None is returned.
    """
    decoder_options = {
        'cls': LenientJSONDecoder,
        'strict': False,
        'transform_source': transform_source,
        **parser_kwargs,
    }
    try:
        return json.loads(json_string, **decoder_options)
    except ValueError as ve:
        message = 'Failed to parse JSON' if errnote is None else errnote
        self.__print_error(message, fatal, video_id, ve)
1034
1035 def _parse_socket_response_as_json(self, data, *args, **kwargs):
1036 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1037
def __create_download_methods(name, parser, note, errnote, return_value):
    """Build the pair of methods (_download_<name>_handle, _download_<name>).

    This is called from the class body to generate download helpers.

    @param name          Suffix used in the generated method names
    @param parser        Name of the method that parses the downloaded
                         content, or None to return the content unparsed
    @param note          Default note of the generated methods
    @param errnote       Default errnote of the generated methods
    @param return_value  Description of the return value, used only in the
                         generated docstrings
    @returns             Tuple (download_handle, download_content)
    """

    def parse(ie, content, *args, errnote=errnote, **kwargs):
        # Without a parser the decoded content is the result
        if parser is None:
            return content
        # errnote is forwarded to the parser only when it is False
        if errnote is False:
            kwargs['errnote'] = errnote
        # parser is fetched by name so subclasses can override it
        return getattr(ie, parser)(content, *args, **kwargs)

    # Downloads the page and returns (parsed content, URL handle),
    # or False if the download failed non-fatally
    def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                        fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        res = self._download_webpage_handle(
            url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query, expected_status=expected_status)
        if res is False:
            return res
        content, urlh = res
        return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

    # Same as download_handle, but returns only the parsed content
    def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                         fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        # With 'load_pages' a previously dumped page is read from disk;
        # if that fails, a warning is issued and the page is downloaded
        if self.get_param('load_pages'):
            url_or_request = self._create_request(url_or_request, data, headers, query)
            filename = self._request_dump_filename(url_or_request.url, video_id)
            self.to_screen(f'Loading request from {filename}')
            try:
                with open(filename, 'rb') as dumpf:
                    webpage_bytes = dumpf.read()
            except OSError as e:
                self.report_warning(f'Unable to load request from disk: {e}')
            else:
                content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
        kwargs = {
            'note': note,
            'errnote': errnote,
            'transform_source': transform_source,
            'fatal': fatal,
            'encoding': encoding,
            'data': data,
            'headers': headers,
            'query': query,
            'expected_status': expected_status,
        }
        # transform_source is not passed on when there is no parser
        if parser is None:
            kwargs.pop('transform_source')
        # The method is fetched by name so subclasses can override _download_..._handle
        res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
        return res if res is False else res[0]

    # Gives func its final name, qualified name and docstring
    def impersonate(func, name, return_value):
        func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
        func.__doc__ = f'''
        @param transform_source Apply this transformation before parsing
        @returns {return_value}

        See _download_webpage_handle docstring for other arguments specification
        '''

    impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
    impersonate(download_content, f'_download_{name}', f'{return_value}')
    return download_handle, download_content
1101
# Generated download helpers; each pair is (<name>_handle, <name>) as
# returned by __create_download_methods
_download_xml_handle, _download_xml = __create_download_methods(
    'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
_download_json_handle, _download_json = __create_download_methods(
    'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
_download_socket_json_handle, _download_socket_json = __create_download_methods(
    'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
# Only the content variant is kept here (no parser: the page is returned
# as a string); it is the method that _download_webpage calls
__download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1109
def _download_webpage(
        self, url_or_request, video_id, note=None, errnote=None,
        fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
    """
    Return the data of the page as a string.

    The download is repeated when it fails with IncompleteRead, up to
    tries attempts in total; the last IncompleteRead is re-raised.

    Keyword arguments:
    tries -- number of tries
    timeout -- sleep interval between tries (5 when not specified)

    See _download_webpage_handle docstring for other arguments specification.
    """

    # NB: tries and timeout are noted as unused; should they be deprecated?
    # Resolve the sentinel so it is never handed to _sleep() as an interval
    if timeout is NO_DEFAULT:
        timeout = 5

    try_count = 0
    while True:
        try:
            return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
        except IncompleteRead:
            try_count += 1
            if try_count >= tries:
                raise
            self._sleep(timeout, video_id)
1141
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
    """Send a warning to the downloader, prefixed with '[ie_name]' and the video id.

    With only_once, a message that was already recorded in
    self._printed_messages is silently dropped.
    """
    idstr = format_field(video_id, None, '%s: ')
    msg = f'[{self.IE_NAME}] {idstr}{msg}'
    if only_once:
        seen_key = f'WARNING: {msg}'
        if seen_key in self._printed_messages:
            return
        self._printed_messages.add(seen_key)
    self._downloader.report_warning(msg, *args, **kwargs)
1150
def to_screen(self, msg, *args, **kwargs):
    """Print msg through the downloader, prefixed with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.to_screen(prefixed, *args, **kwargs)
1154
def write_debug(self, msg, *args, **kwargs):
    """Write a debug message through the downloader, prefixed with '[ie_name]'"""
    prefixed = f'[{self.IE_NAME}] {msg}'
    self._downloader.write_debug(prefixed, *args, **kwargs)
1157
def get_param(self, name, default=None, *args, **kwargs):
    """Look name up in the downloader's params.

    Returns default when no downloader is attached or the param is missing.
    """
    if not self._downloader:
        return default
    return self._downloader.params.get(name, default, *args, **kwargs)
1162
def report_drm(self, video_id, partial=NO_DEFAULT):
    """Report through raise_no_formats() that the video is DRM protected.

    partial is no longer supported; passing it only triggers a deprecation
    warning.
    """
    partial_given = partial is not NO_DEFAULT
    if partial_given:
        self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
    self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1167
def report_extraction(self, id_or_name):
    """Announce that information is being extracted for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
1171
def report_download_webpage(self, video_id):
    """Announce that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
1175
def report_age_confirmation(self):
    """Announce an attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
1179
def report_login(self):
    """Announce an attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
1183
def raise_login_required(
        self, msg='This video is only available for registered users',
        metadata_available=False, method=NO_DEFAULT):
    """Report that login is required.

    Only a warning is issued when metadata_available is set and either the
    'ignore_no_formats_error' or the 'wait_for_video' param is enabled.
    Otherwise an expected ExtractorError is raised with the login hint for
    method appended to msg.
    """
    can_continue = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if can_continue:
        self.report_warning(msg)
        return
    hint = format_field(self._login_hint(method), None, '. %s')
    raise ExtractorError(msg + hint, expected=True)
1193
def raise_geo_restricted(
        self, msg='This video is not available from your location due to geo restriction',
        countries=None, metadata_available=False):
    """Report a geo restriction.

    Only a warning is issued when metadata_available is set and either the
    'ignore_no_formats_error' or the 'wait_for_video' param is enabled.
    Otherwise GeoRestrictedError is raised with the given countries.
    """
    can_continue = metadata_available and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if can_continue:
        self.report_warning(msg)
        return
    raise GeoRestrictedError(msg, countries=countries)
1202
def raise_no_formats(self, msg, expected=False, video_id=None):
    """Report that no formats are available.

    Only a warning is issued when expected is set and either the
    'ignore_no_formats_error' or the 'wait_for_video' param is enabled.
    Otherwise msg is raised as is when it already is an ExtractorError, or
    wrapped in a new ExtractorError.
    """
    can_continue = expected and (
        self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video'))
    if can_continue:
        self.report_warning(msg, video_id)
        return
    if isinstance(msg, ExtractorError):
        raise msg
    raise ExtractorError(msg, expected=expected, video_id=video_id)
1211
1212 # Methods for following #608
1213 @staticmethod
1214 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1215 """Returns a URL that points to a page that should be processed"""
1216 if ie is not None:
1217 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1218 if video_id is not None:
1219 kwargs['id'] = video_id
1220 if video_title is not None:
1221 kwargs['title'] = video_title
1222 return {
1223 **kwargs,
1224 '_type': 'url_transparent' if url_transparent else 'url',
1225 'url': url,
1226 }
1227
@classmethod
def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                          getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
    """Return a playlist whose entries are URL results built from matches.

    getter is applied to every match to obtain its URL; duplicates are
    dropped with orderedSet. The entries are produced lazily, each with
    url_result(url, ie, **video_kwargs). Remaining keyword arguments are
    passed to playlist_result().
    """
    unique_urls = orderedSet(map(getter, matches), lazy=True)
    entries = (cls.url_result(match_url, ie, **(video_kwargs or {})) for match_url in unique_urls)
    return cls.playlist_result(entries, playlist_id, playlist_title, **kwargs)
1234
1235 @staticmethod
1236 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1237 """Returns a playlist"""
1238 if playlist_id:
1239 kwargs['id'] = playlist_id
1240 if playlist_title:
1241 kwargs['title'] = playlist_title
1242 if playlist_description is not None:
1243 kwargs['description'] = playlist_description
1244 return {
1245 **kwargs,
1246 '_type': 'multi_video' if multi_video else 'playlist',
1247 'entries': entries,
1248 }
1249
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    group selects what is returned from the match: None for the first
    group that is not None, a list/tuple of groups for a tuple of their
    values, or a single group name/index.
    A string of None or an empty list of patterns counts as no match.
    """
    # Start from "no match" so that an empty list of patterns is handled
    # like a failed search instead of leaving mobj unbound
    mobj = None
    if string is None:
        pass
    elif isinstance(pattern, (str, re.Pattern)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif isinstance(group, (list, tuple)):
            return tuple(mobj.group(g) for g in group)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
1284
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
    """Searches string for the JSON object specified by start_pattern

    Passing default makes the search non-fatal and silent; default is then
    returned on any failure. Without it, {} is returned on a non-fatal
    failure. Extra keyword arguments are passed to _parse_json().
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False
    else:
        default = {}

    json_string = self._search_regex(
        rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
        string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
    try:
        return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
    except ExtractorError as e:
        if fatal:
            raise ExtractorError(
                f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
        if not has_default:
            self.report_warning(
                f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
    return default
1311
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html
    (element-wise when several groups were requested).
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not isinstance(found, tuple):
        return clean_html(found)
    return tuple(clean_html(part) for part in found)
1320
def _get_netrc_login_info(self, netrc_machine=None):
    """Return (username, password) for netrc_machine from netrc data.

    The machine defaults to self._NETRC_MACHINE. The netrc data is the
    output of the 'netrc_cmd' param (with '{}' replaced by the machine) if
    set, else the netrc file when 'usenetrc' is enabled. (None, None) is
    returned when neither is configured.

    Raises OSError if the command fails and netrc.NetrcParseError if there
    is no entry for the machine.
    """
    machine = netrc_machine or self._NETRC_MACHINE

    command = self.get_param('netrc_cmd')
    if command:
        command = command.replace('{}', machine)
        self.to_screen(f'Executing command: {command}')
        # The command comes from the user's own configuration and is run through the shell
        stdout, _, returncode = Popen.run(command, text=True, shell=True, stdout=subprocess.PIPE)
        if returncode != 0:
            raise OSError(f'Command returned error code {returncode}')
        info = netrc_from_content(stdout).authenticators(machine)

    elif self.get_param('usenetrc', False):
        netrc_path = compat_expanduser(self.get_param('netrc_location') or '~')
        if os.path.isdir(netrc_path):
            netrc_path = os.path.join(netrc_path, '.netrc')
        info = netrc.netrc(netrc_path).authenticators(machine)

    else:
        return None, None

    if not info:
        raise netrc.NetrcParseError(f'No authenticators for {machine}')
    return info[0], info[2]
1344
1345 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1346 """
1347 Get the login info as (username, password)
1348 First look for the manually specified credentials using username_option
1349 and password_option as keys in params dictionary. If no such credentials
1350 are available try the netrc_cmd if it is defined or look in the
1351 netrc file using the netrc_machine or _NETRC_MACHINE value.
1352 If there's no info available, return (None, None)
1353 """
1354
1355 username = self.get_param(username_option)
1356 if username is not None:
1357 password = self.get_param(password_option)
1358 else:
1359 try:
1360 username, password = self._get_netrc_login_info(netrc_machine)
1361 except (OSError, netrc.NetrcParseError) as err:
1362 self.report_warning(f'Failed to parse .netrc: {err}')
1363 return None, None
1364 return username, password
1365
1366 def _get_tfa_info(self, note='two-factor verification code'):
1367 """
1368 Get the two-factor authentication info
1369 TODO - asking the user will be required for sms/phone verify
1370 currently just uses the command line option
1371 If there's no info available, return None
1372 """
1373
1374 tfa = self.get_param('twofactor')
1375 if tfa is not None:
1376 return tfa
1377
1378 return getpass.getpass('Type %s and press [Return]: ' % note)
1379
1380 # Helper functions for extracting OpenGraph info
1381 @staticmethod
1382 def _og_regexes(prop):
1383 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1384 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1385 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1386 template = r'<meta[^>]+?%s[^>]+?%s'
1387 return [
1388 template % (property_re, content_re),
1389 template % (content_re, property_re),
1390 ]
1391
1392 @staticmethod
1393 def _meta_regex(prop):
1394 return r'''(?isx)<meta
1395 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1396 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1397
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search html for the OpenGraph property prop and return its unescaped value.

    prop may be a single property or several alternatives. name is the
    field name used in messages ('OpenGraph <first property>' by default).
    Remaining keyword arguments are passed to _search_regex().
    """
    props = variadic(prop)
    if name is None:
        name = 'OpenGraph %s' % props[0]
    og_regexes = [regex for p in props for regex in self._og_regexes(p)]
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
1409
1410 def _og_search_thumbnail(self, html, **kargs):
1411 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1412
1413 def _og_search_description(self, html, **kargs):
1414 return self._og_search_property('description', html, fatal=False, **kargs)
1415
1416 def _og_search_title(self, html, *, fatal=False, **kargs):
1417 return self._og_search_property('title', html, fatal=fatal, **kargs)
1418
1419 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1420 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1421 if secure:
1422 regexes = self._og_regexes('video:secure_url') + regexes
1423 return self._html_search_regex(regexes, html, name, **kargs)
1424
1425 def _og_search_url(self, html, **kargs):
1426 return self._og_search_property('url', html, **kargs)
1427
1428 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1429 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1430
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Return the cleaned content of the <meta> tag identified by name.

    name may be a single identifier or several alternatives. display_name
    is the field name used in messages (the first name by default).
    Remaining keyword arguments are passed to _html_search_regex().
    """
    names = variadic(name)
    if display_name is None:
        display_name = names[0]
    patterns = [self._meta_regex(meta_name) for meta_name in names]
    return self._html_search_regex(
        patterns, html, display_name, fatal=fatal, group='content', **kwargs)
1438
1439 def _dc_search_uploader(self, html):
1440 return self._html_search_meta('dc.creator', html, 'uploader')
1441
@staticmethod
def _rta_search(html):
    """Return an age limit derived from adult-content markers in html.

    18 is returned for the RTA rating <meta> tag. Otherwise the highest age
    found among the known textual markers is returned (a marker without an
    explicit age counts as 18), or 0 when none is present.
    """
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    has_rta_label = re.search(
        r'(?ix)<meta\s+name="rating"\s+'
        r'     content="RTA-5042-1996-1400-1577-RTA"',
        html)
    if has_rta_label:
        return 18

    # And then there are the jokers who advertise that they use RTA, but actually don't.
    AGE_LIMIT_MARKERS = [
        r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
        r'>[^<]*you acknowledge you are at least (\d+) years old',
        r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
    ]

    age_limit = 0
    for marker in AGE_LIMIT_MARKERS:
        found = re.search(marker, html)
        if not found:
            continue
        marker_age = int(traverse_obj(found, 1, default=18))
        age_limit = max(age_limit, marker_age)
    return age_limit
1463
1464 def _media_rating_search(self, html):
1465 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1466 rating = self._html_search_meta('rating', html)
1467
1468 if not rating:
1469 return None
1470
1471 RATING_TABLE = {
1472 'safe for kids': 0,
1473 'general': 8,
1474 '14 years': 14,
1475 'mature': 17,
1476 'restricted': 19,
1477 }
1478 return RATING_TABLE.get(rating.lower())
1479
1480 def _family_friendly_search(self, html):
1481 # See http://schema.org/VideoObject
1482 family_friendly = self._html_search_meta(
1483 'isFamilyFriendly', html, default=None)
1484
1485 if not family_friendly:
1486 return None
1487
1488 RATING_TABLE = {
1489 '1': 0,
1490 'true': 0,
1491 '0': 18,
1492 'false': 18,
1493 }
1494 return RATING_TABLE.get(family_friendly.lower())
1495
1496 def _twitter_search_player(self, html):
1497 return self._html_search_meta('twitter:player', html,
1498 'twitter card player')
1499
def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
    """Yield all json ld objects in the html

    Every JSON_LD_RE match is parsed; only the dicts among the parsed
    items are yielded. Passing default makes parsing non-fatal.
    """
    if default is not NO_DEFAULT:
        fatal = False
    for mobj in re.finditer(JSON_LD_RE, html):
        parsed = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
        yield from (item for item in variadic(parsed) if isinstance(item, dict))
1509
def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
    """Search for a video in any json ld in the html

    Returns the info extracted by _json_ld() when it is not empty.
    Otherwise default is returned if it was given, RegexNotFoundError is
    raised if fatal, or a warning is reported and {} is returned.
    """
    has_default = default is not NO_DEFAULT
    if has_default:
        fatal = False

    candidates = list(self._yield_json_ld(html, video_id, fatal=fatal, default=default))
    info = self._json_ld(candidates, video_id, fatal=fatal, expected_type=expected_type)
    if info:
        return info

    if has_default:
        return default
    if fatal:
        raise RegexNotFoundError('Unable to extract JSON-LD')
    self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
    return {}
1526
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build an info dict out of JSON-LD data

    @param json_ld        Either a JSON string (it is parsed with _parse_json
                          first) or already decoded JSON-LD data
    @param video_id       Passed on to _parse_json when a string is given
    @param fatal          Passed on to _parse_json when a string is given
    @param expected_type  When not None, elements whose '@type' does not
                          include it are skipped and the scan stops after
                          the first element that was processed
    @returns              The collected fields passed through filter_dict,
                          or {} when json_ld is empty
    """
    if isinstance(json_ld, str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    # Shared accumulator: all nested helpers below write their results here
    info = {}

    # Maps the last path component of an interaction type to the prefix
    # of the "<prefix>_count" field it is stored under
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def is_type(e, *expected_types):
        # True if any of expected_types is contained in the '@type' of e
        type = variadic(traverse_obj(e, '@type'))
        return any(x in type for x in expected_types)

    def extract_interaction_type(e):
        # 'interactionType' is either used directly or, when it is a dict,
        # replaced by that dict's '@type'
        interaction_type = e.get('interactionType')
        if isinstance(interaction_type, dict):
            interaction_type = interaction_type.get('@type')
        return str_or_none(interaction_type)

    def extract_interaction_statistic(e):
        # Fills the "<kind>_count" fields of info from 'interactionStatistic'
        interaction_statistic = e.get('interactionStatistic')
        if isinstance(interaction_statistic, dict):
            interaction_statistic = [interaction_statistic]
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not is_type(is_e, 'InteractionCounter'):
                continue
            interaction_type = extract_interaction_type(is_e)
            if not interaction_type:
                continue
            # For interaction count some sites provide string instead of
            # an integer (as per spec) with non digit characters (e.g. ",")
            # so extracting count with more relaxed str_to_int
            interaction_count = str_to_int(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            # Only the part after the last '/' is looked up in the map
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # A count that is already set is never overwritten
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_chapter_information(e):
        # Builds info['chapters'] from the 'Clip' entries of 'hasPart'
        chapters = [{
            'title': part.get('name'),
            'start_time': part.get('startOffset'),
            'end_time': part.get('endOffset'),
        } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
        # Walk (previous, current, next) triples; zip stops at the shortest
        # input (chapters[1:]), so the last chapter is not visited here.
        # Missing boundaries are taken from the neighbouring chapters
        for idx, (last_c, current_c, next_c) in enumerate(zip(
                [{'end_time': 0}] + chapters, chapters, chapters[1:])):
            current_c['end_time'] = current_c['end_time'] or next_c['start_time']
            current_c['start_time'] = current_c['start_time'] or last_c['end_time']
            if None in current_c.values():
                self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
                return
        if chapters:
            # The last chapter falls back to the duration already stored in info
            chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
            info['chapters'] = chapters

    def extract_video_object(e):
        # Copies the fields of a video/audio object into info
        author = e.get('author')
        info.update({
            'url': url_or_none(e.get('contentUrl')),
            'ext': mimetype2ext(e.get('encodingFormat')),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnails': [{'url': unescapeHTML(url)}
                           for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
                           if url_or_none(url)],
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            # author can be an instance of 'Organization' or 'Person' types.
            # both types can have 'name' property(inherited from 'Thing' type). [1]
            # however some websites are using 'Text' type instead.
            # 1. https://schema.org/VideoObject
            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
            'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
            'filesize': int_or_none(float_or_none(e.get('contentSize'))),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
            'tags': try_call(lambda: e.get('keywords').split(',')),
        })
        if is_type(e, 'AudioObject'):
            info.update({
                'vcodec': 'none',
                'abr': int_or_none(e.get('bitrate')),
            })
        extract_interaction_statistic(e)
        # Must run after info['duration'] has been set above
        extract_chapter_information(e)

    def traverse_json_ld(json_ld, at_top_level=True):
        for e in variadic(json_ld):
            if not isinstance(e, dict):
                continue
            # Top level elements are required to carry an '@context'
            if at_top_level and '@context' not in e:
                continue
            # A pure {'@context', '@graph'} wrapper: descend into the graph
            if at_top_level and set(e.keys()) == {'@context', '@graph'}:
                traverse_json_ld(e['@graph'], at_top_level=False)
                continue
            if expected_type is not None and not is_type(e, expected_type):
                continue
            rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
            if rating is not None:
                info['average_rating'] = rating
            if is_type(e, 'TVEpisode', 'Episode'):
                episode_name = unescapeHTML(e.get('name'))
                info.update({
                    'episode': episode_name,
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                # The episode name only serves as title when none is set yet
                if not info.get('title') and episode_name:
                    info['title'] = episode_name
                part_of_season = e.get('partOfSeason')
                if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
                    info.update({
                        'season': unescapeHTML(part_of_season.get('name')),
                        'season_number': int_or_none(part_of_season.get('seasonNumber')),
                    })
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif is_type(e, 'Movie'):
                info.update({
                    'title': unescapeHTML(e.get('name')),
                    'description': unescapeHTML(e.get('description')),
                    'duration': parse_duration(e.get('duration')),
                    'timestamp': unified_timestamp(e.get('dateCreated')),
                })
            elif is_type(e, 'Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody') or e.get('description')),
                })
                # Only the first entry of 'video' / 'subjectOf' is considered
                if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
                    extract_video_object(e['video'][0])
                elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
                    extract_video_object(e['subjectOf'][0])
            elif is_type(e, 'VideoObject', 'AudioObject'):
                extract_video_object(e)
                # Without an expected type keep scanning the remaining
                # elements, otherwise stop at this first match
                if expected_type is None:
                    continue
                else:
                    break
            # Reached for every branch above except the video/audio object one
            video = e.get('video')
            if is_type(video, 'VideoObject'):
                extract_video_object(video)
            if expected_type is None:
                continue
            else:
                break

    traverse_json_ld(json_ld)
    return filter_dict(info)
1696
1697 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1698 return self._parse_json(
1699 self._search_regex(
1700 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1701 webpage, 'next.js data', fatal=fatal, **kw),
1702 video_id, transform_source=transform_source, fatal=fatal)
1703
1704 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1705 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1706 rectx = re.escape(context_name)
1707 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1708 js, arg_keys, arg_vals = self._search_regex(
1709 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1710 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1711 default=NO_DEFAULT if fatal else (None, None, None))
1712 if js is None:
1713 return {}
1714
1715 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1716 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1717
1718 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1719 return traverse_obj(ret, traverse) or {}
1720
@staticmethod
def _hidden_inputs(html):
    """
    Collect the name/value pairs of all hidden and submit <input> tags

    @param html  HTML to scan
    @returns     Dict mapping the 'name' (or, failing that, the 'id') of
                 each input of type 'hidden' or 'submit' to its 'value'.
                 Inputs without a name/id or without a value attribute are
                 left out; for repeated names the last one wins
    """
    # Strip HTML comments so that commented out inputs are not picked up.
    # "." does not match newlines here, so only comments confined to a
    # single line are removed
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        # An empty value is kept, only a missing value attribute is skipped
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
1736
1737 def _form_hidden_inputs(self, form_id, html):
1738 form = self._search_regex(
1739 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1740 html, '%s form' % form_id, group='form')
1741 return self._hidden_inputs(form)
1742
@classproperty(cache=True)
def FormatSort(cls):
    """Deprecated FormatSorter subclass; accessing it issues a deprecation warning"""
    class FormatSort(FormatSorter):
        def __init__(ie, *args, **kwargs):
            super().__init__(ie._downloader, *args, **kwargs)

    message = (
        'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
        'Use yt_dlp.utils.FormatSorter instead')
    deprecation_warning(message)
    return FormatSort
1753
1754 def _sort_formats(self, formats, field_preference=[]):
1755 if not field_preference:
1756 self._downloader.deprecation_warning(
1757 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1758 return
1759 self._downloader.deprecation_warning(
1760 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1761 'Return _format_sort_fields in the info_dict instead')
1762 if formats:
1763 formats[0]['__sort_fields'] = field_preference
1764
1765 def _check_formats(self, formats, video_id):
1766 if formats:
1767 formats[:] = filter(
1768 lambda f: self._is_valid_url(
1769 f['url'], video_id,
1770 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1771 formats)
1772
1773 @staticmethod
1774 def _remove_duplicate_formats(formats):
1775 format_urls = set()
1776 unique_formats = []
1777 for f in formats:
1778 if f['url'] not in format_urls:
1779 format_urls.add(f['url'])
1780 unique_formats.append(f)
1781 formats[:] = unique_formats
1782
1783 def _is_valid_url(self, url, video_id, item='video', headers={}):
1784 url = self._proto_relative_url(url, scheme='http:')
1785 # For now assume non HTTP(S) URLs always valid
1786 if not (url.startswith('http://') or url.startswith('https://')):
1787 return True
1788 try:
1789 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1790 return True
1791 except ExtractorError as e:
1792 self.to_screen(
1793 '%s: %s URL is invalid, skipping: %s'
1794 % (video_id, item, error_to_compat_str(e.cause)))
1795 return False
1796
def http_scheme(self):
    """ "http:" when the 'prefer_insecure' param is set, "https:" otherwise """
    if self.get_param('prefer_insecure', False):
        return 'http:'
    return 'https:'
1803
def _proto_relative_url(self, url, scheme=None):
    """
    Pass the URL through sanitize_url together with a scheme

    @param scheme  Scheme including the trailing colon (e.g. "http:");
                   defaults to the result of http_scheme()
    """
    if not scheme:
        scheme = self.http_scheme()
    assert scheme.endswith(':')
    # sanitize_url is given the scheme without its trailing colon
    return sanitize_url(url, scheme=scheme[:-1])
1808
1809 def _sleep(self, timeout, video_id, msg_template=None):
1810 if msg_template is None:
1811 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1812 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1813 self.to_screen(msg)
1814 time.sleep(timeout)
1815
1816 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1817 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1818 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1819 if self.get_param('ignore_no_formats_error'):
1820 fatal = False
1821
1822 res = self._download_xml_handle(
1823 manifest_url, video_id, 'Downloading f4m manifest',
1824 'Unable to download f4m manifest',
1825 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1826 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1827 transform_source=transform_source,
1828 fatal=fatal, data=data, headers=headers, query=query)
1829 if res is False:
1830 return []
1831
1832 manifest, urlh = res
1833 manifest_url = urlh.url
1834
1835 return self._parse_f4m_formats(
1836 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1837 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1838
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """
    Build the list of formats described by a parsed f4m manifest

    @param manifest          Root element of the manifest
    @param manifest_url      URL of the manifest; relative media URLs are
                             resolved against it when the manifest itself
                             does not provide a base URL
    @param video_id          Passed on when nested manifests are extracted
    @param preference        Stored in the formats / passed on
    @param quality           Stored in the formats / passed on
    @param f4m_id            Prefix of the generated format ids
    @param transform_source  Passed on when nested f4m manifests are extracted
    @param fatal             Passed on when nested manifests are extracted.
                             If false, a manifest that is not an XML
                             element yields []
    @param m3u8_id           Passed on when nested m3u8 manifests are extracted
    @returns                 List of format dicts; empty if a player
                             verification challenge is present or no usable
                             media node is left
    """
    if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
        return []

    # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    # The text of an empty element is None, so it has to be checked
    # before looking for the separator
    if akamai_pv is not None and akamai_pv.text and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        format_id = join_nonempty(f4m_id, tbr or i)
        # URL of this particular format. It is kept separate from
        # manifest_url so that every relative media URL is resolved
        # against the manifest itself and not against the URL computed
        # for a previous media node
        format_url = manifest_url
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            format_url = (
                media_url if media_url.startswith(('http://', 'https://'))
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(format_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    format_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', preference=preference,
                    quality=quality, m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': format_url,
            'manifest_url': format_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
            'quality': quality,
        })
    return formats
1940
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
    """
    Build the "meta" format entry that points at the m3u8 URL itself

    Its preference is 100 below the given one (-100 if none is given).
    """
    format_id = join_nonempty(m3u8_id, 'meta')
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100
    return {
        'format_id': format_id,
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'quality': quality,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
1952
def _report_ignoring_subs(self, name):
    """Warn (only once) that the subtitles found in the named kind of manifest are ignored"""
    message = bug_reports_message(
        f'Ignoring subtitle tracks found in the {name} manifest; '
        'if any subtitle tracks are missing,'
    )
    self.report_warning(message, only_once=True)
1958
1959 def _extract_m3u8_formats(self, *args, **kwargs):
1960 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1961 if subs:
1962 self._report_ignoring_subs('HLS')
1963 return fmts
1964
1965 def _extract_m3u8_formats_and_subtitles(
1966 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1967 preference=None, quality=None, m3u8_id=None, note=None,
1968 errnote=None, fatal=True, live=False, data=None, headers={},
1969 query={}):
1970
1971 if self.get_param('ignore_no_formats_error'):
1972 fatal = False
1973
1974 if not m3u8_url:
1975 if errnote is not False:
1976 errnote = errnote or 'Failed to obtain m3u8 URL'
1977 if fatal:
1978 raise ExtractorError(errnote, video_id=video_id)
1979 self.report_warning(f'{errnote}{bug_reports_message()}')
1980 return [], {}
1981
1982 res = self._download_webpage_handle(
1983 m3u8_url, video_id,
1984 note='Downloading m3u8 information' if note is None else note,
1985 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1986 fatal=fatal, data=data, headers=headers, query=query)
1987
1988 if res is False:
1989 return [], {}
1990
1991 m3u8_doc, urlh = res
1992 m3u8_url = urlh.url
1993
1994 return self._parse_m3u8_formats_and_subtitles(
1995 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1996 preference=preference, quality=quality, m3u8_id=m3u8_id,
1997 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1998 headers=headers, query=query, video_id=video_id)
1999
def _parse_m3u8_formats_and_subtitles(
        self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
        preference=None, quality=None, m3u8_id=None, live=False, note=None,
        errnote=None, fatal=True, data=None, headers={}, query={},
        video_id=None):
    """
    Parse the text of an m3u8 playlist into formats and subtitles

    @param m3u8_doc        Content of the playlist
    @param m3u8_url        URL of the playlist; relative URLs found in the
                           playlist are joined with it. When it is not
                           given, a media playlist is returned as a data URI
    @param ext             Stored as 'ext' of the formats
    @param entry_protocol  Stored as 'protocol' of the formats
    @param preference      Stored as 'preference' of the formats
    @param quality         Stored as 'quality' of the formats
    @param m3u8_id         Prefix of the generated format ids
    @param live            If true, the stream name/bitrate is left out of
                           the format ids of the variant streams
    @param fatal, data, headers, video_id
                           Only used to download the referenced playlists
                           when the 'hls_split_discontinuity' param is set
    @returns               Tuple (formats, subtitles)

    note, errnote and query are accepted but not used in this function.
    """
    formats, subtitles = [], {}
    has_drm = HlsFD._has_drm(m3u8_doc)

    def format_url(url):
        # Absolute http(s) URLs are kept, anything else is joined with m3u8_url
        return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)

    # _extract_m3u8_playlist_indices yields the 'format_index' values to
    # create formats for: one per discontinuity-separated part when
    # splitting is requested, otherwise the single value None
    if self.get_param('hls_split_discontinuity', False):
        def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
            if not m3u8_doc:
                if not manifest_url:
                    return []
                m3u8_doc = self._download_webpage(
                    manifest_url, video_id, fatal=fatal, data=data, headers=headers,
                    note=False, errnote='Failed to download m3u8 playlist information')
                if m3u8_doc is False:
                    return []
            # One index more than there are discontinuity lines
            return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))

    else:
        def _extract_m3u8_playlist_indices(*args, **kwargs):
            return [None]

    # References:
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
    # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
    # 3. https://github.com/ytdl-org/youtube-dl/issues/18923

    # We should try extracting formats only from master playlists [1, 4.3.4],
    # i.e. playlists that describe available qualities. On the other hand
    # media playlists [1, 4.3.3] should be returned as is since they contain
    # just the media without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
    # master playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
    # media playlist and MUST NOT appear in master playlist thus we can
    # clearly detect media playlist with this criterion.

    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        formats = [{
            'format_id': join_nonempty(m3u8_id, idx),
            'format_index': idx,
            'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
            'quality': quality,
            'has_drm': has_drm,
        } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]

        return formats, subtitles

    # GROUP-ID -> list of the attribute dicts of its EXT-X-MEDIA tags
    groups = {}
    # Attributes of the most recent EXT-X-STREAM-INF tag; reset after the
    # URI line following it has been processed
    last_stream_inf = {}

    def extract_media(x_media_line):
        media = parse_m3u8_attributes(x_media_line)
        # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
        media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
        if not (media_type and group_id and name):
            return
        groups.setdefault(group_id, []).append(media)
        # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
        if media_type == 'SUBTITLES':
            # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
            # EXT-X-MEDIA tag if the media type is SUBTITLES.
            # However, lack of URI has been spotted in the wild.
            # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
            if not media.get('URI'):
                return
            url = format_url(media['URI'])
            sub_info = {
                'url': url,
                'ext': determine_ext(url),
            }
            if sub_info['ext'] == 'm3u8':
                # Per RFC 8216 §3.1, the only possible subtitle format m3u8
                # files may contain is WebVTT:
                # <https://tools.ietf.org/html/rfc8216#section-3.1>
                sub_info['ext'] = 'vtt'
                sub_info['protocol'] = 'm3u8_native'
            lang = media.get('LANGUAGE') or 'und'
            subtitles.setdefault(lang, []).append(sub_info)
        if media_type not in ('VIDEO', 'AUDIO'):
            return
        # Renditions without an URI do not result in a format of their own
        media_url = media.get('URI')
        if media_url:
            manifest_url = format_url(media_url)
            formats.extend({
                'format_id': join_nonempty(m3u8_id, group_id, name, idx),
                'format_note': name,
                'format_index': idx,
                'url': manifest_url,
                'manifest_url': m3u8_url,
                'language': media.get('LANGUAGE'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
                'quality': quality,
                'has_drm': has_drm,
                'vcodec': 'none' if media_type == 'AUDIO' else None,
            } for idx in _extract_m3u8_playlist_indices(manifest_url))

    def build_stream_name():
        # Despite specification does not mention NAME attribute for
        # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
        # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
        # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
        stream_name = last_stream_inf.get('NAME')
        if stream_name:
            return stream_name
        # If there is no NAME in EXT-X-STREAM-INF it will be obtained
        # from corresponding rendition group
        stream_group_id = last_stream_inf.get('VIDEO')
        if not stream_group_id:
            return
        stream_group = groups.get(stream_group_id)
        if not stream_group:
            return stream_group_id
        rendition = stream_group[0]
        return rendition.get('NAME') or stream_group_id

    # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
    # chance to detect video only formats when EXT-X-STREAM-INF tags
    # precede EXT-X-MEDIA tags in HLS manifest such as [3].
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)

    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_stream_inf = parse_m3u8_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # Any other non-empty line is taken as the URI of a variant stream
            tbr = float_or_none(
                last_stream_inf.get('AVERAGE-BANDWIDTH')
                or last_stream_inf.get('BANDWIDTH'), scale=1000)
            manifest_url = format_url(line.strip())

            for idx in _extract_m3u8_playlist_indices(manifest_url):
                format_id = [m3u8_id, None, idx]
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    stream_name = build_stream_name()
                    format_id[1] = stream_name or '%d' % (tbr or len(formats))
                f = {
                    'format_id': join_nonempty(*format_id),
                    'format_index': idx,
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                    'quality': quality,
                    'has_drm': has_drm,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected. E.g. [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                if not f.get('ext'):
                    f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
                formats.append(f)

                # for DailyMotion
                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                if progressive_uri:
                    # Additional plain http format based on a copy of the hls one
                    http_f = f.copy()
                    del http_f['manifest_url']
                    http_f.update({
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'protocol': 'http',
                        'url': progressive_uri,
                    })
                    formats.append(http_f)

            last_stream_inf = {}
    return formats, subtitles
2218
2219 def _extract_m3u8_vod_duration(
2220 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2221
2222 m3u8_vod = self._download_webpage(
2223 m3u8_vod_url, video_id,
2224 note='Downloading m3u8 VOD manifest' if note is None else note,
2225 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2226 fatal=False, data=data, headers=headers, query=query)
2227
2228 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2229
2230 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2231 if '#EXT-X-ENDLIST' not in m3u8_vod:
2232 return None
2233
2234 return int(sum(
2235 float(line[len('#EXTINF:'):].split(',')[0])
2236 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2237
2238 def _extract_mpd_vod_duration(
2239 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2240
2241 mpd_doc = self._download_xml(
2242 mpd_url, video_id,
2243 note='Downloading MPD VOD manifest' if note is None else note,
2244 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2245 fatal=False, data=data, headers=headers, query=query)
2246 if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2247 return None
2248 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2249
2250 @staticmethod
2251 def _xpath_ns(path, namespace=None):
2252 if not namespace:
2253 return path
2254 out = []
2255 for c in path.split('/'):
2256 if not c or c == '.':
2257 out.append(c)
2258 else:
2259 out.append('{%s}%s' % (namespace, c))
2260 return '/'.join(out)
2261
2262 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2263 if self.get_param('ignore_no_formats_error'):
2264 fatal = False
2265
2266 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2267 if res is False:
2268 assert not fatal
2269 return [], {}
2270 smil, urlh = res
2271
2272 return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2273 namespace=self._parse_smil_namespace(smil))
2274
2275 def _extract_smil_formats(self, *args, **kwargs):
2276 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2277 if subs:
2278 self._report_ignoring_subs('SMIL')
2279 return fmts
2280
2281 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2282 res = self._download_smil(smil_url, video_id, fatal=fatal)
2283 if res is False:
2284 return {}
2285
2286 smil, urlh = res
2287 smil_url = urlh.url
2288
2289 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2290
2291 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2292 return self._download_xml_handle(
2293 smil_url, video_id, 'Downloading SMIL file',
2294 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2295
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict from a parsed SMIL document

    The id of the result is derived from the basename of smil_url (the
    video_id argument is only used while parsing the formats). Title,
    description and upload date are taken from the first matching
    non-empty <meta> of the head.
    """
    namespace = self._parse_smil_namespace(smil)

    formats, subtitles = self._parse_smil_formats_and_subtitles(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        # A value that is already set is never replaced by a later one
        if name == 'title':
            title = title or content
        elif name in ('description', 'abstract'):
            description = description or content
        elif name == 'date':
            upload_date = upload_date or unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
2334
2335 def _parse_smil_namespace(self, smil):
2336 return self._search_regex(
2337 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2338
2339 def _parse_smil_formats(self, *args, **kwargs):
2340 fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2341 if subs:
2342 self._report_ignoring_subs('SMIL')
2343 return fmts
2344
def _parse_smil_formats_and_subtitles(
        self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """
    Collect the formats and subtitles described by a parsed SMIL document.

    <video>, <audio> and <media> elements are turned into RTMP, HLS, HDS,
    DASH, ISM or plain HTTP formats depending on their protocol/extension;
    <imagestream> elements become storyboard formats. Each distinct "src"
    is only processed once.

    @param smil                 Root element of the SMIL document
    @param smil_url             URL used to resolve relative sources, unless a
                                head/meta element provides "base"/"httpBase"
    @param video_id             ID passed on to the nested requests
    @param namespace            XML namespace applied to the element lookups
    @param f4m_params           Query parameters appended to f4m manifest URLs;
                                a flowplayer default is used when empty
    @param transform_rtmp_url   Optional callable mapping (streamer, src) to the
                                (url, play_path) pair used for RTMP formats
    @returns                    Tuple (formats, subtitles)
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats, subtitles = [], {}
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0
    imgs_count = 0

    srcs = set()
    media = itertools.chain.from_iterable(
        smil.findall(self._xpath_ns(arg, namespace))
        for arg in ['.//video', './/audio', './/media'])
    for medium in media:
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src, default_ext=None) or ext
        if not src_ext:
            # Last resort: ask the server. The request is non-fatal, so a
            # failed request must not be handed to urlhandle_detect_ext
            urlh = self._request_webpage(
                HEADRequest(src), video_id, note='Requesting extension info', fatal=False)
            src_ext = urlhandle_detect_ext(urlh) if urlh else None
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            self._merge_subtitles(m3u8_subs, target=subtitles)
            if len(m3u8_formats) == 1:
                # A single-variant playlist carries no metadata of its own,
                # so take it from the SMIL element
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
        elif src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += urllib.parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
        elif src_ext == 'mpd':
            mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
                src_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(mpd_formats)
            self._merge_subtitles(mpd_subs, target=subtitles)
        elif re.search(r'\.ism/[Mm]anifest', src_url):
            ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
                src_url, video_id, ism_id='mss', fatal=False)
            formats.extend(ism_formats)
            self._merge_subtitles(ism_subs, target=subtitles)
        # Validate the resolved URL: `src` itself may be relative to `base`
        elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
        src = medium.get('src')
        if not src or src in srcs:
            continue
        srcs.add(src)

        imgs_count += 1
        formats.append({
            'format_id': 'imagestream-%d' % (imgs_count),
            'url': src,
            'ext': mimetype2ext(medium.get('type')),
            'acodec': 'none',
            'vcodec': 'none',
            'width': int_or_none(medium.get('width')),
            'height': int_or_none(medium.get('height')),
            'format_note': 'SMIL storyboards',
        })

    smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
    self._merge_subtitles(smil_subs, target=subtitles)

    return formats, subtitles
2470
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Build a subtitles dict from the textstream elements of a SMIL document.

    Every distinct "src" is listed once under its language; `subtitles_lang`
    is used when the element declares none.
    """
    seen_srcs = set()
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)
        ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
        lang = (
            textstream.get('systemLanguage')
            or textstream.get('systemLanguageName')
            or textstream.get('lang')
            or subtitles_lang)
        subtitles.setdefault(lang, []).append({'url': src, 'ext': ext})
    return subtitles
2486
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return its parsed entries.

    Returns an empty list when the download reports failure.
    """
    response = self._download_xml_handle(
        xspf_url, playlist_id, 'Downloading xpsf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if response is False:
        return []

    xspf_doc, urlh = response
    # Continue with the URL reported by the response handle
    final_url = urlh.url
    return self._parse_xspf(
        xspf_doc, playlist_id, xspf_url=final_url, xspf_base_url=base_url(final_url))
2500
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Convert the tracks of a parsed XSPF document into a list of info dicts.

    Every entry uses `playlist_id` as its ID (and as fallback title); track
    locations are joined with `xspf_base_url`.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    def track_formats(track):
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if not format_url:
                continue
            yield {
                'url': format_url,
                'manifest_url': xspf_url,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            }

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        entries.append({
            'id': playlist_id,
            'title': xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id),
            'description': xpath_text(track, ns('./xspf:annotation'), 'description'),
            'thumbnail': xpath_text(track, ns('./xspf:image'), 'thumbnail'),
            # The duration is divided by 1000 (presumably given in milliseconds)
            'duration': float_or_none(
                xpath_text(track, ns('./xspf:duration'), 'duration'), 1000),
            'formats': list(track_formats(track)),
        })
    return entries
2540
def _extract_mpd_formats(self, *args, **kwargs):
    """Formats-only variant of _extract_mpd_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2546
def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
    """Extract the periods of an MPD manifest and merge them into (formats, subtitles).

    The arguments are those of _extract_mpd_periods.
    """
    return self._merge_mpd_periods(self._extract_mpd_periods(*args, **kwargs))
2550
def _extract_mpd_periods(
        self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
        fatal=True, data=None, headers={}, query={}):
    """Download an MPD manifest and parse it into its periods.

    Returns an empty list when the download reports failure or yields no
    document; otherwise the result of _parse_mpd_periods.
    """
    # With 'ignore_no_formats_error' a manifest failure is never fatal
    fatal = False if self.get_param('ignore_no_formats_error') else fatal

    if note is None:
        note = 'Downloading MPD manifest'
    if errnote is None:
        errnote = 'Failed to download MPD manifest'

    res = self._download_xml_handle(
        mpd_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if res is False:
        return []
    mpd_doc, urlh = res
    if mpd_doc is None:
        return []

    # We could have been redirected to a new url when we retrieved our mpd file.
    final_url = urlh.url
    return self._parse_mpd_periods(mpd_doc, mpd_id, base_url(final_url), final_url)
2574
def _parse_mpd_formats(self, *args, **kwargs):
    """Formats-only variant of _parse_mpd_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('DASH')
    return formats
2580
def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
    """Parse the periods of an MPD document and merge them into (formats, subtitles).

    The arguments are those of _parse_mpd_periods.
    """
    return self._merge_mpd_periods(self._parse_mpd_periods(*args, **kwargs))
2584
2585 def _merge_mpd_periods(self, periods):
2586 """
2587 Combine all formats and subtitles from an MPD manifest into a single list,
2588 by concatenate streams with similar formats.
2589 """
2590 formats, subtitles = {}, {}
2591 for period in periods:
2592 for f in period['formats']:
2593 assert 'is_dash_periods' not in f, 'format already processed'
2594 f['is_dash_periods'] = True
2595 format_key = tuple(v for k, v in f.items() if k not in (
2596 ('format_id', 'fragments', 'manifest_stream_number')))
2597 if format_key not in formats:
2598 formats[format_key] = f
2599 elif 'fragments' in f:
2600 formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2601
2602 if subtitles and period['subtitles']:
2603 self.report_warning(bug_reports_message(
2604 'Found subtitles in multiple periods in the DASH manifest; '
2605 'if part of the subtitles are missing,'
2606 ), only_once=True)
2607
2608 for sub_lang, sub_info in period['subtitles'].items():
2609 subtitles.setdefault(sub_lang, []).extend(sub_info)
2610
2611 return list(formats.values()), subtitles
2612
def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
    """
    Parse formats from MPD manifest.

    This is a generator: it yields one dict per Period element, with the keys
    'id', 'formats' (video/audio/storyboard formats) and 'subtitles'
    (language -> list of text tracks). Nothing is yielded for a dynamic
    manifest when the 'dynamic_mpd' param is disabled.

    @param mpd_doc       Root element of the manifest
    @param mpd_id        Optional prefix for the format IDs
    @param mpd_base_url  Base URL that relative BaseURL values are joined to
    @param mpd_url       URL of the manifest; may be empty when the manifest
                         was parsed from a string

    References:
     1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
     2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
    """
    if not self.get_param('dynamic_mpd', True):
        if mpd_doc.get('type') == 'dynamic':
            # A value returned from a generator is never seen by the
            # consumers iterating over it, so just stop
            return

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Start from the parent's values so that they are inherited
        ms_info = ms_parent_info.copy()

        # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
        # common attributes and elements.  We will only extract relevant
        # for us.
        def extract_common(source):
            segment_timeline = source.find(_add_ns('SegmentTimeline'))
            if segment_timeline is not None:
                s_e = segment_timeline.findall(_add_ns('S'))
                if s_e:
                    ms_info['total_number'] = 0
                    ms_info['s'] = []
                    for s in s_e:
                        r = int(s.get('r', 0))
                        ms_info['total_number'] += 1 + r
                        ms_info['s'].append({
                            't': int(s.get('t', 0)),
                            # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                            'd': int(s.attrib['d']),
                            'r': r,
                        })
            start_number = source.get('startNumber')
            if start_number:
                ms_info['start_number'] = int(start_number)
            timescale = source.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = source.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = float(segment_duration)

        def extract_Initialization(source):
            initialization = source.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']

        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            extract_common(segment_list)
            extract_Initialization(segment_list)
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
        else:
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                if media:
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                if initialization:
                    ms_info['initialization'] = initialization
                else:
                    extract_Initialization(segment_template)
        return ms_info

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    # Counts how many formats share each URL, across all periods
    stream_numbers = collections.defaultdict(int)
    for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
        period_entry = {
            'id': period.get('id', f'period-{period_idx}'),
            'formats': [],
            'subtitles': collections.defaultdict(list),
        }
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                # Attributes of the representation override those of its adaptation set
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                mime_type = representation_attrib['mimeType']
                content_type = representation_attrib.get('contentType', mime_type.split('/')[0])

                codec_str = representation_attrib.get('codecs', '')
                # Some kind of binary subtitle found in some youtube livestreams
                if mime_type == 'application/x-rawcc':
                    codecs = {'scodec': codec_str}
                else:
                    codecs = parse_codecs(codec_str)
                if content_type not in ('video', 'audio', 'text'):
                    if mime_type == 'image/jpeg':
                        content_type = mime_type
                    elif codecs.get('vcodec', 'none') != 'none':
                        content_type = 'video'
                    elif codecs.get('acodec', 'none') != 'none':
                        content_type = 'audio'
                    elif codecs.get('scodec', 'none') != 'none':
                        content_type = 'text'
                    elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
                        content_type = 'text'
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                        continue

                # Prepend the BaseURL of each enclosing level, innermost first,
                # until the result is an absolute http(s) URL
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if try_call(lambda: base_url_e.text) is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and base_url.startswith('/'):
                    base_url = urllib.parse.urljoin(mpd_base_url, base_url)
                elif mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                if representation_id is not None:
                    format_id = representation_id
                else:
                    format_id = content_type
                if mpd_id:
                    format_id = mpd_id + '-' + format_id
                if content_type in ('video', 'audio'):
                    f = {
                        'format_id': format_id,
                        'manifest_url': mpd_url,
                        'ext': mimetype2ext(mime_type),
                        'width': int_or_none(representation_attrib.get('width')),
                        'height': int_or_none(representation_attrib.get('height')),
                        'tbr': float_or_none(bandwidth, 1000),
                        'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                        'fps': int_or_none(representation_attrib.get('frameRate')),
                        'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                        'format_note': 'DASH %s' % content_type,
                        'filesize': filesize,
                        'container': mimetype2ext(mime_type) + '_dash',
                        **codecs
                    }
                elif content_type == 'text':
                    f = {
                        'ext': mimetype2ext(mime_type),
                        'manifest_url': mpd_url,
                        'filesize': filesize,
                    }
                elif content_type == 'image/jpeg':
                    # See test case in VikiIE
                    # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
                    f = {
                        'format_id': format_id,
                        'ext': 'mhtml',
                        'manifest_url': mpd_url,
                        'format_note': 'DASH storyboards (jpeg)',
                        'acodec': 'none',
                        'vcodec': 'none',
                    }
                if is_drm_protected(adaptation_set) or is_drm_protected(representation):
                    f['has_drm'] = True
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                def prepare_template(template_name, identifiers):
                    tmpl = representation_ms_info[template_name]
                    if representation_id is not None:
                        tmpl = tmpl.replace('$RepresentationID$', representation_id)
                    # First of, % characters outside $...$ templates
                    # must be escaped by doubling for proper processing
                    # by % operator string formatting used further (see
                    # https://github.com/ytdl-org/youtube-dl/issues/16867).
                    t = ''
                    in_template = False
                    for c in tmpl:
                        t += c
                        if c == '$':
                            in_template = not in_template
                        elif c == '%' and not in_template:
                            t += c
                    # Next, $...$ templates are translated to their
                    # %(...) counterparts to be used with % operator
                    t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                    t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                    # Finally, unescape "$$" to a literal "$". str.replace
                    # returns a new string, so the result must be kept
                    t = t.replace('$$', '$')
                    return t

                # @initialization is a regular template like @media one
                # so it should be handled just the same way (see
                # https://github.com/ytdl-org/youtube-dl/issues/11605)
                if 'initialization' in representation_ms_info:
                    initialization_template = prepare_template(
                        'initialization',
                        # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                        # $Time$ shall not be included for @initialization thus
                        # only $Bandwidth$ remains
                        ('Bandwidth', ))
                    representation_ms_info['initialization_url'] = initialization_template % {
                        'Bandwidth': bandwidth,
                    }

                def location_key(location):
                    return 'url' if re.match(r'^https?://', location) else 'path'

                if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                    media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                    media_location_key = location_key(media_template)

                    # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                    # can't be used at the same time
                    if '%(Number' in media_template and 's' not in representation_ms_info:
                        segment_duration = None
                        if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                            segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                            representation_ms_info['total_number'] = int(math.ceil(
                                float_or_none(period_duration, segment_duration, default=0)))
                        representation_ms_info['fragments'] = [{
                            media_location_key: media_template % {
                                'Number': segment_number,
                                'Bandwidth': bandwidth,
                            },
                            'duration': segment_duration,
                        } for segment_number in range(
                            representation_ms_info['start_number'],
                            representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                    else:
                        # $Number*$ or $Time$ in media template with S list available
                        # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                        # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                        representation_ms_info['fragments'] = []
                        segment_time = 0
                        segment_d = None
                        segment_number = representation_ms_info['start_number']

                        def add_segment_url():
                            # Reads the current segment_time/segment_number/segment_d
                            # of the enclosing scope at call time
                            segment_url = media_template % {
                                'Time': segment_time,
                                'Bandwidth': bandwidth,
                                'Number': segment_number,
                            }
                            representation_ms_info['fragments'].append({
                                media_location_key: segment_url,
                                'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                            })

                        for s in representation_ms_info['s']:
                            segment_time = s.get('t') or segment_time
                            segment_d = s['d']
                            add_segment_url()
                            segment_number += 1
                            for r in range(s.get('r', 0)):
                                segment_time += segment_d
                                add_segment_url()
                                segment_number += 1
                            segment_time += segment_d
                elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                    # No media template,
                    # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
                    # or any YouTube dashsegments video
                    fragments = []
                    segment_index = 0
                    timescale = representation_ms_info['timescale']
                    for s in representation_ms_info['s']:
                        duration = float_or_none(s['d'], timescale)
                        for r in range(s.get('r', 0) + 1):
                            segment_uri = representation_ms_info['segment_urls'][segment_index]
                            fragments.append({
                                location_key(segment_uri): segment_uri,
                                'duration': duration,
                            })
                            segment_index += 1
                    representation_ms_info['fragments'] = fragments
                elif 'segment_urls' in representation_ms_info:
                    # Segment URLs with no SegmentTimeline
                    # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                    # https://github.com/ytdl-org/youtube-dl/pull/14844
                    fragments = []
                    segment_duration = float_or_none(
                        representation_ms_info['segment_duration'],
                        representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                    for segment_url in representation_ms_info['segment_urls']:
                        fragment = {
                            location_key(segment_url): segment_url,
                        }
                        if segment_duration:
                            fragment['duration'] = segment_duration
                        fragments.append(fragment)
                    representation_ms_info['fragments'] = fragments
                # If there is a fragments key available then we correctly recognized fragmented media.
                # Otherwise we will assume unfragmented media with direct access. Technically, such
                # assumption is not necessarily correct since we may simply have no support for
                # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                if 'fragments' in representation_ms_info:
                    f.update({
                        # NB: mpd_url may be empty when MPD manifest is parsed from a string
                        'url': mpd_url or base_url,
                        'fragment_base_url': base_url,
                        'fragments': [],
                        'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url']
                        if not f.get('url'):
                            f['url'] = initialization_url
                        f['fragments'].append({location_key(initialization_url): initialization_url})
                    f['fragments'].extend(representation_ms_info['fragments'])
                    if not period_duration:
                        period_duration = try_get(
                            representation_ms_info,
                            lambda r: sum(frag['duration'] for frag in r['fragments']), float)
                else:
                    # Assuming direct URL to unfragmented media.
                    f['url'] = base_url
                if content_type in ('video', 'audio', 'image/jpeg'):
                    f['manifest_stream_number'] = stream_numbers[f['url']]
                    stream_numbers[f['url']] += 1
                    period_entry['formats'].append(f)
                elif content_type == 'text':
                    period_entry['subtitles'][lang or 'und'].append(f)
        yield period_entry
2952
def _extract_ism_formats(self, *args, **kwargs):
    """Formats-only variant of _extract_ism_formats_and_subtitles.

    Takes the same arguments; any subtitles found are dropped after
    reporting that they are being ignored.
    """
    formats, subtitles = self._extract_ism_formats_and_subtitles(*args, **kwargs)
    if subtitles:
        self._report_ignoring_subs('ISM')
    return formats
2958
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """Download an ISM manifest and parse it into (formats, subtitles).

    Returns ([], {}) when the download reports failure or yields no document.
    """
    # With 'ignore_no_formats_error' a manifest failure is never fatal
    fatal = False if self.get_param('ignore_no_formats_error') else fatal

    if note is None:
        note = 'Downloading ISM manifest'
    if errnote is None:
        errnote = 'Failed to download ISM manifest'

    res = self._download_xml_handle(
        ism_url, video_id, note=note, errnote=errnote,
        fatal=fatal, data=data, headers=headers, query=query)
    if res is False:
        return [], {}
    ism_doc, urlh = res
    if ism_doc is None:
        return [], {}

    # Parse relative to the URL reported by the response handle
    return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2975
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
    """
    Parse formats from ISM manifest.

    @param ism_doc  Root element of the manifest
    @param ism_url  URL of the manifest; track URL patterns are joined to it
    @param ism_id   Optional first component of the format IDs
    @returns        Tuple (formats, subtitles); both are empty when the
                    manifest has IsLive="TRUE"

    References:
     1. [MS-SSTR]: Smooth Streaming Protocol,
        https://msdn.microsoft.com/en-us/library/ff469518.aspx
    """
    if ism_doc.get('IsLive') == 'TRUE':
        return [], {}

    duration = int(ism_doc.attrib['Duration'])
    timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

    formats = []
    subtitles = {}
    for stream in ism_doc.findall('StreamIndex'):
        stream_type = stream.get('Type')
        if stream_type not in ('video', 'audio', 'text'):
            continue
        url_pattern = stream.attrib['Url']
        stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
        stream_name = stream.get('Name')
        stream_language = stream.get('Language', 'und')
        for track in stream.findall('QualityLevel'):
            KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
            fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
            # TODO: add support for WVC1 and WMAP
            if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
                self.report_warning('%s is not a supported codec' % fourcc)
                continue
            tbr = int(track.attrib['Bitrate']) // 1000
            # [1] does not mention Width and Height attributes. However,
            # they're often present while MaxWidth and MaxHeight are
            # missing, so should be used as fallbacks
            width = int_or_none(track.get('MaxWidth') or track.get('Width'))
            height = int_or_none(track.get('MaxHeight') or track.get('Height'))
            sampling_rate = int_or_none(track.get('SamplingRate'))

            track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
            track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)

            fragments = []
            fragment_ctx = {
                'time': 0,
            }
            stream_fragments = stream.findall('c')
            for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                if not fragment_ctx['duration']:
                    # Without an explicit duration the fragment lasts until the
                    # start of the next one. The lookup must index the list of
                    # fragments, not the children of the current element
                    try:
                        next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                    except (IndexError, KeyError):
                        # Last fragment, or the next one has no start time
                        next_fragment_time = duration
                    fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                for _ in range(fragment_repeat):
                    fragments.append({
                        'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
                        'duration': fragment_ctx['duration'] / stream_timescale,
                    })
                    fragment_ctx['time'] += fragment_ctx['duration']

            if stream_type == 'text':
                subtitles.setdefault(stream_language, []).append({
                    'ext': 'ismt',
                    'protocol': 'ism',
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'fragments': fragments,
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                    }
                })
            elif stream_type in ('video', 'audio'):
                formats.append({
                    'format_id': join_nonempty(ism_id, stream_name, tbr),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    'has_drm': ism_doc.find('Protection') is not None,
                    'language': stream_language,
                    'audio_channels': int_or_none(track.get('Channels')),
                    '_download_params': {
                        'stream_type': stream_type,
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'language': stream_language,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
    return formats, subtitles
3088
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
    """
    Find the HTML5 video/audio tags (and their amp-*/dl8-* variants) in
    `webpage` and build one entry per tag.

    @param base_url             URL that media, poster and track URLs are joined
                                to; also set as the Referer of every format
    @param webpage              HTML to search
    @param video_id             ID passed on to the manifest extraction
    @param m3u8_id              Passed on to the m3u8 extraction
    @param m3u8_entry_protocol  Passed on to the m3u8 extraction
    @param mpd_id               Passed on to the MPD extraction
    @param preference           Passed on to the m3u8 extraction
    @param quality              Passed on to the m3u8 extraction
    @returns                    List of dicts with the keys 'formats', 'subtitles'
                                and 'thumbnail'; tags that gave neither formats
                                nor subtitles are left out
    """
    def absolute_url(item_url):
        return urljoin(base_url, item_url)

    def parse_content_type(content_type):
        # Turn a "type" attribute (mimetype with optional codecs parameter)
        # into codec fields plus 'ext'; {} if it is missing or unparsable
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if ctr:
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)
            return f
        return {}

    def _media_formats(src, cur_media_type, type_info=None):
        # Returns (is_plain_url, formats): manifests are expanded into their
        # formats, anything else becomes a single direct-URL format
        type_info = type_info or {}
        full_url = absolute_url(src)
        ext = type_info.get('ext') or determine_ext(full_url)
        if ext == 'm3u8':
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
        elif ext == 'mpd':
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
        else:
            is_plain_url = True
            formats = [{
                'url': full_url,
                'vcodec': 'none' if cur_media_type == 'audio' else None,
                'ext': ext,
            }]
        return is_plain_url, formats

    entries = []
    # amp-video and amp-audio are very similar to their HTML5 counterparts
    # so we will include them right here (see
    # https://www.ampproject.org/docs/reference/components/amp-video)
    # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
    _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
    # Self-closing tags first; they have no content, hence the trailing ''
    media_tags = [(media_tag, media_tag_name, media_type, '')
                  for media_tag, media_tag_name, media_type
                  in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
    media_tags.extend(re.findall(
        # We only allow video|audio followed by a whitespace or '>'.
        # Allowing more characters may end up in significant slow down (see
        # https://github.com/ytdl-org/youtube-dl/issues/11979,
        # e.g. http://www.porntrex.com/maps/videositemap.xml).
        r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
    for media_tag, _, media_type, media_content in media_tags:
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)
        # The media tag itself may carry the source
        src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
        if src:
            f = parse_content_type(media_attributes.get('type'))
            _, formats = _media_formats(src, media_type, f)
            media_info['formats'].extend(formats)
        media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
                if not src:
                    continue
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                if is_plain_url:
                    # width, height, res, label and title attributes are
                    # all not standard but seen several times in the wild
                    labels = [
                        s_attr.get(lbl)
                        for lbl in ('label', 'title')
                        if str_or_none(s_attr.get(lbl))
                    ]
                    width = int_or_none(s_attr.get('width'))
                    height = (int_or_none(s_attr.get('height'))
                              or int_or_none(s_attr.get('res')))
                    if not width or not height:
                        # Fill in missing dimensions from the labels
                        for lbl in labels:
                            resolution = parse_resolution(lbl)
                            if not resolution:
                                continue
                            width = width or resolution.get('width')
                            height = height or resolution.get('height')
                    # Use the first label that yields a bitrate, if any
                    for lbl in labels:
                        tbr = parse_bitrate(lbl)
                        if tbr:
                            break
                    else:
                        tbr = None
                    f.update({
                        'width': width,
                        'height': height,
                        'tbr': tbr,
                        'format_id': s_attr.get('label') or s_attr.get('title'),
                    })
                    # The fields of the direct-URL format take precedence
                    f.update(formats[0])
                    media_info['formats'].append(f)
                else:
                    media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are treated like subtitles/captions
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    if not src:
                        continue
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
                    })
        for f in media_info['formats']:
            f.setdefault('http_headers', {})['Referer'] = base_url
        if media_info['formats'] or media_info['subtitles']:
            entries.append(media_info)
    return entries
3214
3215 def _extract_akamai_formats(self, *args, **kwargs):
3216 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3217 if subs:
3218 self._report_ignoring_subs('akamai')
3219 return fmts
3220
3221 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3222 signed = 'hdnea=' in manifest_url
3223 if not signed:
3224 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3225 manifest_url = re.sub(
3226 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3227 '', manifest_url).strip('?')
3228
3229 formats = []
3230 subtitles = {}
3231
3232 hdcore_sign = 'hdcore=3.7.0'
3233 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3234 hds_host = hosts.get('hds')
3235 if hds_host:
3236 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3237 if 'hdcore=' not in f4m_url:
3238 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3239 f4m_formats = self._extract_f4m_formats(
3240 f4m_url, video_id, f4m_id='hds', fatal=False)
3241 for entry in f4m_formats:
3242 entry.update({'extra_param_to_segment_url': hdcore_sign})
3243 formats.extend(f4m_formats)
3244
3245 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3246 hls_host = hosts.get('hls')
3247 if hls_host:
3248 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3249 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3250 m3u8_url, video_id, 'mp4', 'm3u8_native',
3251 m3u8_id='hls', fatal=False)
3252 formats.extend(m3u8_formats)
3253 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3254
3255 http_host = hosts.get('http')
3256 if http_host and m3u8_formats and not signed:
3257 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3258 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3259 qualities_length = len(qualities)
3260 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3261 i = 0
3262 for f in m3u8_formats:
3263 if f['vcodec'] != 'none':
3264 for protocol in ('http', 'https'):
3265 http_f = f.copy()
3266 del http_f['manifest_url']
3267 http_url = re.sub(
3268 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3269 http_f.update({
3270 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3271 'url': http_url,
3272 'protocol': protocol,
3273 })
3274 formats.append(http_f)
3275 i += 1
3276
3277 return formats, subtitles
3278
3279 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3280 query = urllib.parse.urlparse(url).query
3281 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3282 mobj = re.search(
3283 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3284 url_base = mobj.group('url')
3285 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3286 formats = []
3287
3288 def manifest_url(manifest):
3289 m_url = f'{http_base_url}/{manifest}'
3290 if query:
3291 m_url += '?%s' % query
3292 return m_url
3293
3294 if 'm3u8' not in skip_protocols:
3295 formats.extend(self._extract_m3u8_formats(
3296 manifest_url('playlist.m3u8'), video_id, 'mp4',
3297 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3298 if 'f4m' not in skip_protocols:
3299 formats.extend(self._extract_f4m_formats(
3300 manifest_url('manifest.f4m'),
3301 video_id, f4m_id='hds', fatal=False))
3302 if 'dash' not in skip_protocols:
3303 formats.extend(self._extract_mpd_formats(
3304 manifest_url('manifest.mpd'),
3305 video_id, mpd_id='dash', fatal=False))
3306 if re.search(r'(?:/smil:|\.smil)', url_base):
3307 if 'smil' not in skip_protocols:
3308 rtmp_formats = self._extract_smil_formats(
3309 manifest_url('jwplayer.smil'),
3310 video_id, fatal=False)
3311 for rtmp_format in rtmp_formats:
3312 rtsp_format = rtmp_format.copy()
3313 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3314 del rtsp_format['play_path']
3315 del rtsp_format['ext']
3316 rtsp_format.update({
3317 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3318 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3319 'protocol': 'rtsp',
3320 })
3321 formats.extend([rtmp_format, rtsp_format])
3322 else:
3323 for protocol in ('rtmp', 'rtsp'):
3324 if protocol not in skip_protocols:
3325 formats.append({
3326 'url': f'{protocol}:{url_base}',
3327 'format_id': protocol,
3328 'protocol': protocol,
3329 })
3330 return formats
3331
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate the options passed to `jwplayer(...).setup(...)` in the webpage.

    @returns  the parsed options when they form a dict; None when no setup call is
              found, the options cannot be parsed, or they are not a dict
    """
    setup_call = re.search(
        r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
        webpage)
    if not setup_call:
        return None
    try:
        options = self._parse_json(
            setup_call.group('options'), video_id=video_id, transform_source=transform_source)
    except ExtractorError:
        return None
    return options if isinstance(options, dict) else None
3346
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup data in the webpage and parse it into info dict(s).

    Extra positional and keyword arguments are handed to `_parse_jwplayer_data`.
    """
    setup_data = self._find_jwplayer_data(webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(setup_data, video_id, *args, **kwargs)
3352
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Turn JWPlayer setup data into info dict(s).

    @returns  an empty list when `jwplayer_data` is not a dict, the single entry when
              exactly one playlist item is usable, otherwise a playlist result
    """
    if not isinstance(jwplayer_data, dict):
        return []

    def collect_subtitles(tracks):
        # Only dict tracks whose kind is captions/subtitles and that have a file are used
        found = {}
        if not tracks or not isinstance(tracks, list):
            return found
        for track in tracks:
            if not isinstance(track, dict):
                continue
            kind = track.get('kind')
            if not isinstance(kind, str) or kind.lower() not in ('captions', 'subtitles'):
                continue
            track_url = urljoin(base_url, track.get('file'))
            if track_url:
                found.setdefault(track.get('label') or 'en', []).append({
                    'url': self._proto_relative_url(track_url)
                })
        return found

    playlist = jwplayer_data.get('playlist')
    # JWPlayer backward compatibility: single playlist item/flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if not isinstance(playlist, list):
        playlist = (playlist or jwplayer_data, )

    entries = []
    for item in playlist:
        if not isinstance(item, dict):
            continue
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        item.setdefault('sources', [item])

        item_id = video_id or item['mediaid']

        formats = self._parse_jwplayer_formats(
            item['sources'], video_id=item_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = collect_subtitles(item.get('tracks'))

        entry = {
            'id': item_id,
            'title': unescapeHTML(item['title'] if require_title else item.get('title')),
            'description': clean_html(item.get('description')),
            'thumbnail': urljoin(base_url, self._proto_relative_url(item.get('image'))),
            'timestamp': int_or_none(item.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or item.get('duration')),
            'subtitles': subtitles,
            'alt_title': clean_html(item.get('subtitle')),  # attributes used e.g. by Tele5 ...
            'genre': clean_html(item.get('genre')),
            'channel': clean_html(dict_get(item, ('category', 'channel'))),
            'season_number': int_or_none(item.get('season')),
            'episode_number': int_or_none(item.get('episode')),
            'release_year': int_or_none(item.get('releasedate')),
            'age_limit': int_or_none(item.get('age_restriction')),
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube:
            # A lone YouTube source is delegated instead of being used as a format
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            entry['formats'] = formats
        entries.append(entry)

    return entries[0] if len(entries) == 1 else self.playlist_result(entries)
3427
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert a list of JWPlayer `sources` into format dicts.

    Non-dict sources, sources without a usable `file` and repeated URLs are skipped.
    HLS, DASH and SMIL sources are expanded through the respective manifest
    extractors (non-fatally); audio sources become audio-only formats; everything
    else becomes a single format, with RTMP URLs split into URL and play path.

    @param rtmp_params  optional dict merged into every RTMP format
    @param base_url     base against which relative source URLs are resolved
    @returns            list of format dicts
    """
    urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = urljoin(
            base_url, self._proto_relative_url(source.get('file')))
        if not source_url or source_url in urls:
            continue
        urls.add(source_url)
        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)
        if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
        elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        elif source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
        else:
            format_id = str_or_none(source.get('label'))
            height = int_or_none(source.get('height'))
            if height is None and format_id:
                # Often no height is provided but there is a label in
                # format like "1080p", "720p SD", or 1080.
                height = parse_resolution(format_id).get('height')
            a_format = {
                'url': source_url,
                'width': int_or_none(source.get('width')),
                'height': height,
                'tbr': int_or_none(source.get('bitrate'), scale=1000),
                'filesize': int_or_none(source.get('filesize')),
                'ext': ext,
                'format_id': format_id
            }
            if source_url.startswith('rtmp'):
                a_format['ext'] = 'flv'
                # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                # of jwplayer.flash.swf
                # maxsplit is passed by keyword: the positional form is
                # deprecated since Python 3.13
                rtmp_url_parts = re.split(
                    r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
                if len(rtmp_url_parts) == 3:
                    rtmp_url, prefix, play_path = rtmp_url_parts
                    a_format.update({
                        'url': rtmp_url,
                        'play_path': prefix + play_path,
                    })
                if rtmp_params:
                    a_format.update(rtmp_params)
            formats.append(a_format)
    return formats
3492
3493 def _live_title(self, name):
3494 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3495 return name
3496
def _int(self, v, name, fatal=False, **kwargs):
    """Convert `v` with int_or_none(**kwargs), complaining when that yields None.

    @param name   label of the value used in the message
    @param fatal  raise ExtractorError instead of only reporting a warning
    @returns      the converted value (None after a non-fatal failure)
    """
    parsed = int_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3506
def _float(self, v, name, fatal=False, **kwargs):
    """Convert `v` with float_or_none(**kwargs), complaining when that yields None.

    @param name   label of the value used in the message
    @param fatal  raise ExtractorError instead of only reporting a warning
    @returns      the converted value (None after a non-fatal failure)
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    message = f'Failed to extract {name}: Could not parse value {v!r}'
    if fatal:
        raise ExtractorError(message)
    self.report_warning(message)
    return parsed
3516
3517 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3518 path='/', secure=False, discard=False, rest={}, **kwargs):
3519 cookie = http.cookiejar.Cookie(
3520 0, name, value, port, port is not None, domain, True,
3521 domain.startswith('.'), path, True, secure, expire_time,
3522 discard, None, None, rest)
3523 self.cookiejar.set_cookie(cookie)
3524
def _get_cookies(self, url):
    """ Return a http.cookies.SimpleCookie with the cookies for the url """
    cookie_header = self._downloader.cookiejar.get_cookie_header(url)
    return LenientSimpleCookie(cookie_header)
3528
3529 def _apply_first_set_cookie_header(self, url_handle, cookie):
3530 """
3531 Apply first Set-Cookie header instead of the last. Experimental.
3532
3533 Some sites (e.g. [1-3]) may serve two cookies under the same name
3534 in Set-Cookie header and expect the first (old) one to be set rather
3535 than second (new). However, as of RFC6265 the newer one cookie
3536 should be set into cookie store what actually happens.
3537 We will workaround this issue by resetting the cookie to
3538 the first one manually.
3539 1. https://new.vk.com/
3540 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3541 3. https://learning.oreilly.com/
3542 """
3543 for header, cookies in url_handle.headers.items():
3544 if header.lower() != 'set-cookie':
3545 continue
3546 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3547 cookie_value = re.search(
3548 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3549 if cookie_value:
3550 value, domain = cookie_value.groups()
3551 self._set_cookie(domain, cookie, value)
3552 break
3553
@classmethod
def get_testcases(cls, include_onlymatching=False):
    """Yield the test cases declared directly on this class (`_TEST` or `_TESTS`).

    Each yielded dict gets its 'name' set to the extractor key. Tests marked
    'only_matching' are left out unless `include_onlymatching` is set. When the
    class has a `__wrapped__` class, that class's test cases follow.
    """
    # Do not look in super classes
    own_attrs = vars(cls)
    single = own_attrs.get('_TEST')
    if single:
        assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
        declared = [single]
    else:
        declared = own_attrs.get('_TESTS', [])
    for testcase in declared:
        if testcase.get('only_matching', False) and not include_onlymatching:
            continue
        testcase['name'] = cls.ie_key()
        yield testcase
    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from cls.__wrapped__.get_testcases(include_onlymatching)
3570
@classmethod
def get_webpage_testcases(cls):
    """Yield the `_WEBPAGE_TESTS` declared directly on this class, named after the extractor key.

    When the class has a `__wrapped__` class, that class's webpage tests follow.
    """
    for testcase in vars(cls).get('_WEBPAGE_TESTS', []):
        testcase['name'] = cls.ie_key()
        yield testcase
    wrapped = getattr(cls, '__wrapped__', None)
    if wrapped:
        yield from cls.__wrapped__.get_webpage_testcases()
3579
@classproperty(cache=True)
def age_limit(cls):
    """Get age limit from the testcases"""
    all_tests = (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases())
    # Looks at each test's own info_dict and at that of its first playlist entry
    declared_limits = traverse_obj(
        all_tests, (..., (('playlist', 0), None), 'info_dict', 'age_limit'))
    return max(declared_limits or [0])
3586
@classproperty(cache=True)
def _RETURN_TYPE(cls):
    """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
    tests = tuple(cls.get_testcases(include_onlymatching=False))
    if not tests:
        return None

    def playlist_flags():
        # One flag per test: does any of its keys start with "playlist"?
        return (any(key.startswith('playlist') for key in test) for test in tests)

    if not any(playlist_flags()):
        return 'video'
    if all(playlist_flags()):
        return 'playlist'
    return 'any'
3598
@classmethod
def is_single_video(cls, url):
    """Returns whether the URL is of a single video, None if unknown"""
    if not cls.suitable(url):
        return None
    single_by_return_type = {'video': True, 'playlist': False}
    return single_by_return_type.get(cls._RETURN_TYPE)
3604
@classmethod
def is_suitable(cls, age_limit):
    """Test whether the extractor is generally suitable for the given age limit"""
    restricted = age_restricted(cls.age_limit, age_limit)
    return not restricted
3609
@classmethod
def description(cls, *, markdown=True, search_examples=None):
    """Description of the extractor"""
    # The description is assembled from optional fragments, in this order:
    # netrc machine, IE_DESC (or hidden marker), search prefix (+ example), broken marker
    fragments = []
    if cls._NETRC_MACHINE:
        fragments.append(
            f' [*{cls._NETRC_MACHINE}*](## "netrc machine")' if markdown
            else f' [{cls._NETRC_MACHINE}]')
    if cls.IE_DESC is False:
        fragments.append(' [HIDDEN]')
    elif cls.IE_DESC:
        fragments.append(f' {cls.IE_DESC}')
    if cls.SEARCH_KEY:
        fragments.append(f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix')
        if search_examples:
            _COUNTS = ('', '5', '10', 'all')
            fragments.append(
                f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")')
    if not cls.working():
        fragments.append(' (**Currently broken**)' if markdown else ' (Currently broken)')
    desc = ''.join(fragments)

    if markdown:
        # Escape emojis. Ref: https://github.com/github/markup/issues/1153
        name = ' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)
    else:
        name = cls.IE_NAME
    return f'{name}:{desc}' if desc else name
3634
def extract_subtitles(self, *args, **kwargs):
    """Return `_get_subtitles(*args, **kwargs)` when the 'writesubtitles' or
    'listsubtitles' param is set, otherwise an empty dict."""
    wanted = self.get_param('writesubtitles', False) or self.get_param('listsubtitles')
    return self._get_subtitles(*args, **kwargs) if wanted else {}
3640
3641 def _get_subtitles(self, *args, **kwargs):
3642 raise NotImplementedError('This method must be implemented by subclasses')
3643
class CommentsDisabled(Exception):
    """Signal from _get_comments that the video has comments disabled"""
3646
def extract_comments(self, *args, **kwargs):
    """Prepare deferred comment extraction.

    @returns  None unless the 'getcomments' param is set; otherwise a function that,
              when called, drains `_get_comments(*args, **kwargs)` and returns a dict
              with 'comments' and 'comment_count'. The count is None whenever the
              extraction did not run to completion.
    """
    if not self.get_param('getcomments'):
        return None
    comment_iter = self._get_comments(*args, **kwargs)

    def extractor():
        collected = []
        finished = False
        try:
            while True:
                collected.append(next(comment_iter))
        except StopIteration:
            finished = True
        except KeyboardInterrupt:
            # Keep what was collected so far
            self.to_screen('Interrupted by user')
        except self.CommentsDisabled:
            return {'comments': None, 'comment_count': None}
        except Exception as e:
            if self.get_param('ignoreerrors') is not True:
                raise
            self._downloader.report_error(e)
        total = len(collected)
        self.to_screen(f'Extracted {total} comments')
        return {
            'comments': collected,
            'comment_count': total if finished else None
        }
    return extractor
3675
3676 def _get_comments(self, *args, **kwargs):
3677 raise NotImplementedError('This method must be implemented by subclasses')
3678
3679 @staticmethod
3680 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3681 """ Merge subtitle items for one language. Items with duplicated URLs/data
3682 will be dropped. """
3683 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3684 ret = list(subtitle_list1)
3685 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3686 return ret
3687
3688 @classmethod
3689 def _merge_subtitles(cls, *dicts, target=None):
3690 """ Merge subtitle dictionaries, language by language. """
3691 if target is None:
3692 target = {}
3693 for d in dicts:
3694 for lang, subs in d.items():
3695 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3696 return target
3697
def extract_automatic_captions(self, *args, **kwargs):
    """Return `_get_automatic_captions(*args, **kwargs)` when the 'writeautomaticsub'
    or 'listsubtitles' param is set, otherwise an empty dict."""
    wanted = self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')
    return self._get_automatic_captions(*args, **kwargs) if wanted else {}
3703
3704 def _get_automatic_captions(self, *args, **kwargs):
3705 raise NotImplementedError('This method must be implemented by subclasses')
3706
3707 @functools.cached_property
3708 def _cookies_passed(self):
3709 """Whether cookies have been passed to YoutubeDL"""
3710 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3711
def mark_watched(self, *args, **kwargs):
    """Call `_mark_watched(*args, **kwargs)` if the 'mark_watched' param is set and
    either login credentials are available or cookies were passed."""
    if not self.get_param('mark_watched', False):
        return
    has_login = self.supports_login() and self._get_login_info()[0] is not None
    if has_login or self._cookies_passed:
        self._mark_watched(*args, **kwargs)
3717
3718 def _mark_watched(self, *args, **kwargs):
3719 raise NotImplementedError('This method must be implemented by subclasses')
3720
def geo_verification_headers(self):
    """Return the request headers carrying the 'geo_verification_proxy' param
    (as 'Ytdl-request-proxy'), or an empty dict when that param is not set."""
    proxy = self.get_param('geo_verification_proxy')
    if not proxy:
        return {}
    return {'Ytdl-request-proxy': proxy}
3727
3728 @staticmethod
3729 def _generic_id(url):
3730 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3731
def _generic_title(self, url='', webpage='', *, default=None):
    """Pick a title: the webpage's OpenGraph title, then its HTML title, then the
    percent-decoded basename of the URL without extension, and finally `default`."""
    og_title = self._og_search_title(webpage, default=None)
    if og_title:
        return og_title
    html_title = self._html_extract_title(webpage, default=None)
    if html_title:
        return html_title
    url_title = urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
    return url_title or default
3737
3738 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3739 if not duration:
3740 return
3741 chapter_list = [{
3742 'start_time': start_function(chapter),
3743 'title': title_function(chapter),
3744 } for chapter in chapter_list or []]
3745 if strict:
3746 warn = self.report_warning
3747 else:
3748 warn = self.write_debug
3749 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3750
3751 chapters = [{'start_time': 0}]
3752 for idx, chapter in enumerate(chapter_list):
3753 if chapter['start_time'] is None:
3754 warn(f'Incomplete chapter {idx}')
3755 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3756 chapters.append(chapter)
3757 elif chapter not in chapters:
3758 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3759 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3760 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3761 return chapters[1:]
3762
def _extract_chapters_from_description(self, description, duration):
    """Extract chapters from description lines made of a timestamp and a title.

    Lines of the form "<timestamp> <title>" are tried first; only if that yields
    no chapters are lines of the form "<title> <timestamp>" tried.
    """
    duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
    sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
    text = description or ''

    timestamp_first = self._extract_chapters_helper(
        re.findall(sep_re % (duration_re, r'.+?'), text),
        start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
        duration=duration, strict=False)
    if timestamp_first:
        return timestamp_first

    return self._extract_chapters_helper(
        re.findall(sep_re % (r'.+?', duration_re), text),
        start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
        duration=duration, strict=False)
3773
3774 @staticmethod
3775 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3776 all_known = all(map(
3777 lambda x: x is not None,
3778 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3779 return (
3780 'private' if is_private
3781 else 'premium_only' if needs_premium
3782 else 'subscriber_only' if needs_subscription
3783 else 'needs_auth' if needs_auth
3784 else 'unlisted' if is_unlisted
3785 else 'public' if all_known
3786 else None)
3787
def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
    '''
    @returns            A list of values for the extractor argument given by "key"
                        or "default" if no such key is present
    @param default      The default value to return when the key is not present (default: [])
    @param ie_key       Extractor key (a string, or an object with ie_key()) whose
                        arguments are looked up; this extractor's own key if not given
    @param casesense    When false, the values are converted to lower case
    '''
    if not isinstance(ie_key, str):
        ie_key = (ie_key or self).ie_key()
    values = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
    if values is None:
        return [] if default is NO_DEFAULT else default
    if casesense:
        return list(values)
    return [value.lower() for value in values]
3800
3801 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3802 if not playlist_id or not video_id:
3803 return not video_id
3804
3805 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3806 if no_playlist is not None:
3807 return not no_playlist
3808
3809 video_id = '' if video_id is True else f' {video_id}'
3810 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3811 if self.get_param('noplaylist'):
3812 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3813 return False
3814 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3815 return True
3816
def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
    """Report `err` through RetryManager.report_retry using this extractor's output
    functions; non-fatal errors are routed to report_warning instead of raising."""
    sleep_functions = self.get_param('retry_sleep_functions', {})
    error_handler = None if fatal else self.report_warning
    RetryManager.report_retry(
        err, _count or int(fatal), _retries,
        info=self.to_screen, warn=self.report_warning, error=error_handler,
        sleep_func=sleep_functions.get('extractor'))
3822
def RetryManager(self, **kwargs):
    """Create a RetryManager limited by the 'extractor_retries' param (default 3)
    that reports through `_error_or_warning`; keyword arguments are passed along."""
    retries = self.get_param('extractor_retries', 3)
    return RetryManager(retries, self._error_or_warning, **kwargs)
3825
def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
    """Run the Generic extractor's embed detection on `url`.

    This extractor's own key is smuggled into the URL under 'block_ies'. The
    display ID (or ID) of `info_dict`, if any, prefixes the printed `note`.
    Remaining arguments go to the Generic extractor's `_extract_embeds`.
    """
    display_id = traverse_obj(info_dict, 'display_id', 'id')
    self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
    generic_ie = self._downloader.get_info_extractor('Generic')
    blocked_url = smuggle_url(url, {'block_ies': [self.ie_key()]})
    return generic_ie._extract_embeds(blocked_url, *args, **kwargs)
3831
@classmethod
def extract_from_webpage(cls, ydl, url, webpage):
    """Yield the results of `_extract_from_webpage`, each with default extra info added.

    The class itself is used as extractor when its `_extract_from_webpage` is a
    bound method when accessed on the class; otherwise an instance is requested from `ydl`.
    """
    if isinstance(cls._extract_from_webpage, types.MethodType):
        ie = cls
    else:
        ie = ydl.get_info_extractor(cls.ie_key())
    results = ie._extract_from_webpage(url, webpage) or []
    for info in results:
        # url = None since we do not want to set (webpage/original)_url
        ydl.add_default_extra_info(info, ie, None)
        yield info
3840
@classmethod
def _extract_from_webpage(cls, url, webpage):
    """Yield a URL result for every distinct embed URL found in the webpage.

    The result is bound to this extractor unless its `_VALID_URL` is False.
    """
    embed_urls = cls._extract_embed_urls(url, webpage) or []
    for embed_url in orderedSet(embed_urls, lazy=True):
        target_ie = None if cls._VALID_URL is False else cls
        yield cls.url_result(embed_url, target_ie)
3846
@classmethod
def _extract_embed_urls(cls, url, webpage):
    """@returns all the embed urls on the webpage"""
    if '_EMBED_URL_RE' not in vars(cls):
        # First use on this class: validate the patterns and cache them compiled
        patterns = cls._EMBED_REGEX
        assert isinstance(patterns, (list, tuple))
        for idx, regex in enumerate(patterns):
            assert regex.count('(?P<url>') == 1, \
                f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
        cls._EMBED_URL_RE = tuple(re.compile(regex) for regex in patterns)

    for compiled in cls._EMBED_URL_RE:
        for mobj in compiled.finditer(webpage):
            # Matches are HTML-unescaped and resolved against the page URL
            embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
            if cls._VALID_URL is False or cls.suitable(embed_url):
                yield embed_url
3862
class StopExtraction(Exception):
    """Plain Exception subclass; it adds no behaviour of its own"""
3865
3866 @classmethod
3867 def _extract_url(cls, webpage): # TODO: Remove
3868 """Only for compatibility with some older extractors"""
3869 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3870
@classmethod
def __init_subclass__(cls, *, plugin_name=None, **kwargs):
    """Register subclasses created with `plugin_name=...` as overrides.

    Such a class takes over the ie_key of the next class in its MRO, gets
    "+<plugin_name>" appended to that class's IE_NAME, replaces the innermost
    wrapped class in that class's module and is recorded in _PLUGIN_OVERRIDES.
    """
    if plugin_name:
        mro = inspect.getmro(cls)
        parent = mro[mro.index(cls) + 1]
        cls.__wrapped__ = parent
        cls.PLUGIN_NAME = plugin_name
        cls.ie_key = parent.ie_key
        cls.IE_NAME = f'{parent.IE_NAME}+{plugin_name}'
        # Follow the chain of overrides down to the class that was overridden first
        root = parent
        while getattr(root, '__wrapped__', None):
            root = root.__wrapped__
        setattr(sys.modules[root.__module__], root.__name__, cls)
        _PLUGIN_OVERRIDES[root].append(cls)

    return super().__init_subclass__(**kwargs)
3884
3885
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that resolve paged search queries.
    Queries are written as "<_SEARCH_KEY><count>:<query>", where <count> is empty
    (first result only), a positive integer or "all".
    Subclasses must define _SEARCH_KEY and may define _MAX_RESULTS.
    """

    _MAX_RESULTS = float('inf')
    _RETURN_TYPE = 'playlist'

    @classproperty
    def _VALID_URL(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def _real_extract(self, query):
        """Split the count prefix off the query and fetch that many results."""
        prefix, query = self._match_valid_url(query).group('prefix', 'query')
        if prefix == '':
            wanted = 1
        elif prefix == 'all':
            wanted = self._MAX_RESULTS
        else:
            wanted = int(prefix)
            if wanted <= 0:
                raise ExtractorError(f'invalid download number {wanted} for query "{query}"')
            if wanted > self._MAX_RESULTS:
                self.report_warning(
                    '%s returns max %i results (you requested %i)'
                    % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
                wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.
        Either this function or _search_results must be overridden by subclasses """
        # An infinite count means "no limit" for islice
        limit = None if n == float('inf') else n
        results = itertools.islice(self._search_results(query), 0, limit)
        return self.playlist_result(results, query, query)

    def _search_results(self, query):
        """Returns an iterator of search results"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classproperty
    def SEARCH_KEY(cls):
        return cls._SEARCH_KEY
3929
3930
class UnsupportedURLIE(InfoExtractor):
    """Extractor whose pattern matches every URL and whose extraction always raises UnsupportedError."""

    IE_DESC = False
    _ENABLED = False
    _VALID_URL = '.*'

    def _real_extract(self, url):
        raise UnsupportedError(url)
3938
3939
3940 _PLUGIN_OVERRIDES = collections.defaultdict(list)