import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import subprocess
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_os_name,
    urllib_req_to_req,
)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..downloader.hls import HlsFD
from ..networking import HEADRequest, Request
from ..networking.exceptions import (
    HTTPError,
    IncompleteRead,
    network_exceptions,
)
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    Popen,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    netrc_from_content,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 used by the client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    True if the format has DRM and cannot be downloaded.
                                 'maybe' if the format may have DRM and has to be tested before download.
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. Only applied by the native HLS/DASH downloaders.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args      Extra arguments for ffmpeg downloader
                    * is_dash_periods  Whether the format is a result of merging
                                 multiple DASH periods.
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creators:       List of creators of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    release_year:   Year (YYYY) as integer when the video or album was released.
                    To be used if no exact release date is known.
                    If not explicitly set, calculated from release_date.
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    channel_is_verified: Whether the channel is verified on the platform.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of text
                    or html must be present):
331 * "author" - human-readable name of the comment author
332 * "author_id" - user ID of the comment author
333 * "author_thumbnail" - The thumbnail of the comment author
334 * "author_url" - The url to the comment author's page
335 * "author_is_verified" - Whether the author is verified
336 on the platform
337 * "author_is_uploader" - Whether the comment is made by
338 the video uploader
339 * "id" - Comment ID
340 * "html" - Comment as HTML
341 * "text" - Plain text of the comment
342 * "timestamp" - UNIX timestamp of comment
343 * "parent" - ID of the comment this one is replying to.
344 Set to "root" to indicate that this is a
345 comment to the original video.
346 * "like_count" - Number of positive ratings of the comment
347 * "dislike_count" - Number of negative ratings of the comment
348 * "is_favorited" - Whether the comment is marked as
349 favorite by the video uploader
350 * "is_pinned" - Whether the comment is pinned to
351 the top of the comments
352 age_limit: Age restriction for the video, as an integer (years)
353 webpage_url: The URL to the video webpage, if given to yt-dlp it
354 should allow to get the same result again. (It will be set
355 by YoutubeDL if it's missing)
356 categories: A list of categories that the video falls in, for example
357 ["Sports", "Berlin"]
358 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
359 cast: A list of the video cast
360 is_live: True, False, or None (=unknown). Whether this video is a
361 live stream that goes on instead of a fixed-length video.
362 was_live: True, False, or None (=unknown). Whether this video was
363 originally a live stream.
364 live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
365 or 'post_live' (was live, but VOD is not yet processed)
366 If absent, automatically set from is_live, was_live
367 start_time: Time in seconds where the reproduction should start, as
368 specified in the URL.
369 end_time: Time in seconds where the reproduction should end, as
370 specified in the URL.
371 chapters: A list of dictionaries, with the following entries:
372 * "start_time" - The start time of the chapter in seconds
373 * "end_time" - The end time of the chapter in seconds
374 * "title" (optional, string)
375 heatmap: A list of dictionaries, with the following entries:
376 * "start_time" - The start time of the data point in seconds
377 * "end_time" - The end time of the data point in seconds
378 * "value" - The normalized value of the data point (float between 0 and 1)
379 playable_in_embed: Whether this video is allowed to play in embedded
380 players on other sites. Can be True (=always allowed),
381 False (=never allowed), None (=unknown), or a string
382 specifying the criteria for embedability; e.g. 'whitelist'
383 availability: Under what condition the video is available. One of
384 'private', 'premium_only', 'subscriber_only', 'needs_auth',
385 'unlisted' or 'public'. Use 'InfoExtractor._availability'
386 to set it
387 media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer"
388 _old_archive_ids: A list of old archive ids needed for backward compatibility
389 _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artists:        List of artists of the track.
    composers:      List of composers of the piece.
    genres:         List of genres of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation").
    album_artists:  List of all artists that appear on the album.
                    E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"].
                    Useful for splits and compilations.
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    The following fields are deprecated and should not be set by new code:
    composer:       Use "composers" instead.
                    Composer(s) of the piece, comma-separated.
    artist:         Use "artists" instead.
                    Artist(s) of the track, comma-separated.
    genre:          Use "genres" instead.
                    Genre(s) of the track, comma-separated.
    album_artist:   Use "album_artists" instead.
                    All artists that appear on the album, comma-separated.
    creator:        Use "creators" instead.
                    The creator of the video.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
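
    A minimal illustrative example of a video info dict (all values here are
    hypothetical and for illustration only):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video.mp4',
                'ext': 'mp4',
                'format_id': 'http-720p',
                'width': 1280,
                'height': 720,
            }],
            'uploader': 'Some Uploader',
            'duration': 42.0,
        }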


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
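
    A minimal illustrative playlist result (hypothetical values):

        {
            '_type': 'playlist',
            'id': 'PL123',
            'title': 'Example playlist',
            'entries': [...],  # video info dicts as specified above
        }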


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.
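
    A minimal illustrative "url_transparent" result (hypothetical values):

        {
            '_type': 'url_transparent',
            'url': 'https://videohost.example/embed/abc123',
            'ie_key': 'VideoHost',  # hypothetical extractor name
            'title': 'Title taken from the embedding page',
        }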


    Subclasses of this should also be added to the list of extractors and
    should define _VALID_URL as a regexp or a Sequence of regexps, and
    re-define the _real_extract() and (optionally) _real_initialize() methods.
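
    A minimal subclass sketch (site and URL scheme are hypothetical):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }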

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances.

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.

    The _GEO_BYPASS attribute may be set to False in order to disable
    geo-restriction bypass mechanisms for a particular extractor.
    This will not, however, disable explicit geo-restriction bypass based on
    the country code provided with geo_bypass_country.

    The _GEO_COUNTRIES attribute may contain a list of presumably
    geo-unrestricted countries for this extractor. One of these countries
    will be used by the geo-restriction bypass mechanism right away,
    provided the mechanism is not disabled.

    The _GEO_IP_BLOCKS attribute may contain a list of presumably
    geo-unrestricted IP blocks in CIDR notation for this extractor. One of
    these IP blocks will be used by the geo-restriction bypass mechanism
    in the same way as _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method is called for the initial geo bypass setup during
        instance initialization, with _GEO_COUNTRIES and _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

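        A hypothetical manual call from extractor code:

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
            })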
666 """
667 if not self._x_forwarded_for_ip:
668
669 # Geo bypass mechanism is explicitly disabled by user
670 if not self.get_param('geo_bypass', True):
671 return
672
673 if not geo_bypass_context:
674 geo_bypass_context = {}
675
676 # Backward compatibility: previously _initialize_geo_bypass
677 # expected a list of countries, some 3rd party code may still use
678 # it this way
679 if isinstance(geo_bypass_context, (list, tuple)):
680 geo_bypass_context = {
681 'countries': geo_bypass_context,
682 }
683
684 # The whole point of geo bypass mechanism is to fake IP
685 # as X-Forwarded-For HTTP header based on some IP block or
686 # country code.
687
688 # Path 1: bypassing based on IP block in CIDR notation
689
690 # Explicit IP block specified by user, use it right away
691 # regardless of whether extractor is geo bypassable or not
692 ip_block = self.get_param('geo_bypass_ip_block', None)
693
694 # Otherwise use random IP block from geo bypass context but only
695 # if extractor is known as geo bypassable
696 if not ip_block:
697 ip_blocks = geo_bypass_context.get('ip_blocks')
698 if self._GEO_BYPASS and ip_blocks:
699 ip_block = random.choice(ip_blocks)
700
701 if ip_block:
702 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
703 self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
704 return
705
706 # Path 2: bypassing based on country code
707
708 # Explicit country code specified by user, use it right away
709 # regardless of whether extractor is geo bypassable or not
710 country = self.get_param('geo_bypass_country', None)
711
712 # Otherwise use random country code from geo bypass context but
713 # only if extractor is known as geo bypassable
714 if not country:
715 countries = geo_bypass_context.get('countries')
716 if self._GEO_BYPASS and countries:
717 country = random.choice(countries)
718
719 if country:
720 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
721 self._downloader.write_debug(
722 f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')
723
    def extract(self, url):
        """Extracts URL information and returns it as an info dict."""
        try:
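            # Run extraction at most twice: one retry is attempted with a
            # faked X-Forwarded-For IP if the first attempt is geo restricted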
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.status) is True
        else:
            return err.status in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            self._downloader.deprecation_warning(
                'Passing a urllib.request.Request to _create_request() is deprecated. '
                'Use yt_dlp.networking.common.Request instead.')
            url_or_request = urllib_req_to_req(url_or_request)
        elif not isinstance(url_or_request, Request):
            url_or_request = Request(url_or_request)

        url_or_request.update(data=data, headers=headers, query=query)
        return url_or_request

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
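
        A hypothetical call accepting 404 responses in addition to 2xx:

            webpage, urlh = self._download_webpage_handle(
                url, video_id, expected_status=404)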
914 """
915
916 # Strip hashes from the URL (#1038)
917 if isinstance(url_or_request, str):
918 url_or_request = url_or_request.partition('#')[0]
919
920 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
921 if urlh is False:
922 assert not fatal
923 return False
924 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
925 return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.url, video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        elif errnote:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):
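        # Factory building paired `_download_<name>_handle`/`_download_<name>`
        # methods: the former returns (parsed content, URL handle), the
        # latter only the parsed content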

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 5
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=NO_DEFAULT):
        if partial is not NO_DEFAULT:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, report a warning or raise
        a RegexNotFoundError, depending on default and fatal, specifying the
        field name.
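
        A hypothetical call:

            title = self._search_regex(
                r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', default=None)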
1256 """
1257 if string is None:
1258 mobj = None
1259 elif isinstance(pattern, (str, re.Pattern)):
1260 mobj = re.search(pattern, string, flags)
1261 else:
1262 for p in pattern:
1263 mobj = re.search(p, string, flags)
1264 if mobj:
1265 break
1266
1267 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1268
1269 if mobj:
1270 if group is None:
1271 # return the first matching group
1272 return next(g for g in mobj.groups() if g is not None)
1273 elif isinstance(group, (list, tuple)):
1274 return tuple(mobj.group(g) for g in group)
1275 else:
1276 return mobj.group(group)
1277 elif default is not NO_DEFAULT:
1278 return default
1279 elif fatal:
1280 raise RegexNotFoundError('Unable to extract %s' % _name)
1281 else:
1282 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1283 return None
    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
        return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if isinstance(res, tuple):
            return tuple(map(clean_html, res))
        return clean_html(res)

    def _get_netrc_login_info(self, netrc_machine=None):
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        cmd = self.get_param('netrc_cmd')
        if cmd:
            cmd = cmd.replace('{}', netrc_machine)
            self.to_screen(f'Executing command: {cmd}')
            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
            if ret != 0:
                raise OSError(f'Command returned error code {ret}')
            info = netrc_from_content(stdout).authenticators(netrc_machine)

        elif self.get_param('usenetrc', False):
            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
            if os.path.isdir(netrc_file):
                netrc_file = os.path.join(netrc_file, '.netrc')
            info = netrc.netrc(netrc_file).authenticators(netrc_machine)

        else:
            return None, None
        if not info:
            self.to_screen(f'No authenticators for {netrc_machine}')
            return None, None

        self.write_debug(f'Using netrc for {netrc_machine} authentication')
        return info[0], info[2]

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in params dictionary. If no such credentials
        are available try the netrc_cmd if it is defined or look in the
        netrc file using the netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """

        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            try:
                username, password = self._get_netrc_login_info(netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(f'Failed to parse .netrc: {err}')
                return None, None
        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verification;
        currently this just uses the command-line option.
        If there's no info available, return None
        """

        tfa = self.get_param('twofactor')
        if tfa is not None:
            return tfa

        return getpass.getpass('Type %s and press [Return]: ' % note)

1383 # Helper functions for extracting OpenGraph info
1384 @staticmethod
1385 def _og_regexes(prop):
1386 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1387 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1388 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1389 template = r'<meta[^>]+?%s[^>]+?%s'
1390 return [
1391 template % (property_re, content_re),
1392 template % (content_re, property_re),
1393 ]
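# e.g. _og_regexes('title') matches both attribute orders:
#   <meta property="og:title" content="Some title">
#   <meta content="Some title" property="og:title">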
1394
1395 @staticmethod
1396 def _meta_regex(prop):
1397 return r'''(?isx)<meta
1398 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1399 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
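# e.g. _meta_regex('description') matches
#   <meta name="description" content="...">
# with the attribute value captured in the 'content' group.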
1400
1401 def _og_search_property(self, prop, html, name=None, **kargs):
1402 prop = variadic(prop)
1403 if name is None:
1404 name = 'OpenGraph %s' % prop[0]
1405 og_regexes = []
1406 for p in prop:
1407 og_regexes.extend(self._og_regexes(p))
1408 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1409 if escaped is None:
1410 return None
1411 return unescapeHTML(escaped)
1412
1413 def _og_search_thumbnail(self, html, **kargs):
1414 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1415
1416 def _og_search_description(self, html, **kargs):
1417 return self._og_search_property('description', html, fatal=False, **kargs)
1418
1419 def _og_search_title(self, html, *, fatal=False, **kargs):
1420 return self._og_search_property('title', html, fatal=fatal, **kargs)
1421
1422 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1423 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1424 if secure:
1425 regexes = self._og_regexes('video:secure_url') + regexes
1426 return self._html_search_regex(regexes, html, name, **kargs)
1427
1428 def _og_search_url(self, html, **kargs):
1429 return self._og_search_property('url', html, **kargs)
1430
1431 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1432 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1433
1434 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1435 name = variadic(name)
1436 if display_name is None:
1437 display_name = name[0]
1438 return self._html_search_regex(
1439 [self._meta_regex(n) for n in name],
1440 html, display_name, fatal=fatal, group='content', **kwargs)
1441
1442 def _dc_search_uploader(self, html):
1443 return self._html_search_meta('dc.creator', html, 'uploader')
1444
1445 @staticmethod
1446 def _rta_search(html):
1447 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1448 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1449 r' content="RTA-5042-1996-1400-1577-RTA"',
1450 html):
1451 return 18
1452
1453 # And then there are the jokers who advertise that they use RTA, but actually don't.
1454 AGE_LIMIT_MARKERS = [
1455 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1456 r'>[^<]*you acknowledge you are at least (\d+) years old',
1457 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1458 ]
1459
1460 age_limit = 0
1461 for marker in AGE_LIMIT_MARKERS:
1462 mobj = re.search(marker, html)
1463 if mobj:
1464 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1465 return age_limit
1466
1467 def _media_rating_search(self, html):
1468 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1469 rating = self._html_search_meta('rating', html)
1470
1471 if not rating:
1472 return None
1473
1474 RATING_TABLE = {
1475 'safe for kids': 0,
1476 'general': 8,
1477 '14 years': 14,
1478 'mature': 17,
1479 'restricted': 19,
1480 }
1481 return RATING_TABLE.get(rating.lower())
1482
1483 def _family_friendly_search(self, html):
1484 # See http://schema.org/VideoObject
1485 family_friendly = self._html_search_meta(
1486 'isFamilyFriendly', html, default=None)
1487
1488 if not family_friendly:
1489 return None
1490
1491 RATING_TABLE = {
1492 '1': 0,
1493 'true': 0,
1494 '0': 18,
1495 'false': 18,
1496 }
1497 return RATING_TABLE.get(family_friendly.lower())
1498
1499 def _twitter_search_player(self, html):
1500 return self._html_search_meta('twitter:player', html,
1501 'twitter card player')
1502
1503 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1504 """Yield all json ld objects in the html"""
1505 if default is not NO_DEFAULT:
1506 fatal = False
1507 for mobj in re.finditer(JSON_LD_RE, html):
1508 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1509 for json_ld in variadic(json_ld_item):
1510 if isinstance(json_ld, dict):
1511 yield json_ld
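# e.g. yields the dict embedded in
#   <script type="application/ld+json">{"@context": "https://schema.org",
#   "@type": "VideoObject", ...}</script>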
1512
1513 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1514 """Search for a video in any json ld in the html"""
1515 if default is not NO_DEFAULT:
1516 fatal = False
1517 info = self._json_ld(
1518 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1519 video_id, fatal=fatal, expected_type=expected_type)
1520 if info:
1521 return info
1522 if default is not NO_DEFAULT:
1523 return default
1524 elif fatal:
1525 raise RegexNotFoundError('Unable to extract JSON-LD')
1526 else:
1527 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1528 return {}
1529
1530 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1531 if isinstance(json_ld, str):
1532 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1533 if not json_ld:
1534 return {}
1535 info = {}
1536
1537 INTERACTION_TYPE_MAP = {
1538 'CommentAction': 'comment',
1539 'AgreeAction': 'like',
1540 'DisagreeAction': 'dislike',
1541 'LikeAction': 'like',
1542 'DislikeAction': 'dislike',
1543 'ListenAction': 'view',
1544 'WatchAction': 'view',
1545 'ViewAction': 'view',
1546 }
1547
1548 def is_type(e, *expected_types):
1549 type = variadic(traverse_obj(e, '@type'))
1550 return any(x in type for x in expected_types)
1551
1552 def extract_interaction_type(e):
1553 interaction_type = e.get('interactionType')
1554 if isinstance(interaction_type, dict):
1555 interaction_type = interaction_type.get('@type')
1556 return str_or_none(interaction_type)
1557
1558 def extract_interaction_statistic(e):
1559 interaction_statistic = e.get('interactionStatistic')
1560 if isinstance(interaction_statistic, dict):
1561 interaction_statistic = [interaction_statistic]
1562 if not isinstance(interaction_statistic, list):
1563 return
1564 for is_e in interaction_statistic:
1565 if not is_type(is_e, 'InteractionCounter'):
1566 continue
1567 interaction_type = extract_interaction_type(is_e)
1568 if not interaction_type:
1569 continue
1570 # Some sites provide the interaction count as a string with
1571 # non-digit characters (e.g. ",") instead of an integer as per
1572 # spec, so extract the count with the more relaxed str_to_int
1573 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1574 if interaction_count is None:
1575 continue
1576 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1577 if not count_kind:
1578 continue
1579 count_key = '%s_count' % count_kind
1580 if info.get(count_key) is not None:
1581 continue
1582 info[count_key] = interaction_count
1583
1584 def extract_chapter_information(e):
1585 chapters = [{
1586 'title': part.get('name'),
1587 'start_time': part.get('startOffset'),
1588 'end_time': part.get('endOffset'),
1589 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1590 for idx, (last_c, current_c, next_c) in enumerate(zip(
1591 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1592 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1593 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1594 if None in current_c.values():
1595 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1596 return
1597 if chapters:
1598 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1599 info['chapters'] = chapters
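# e.g. three Clip parts with startOffset 0/60/120 and no endOffset are
# zipped with their neighbours above, so each chapter's end_time is
# filled from the next chapter's start_time and the last chapter's
# from info['duration'].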
1600
1601 def extract_video_object(e):
1602 author = e.get('author')
1603 info.update({
1604 'url': url_or_none(e.get('contentUrl')),
1605 'ext': mimetype2ext(e.get('encodingFormat')),
1606 'title': unescapeHTML(e.get('name')),
1607 'description': unescapeHTML(e.get('description')),
1608 'thumbnails': [{'url': unescapeHTML(url)}
1609 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1610 if url_or_none(url)],
1611 'duration': parse_duration(e.get('duration')),
1612 'timestamp': unified_timestamp(e.get('uploadDate')),
1613 # author can be an instance of the 'Organization' or 'Person' types;
1614 # both can have a 'name' property (inherited from the 'Thing' type). [1]
1615 # However, some websites use the 'Text' type instead.
1616 # 1. https://schema.org/VideoObject
1617 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1618 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1619 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1620 'tbr': int_or_none(e.get('bitrate')),
1621 'width': int_or_none(e.get('width')),
1622 'height': int_or_none(e.get('height')),
1623 'view_count': int_or_none(e.get('interactionCount')),
1624 'tags': try_call(lambda: e.get('keywords').split(',')),
1625 })
1626 if is_type(e, 'AudioObject'):
1627 info.update({
1628 'vcodec': 'none',
1629 'abr': int_or_none(e.get('bitrate')),
1630 })
1631 extract_interaction_statistic(e)
1632 extract_chapter_information(e)
1633
1634 def traverse_json_ld(json_ld, at_top_level=True):
1635 for e in variadic(json_ld):
1636 if not isinstance(e, dict):
1637 continue
1638 if at_top_level and '@context' not in e:
1639 continue
1640 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1641 traverse_json_ld(e['@graph'], at_top_level=False)
1642 continue
1643 if expected_type is not None and not is_type(e, expected_type):
1644 continue
1645 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1646 if rating is not None:
1647 info['average_rating'] = rating
1648 if is_type(e, 'TVEpisode', 'Episode'):
1649 episode_name = unescapeHTML(e.get('name'))
1650 info.update({
1651 'episode': episode_name,
1652 'episode_number': int_or_none(e.get('episodeNumber')),
1653 'description': unescapeHTML(e.get('description')),
1654 })
1655 if not info.get('title') and episode_name:
1656 info['title'] = episode_name
1657 part_of_season = e.get('partOfSeason')
1658 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1659 info.update({
1660 'season': unescapeHTML(part_of_season.get('name')),
1661 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1662 })
1663 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1664 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1665 info['series'] = unescapeHTML(part_of_series.get('name'))
1666 elif is_type(e, 'Movie'):
1667 info.update({
1668 'title': unescapeHTML(e.get('name')),
1669 'description': unescapeHTML(e.get('description')),
1670 'duration': parse_duration(e.get('duration')),
1671 'timestamp': unified_timestamp(e.get('dateCreated')),
1672 })
1673 elif is_type(e, 'Article', 'NewsArticle'):
1674 info.update({
1675 'timestamp': parse_iso8601(e.get('datePublished')),
1676 'title': unescapeHTML(e.get('headline')),
1677 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1678 })
1679 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1680 extract_video_object(e['video'][0])
1681 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1682 extract_video_object(e['subjectOf'][0])
1683 elif is_type(e, 'VideoObject', 'AudioObject'):
1684 extract_video_object(e)
1685 if expected_type is None:
1686 continue
1687 else:
1688 break
1689 video = e.get('video')
1690 if is_type(video, 'VideoObject'):
1691 extract_video_object(video)
1692 if expected_type is None:
1693 continue
1694 else:
1695 break
1696
1697 traverse_json_ld(json_ld)
1698 return filter_dict(info)
1699
1700 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1701 return self._parse_json(
1702 self._search_regex(
1703 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1704 webpage, 'next.js data', fatal=fatal, **kw),
1705 video_id, transform_source=transform_source, fatal=fatal)
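# e.g. extracts the JSON payload from
#   <script id="__NEXT_DATA__" type="application/json">{...}</script>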
1706
1707 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1708 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1709 rectx = re.escape(context_name)
1710 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1711 js, arg_keys, arg_vals = self._search_regex(
1712 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1713 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1714 default=NO_DEFAULT if fatal else (None, None, None))
1715 if js is None:
1716 return {}
1717
1718 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1719 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1720
1721 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1722 return traverse_obj(ret, traverse) or {}
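# For illustration, this handles pages serving something like
#   <script>window.__NUXT__=(function(a,b){return {data:[{title:a}]};}("Title",1));</script>
# by JSON-encoding the argument values and substituting them for the
# argument names inside the returned object literal.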
1723
1724 @staticmethod
1725 def _hidden_inputs(html):
1726 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1727 hidden_inputs = {}
1728 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1729 attrs = extract_attributes(input)
1730 if not attrs:
1731 continue
1732 if attrs.get('type') not in ('hidden', 'submit'):
1733 continue
1734 name = attrs.get('name') or attrs.get('id')
1735 value = attrs.get('value')
1736 if name and value is not None:
1737 hidden_inputs[name] = value
1738 return hidden_inputs
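# e.g. (with a hypothetical field name)
#   <input type="hidden" name="csrf_token" value="abc123">
# yields {'csrf_token': 'abc123'}.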
1739
1740 def _form_hidden_inputs(self, form_id, html):
1741 form = self._search_regex(
1742 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1743 html, '%s form' % form_id, group='form')
1744 return self._hidden_inputs(form)
1745
1746 @classproperty(cache=True)
1747 def FormatSort(cls):
1748 class FormatSort(FormatSorter):
1749 def __init__(ie, *args, **kwargs):
1750 super().__init__(ie._downloader, *args, **kwargs)
1751
1752 deprecation_warning(
1753 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1754 'Use yt_dlp.utils.FormatSorter instead')
1755 return FormatSort
1756
1757 def _sort_formats(self, formats, field_preference=[]):
1758 if not field_preference:
1759 self._downloader.deprecation_warning(
1760 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1761 return
1762 self._downloader.deprecation_warning(
1763 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1764 'Return _format_sort_fields in the info_dict instead')
1765 if formats:
1766 formats[0]['__sort_fields'] = field_preference
1767
1768 def _check_formats(self, formats, video_id):
1769 if formats:
1770 formats[:] = filter(
1771 lambda f: self._is_valid_url(
1772 f['url'], video_id,
1773 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1774 formats)
1775
1776 @staticmethod
1777 def _remove_duplicate_formats(formats):
1778 format_urls = set()
1779 unique_formats = []
1780 for f in formats:
1781 if f['url'] not in format_urls:
1782 format_urls.add(f['url'])
1783 unique_formats.append(f)
1784 formats[:] = unique_formats
1785
1786 def _is_valid_url(self, url, video_id, item='video', headers={}):
1787 url = self._proto_relative_url(url, scheme='http:')
1788 # For now, assume non-HTTP(S) URLs are always valid
1789 if not (url.startswith('http://') or url.startswith('https://')):
1790 return True
1791 try:
1792 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1793 return True
1794 except ExtractorError as e:
1795 self.to_screen(
1796 '%s: %s URL is invalid, skipping: %s'
1797 % (video_id, item, error_to_compat_str(e.cause)))
1798 return False
1799
1800 def http_scheme(self):
1801 """ Either "http:" or "https:", depending on the user's preferences """
1802 return (
1803 'http:'
1804 if self.get_param('prefer_insecure', False)
1805 else 'https:')
1806
1807 def _proto_relative_url(self, url, scheme=None):
1808 scheme = scheme or self.http_scheme()
1809 assert scheme.endswith(':')
1810 return sanitize_url(url, scheme=scheme[:-1])
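# e.g. _proto_relative_url('//cdn.example.com/v.mp4') returns
# 'https://cdn.example.com/v.mp4' (or the http: variant with
# --prefer-insecure); the host here is hypothetical.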
1811
1812 def _sleep(self, timeout, video_id, msg_template=None):
1813 if msg_template is None:
1814 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1815 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1816 self.to_screen(msg)
1817 time.sleep(timeout)
1818
1819 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1820 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1821 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1822 if self.get_param('ignore_no_formats_error'):
1823 fatal = False
1824
1825 res = self._download_xml_handle(
1826 manifest_url, video_id, 'Downloading f4m manifest',
1827 'Unable to download f4m manifest',
1828 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1829 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1830 transform_source=transform_source,
1831 fatal=fatal, data=data, headers=headers, query=query)
1832 if res is False:
1833 return []
1834
1835 manifest, urlh = res
1836 manifest_url = urlh.url
1837
1838 return self._parse_f4m_formats(
1839 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1840 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1841
1842 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1843 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1844 fatal=True, m3u8_id=None):
1845 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1846 return []
1847
1848 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1849 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1850 if akamai_pv is not None and ';' in akamai_pv.text:
1851 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1852 if playerVerificationChallenge.strip() != '':
1853 return []
1854
1855 formats = []
1856 manifest_version = '1.0'
1857 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1858 if not media_nodes:
1859 manifest_version = '2.0'
1860 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1861 # Remove unsupported DRM protected media from final formats
1862 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1863 media_nodes = remove_encrypted_media(media_nodes)
1864 if not media_nodes:
1865 return formats
1866
1867 manifest_base_url = get_base_url(manifest)
1868
1869 bootstrap_info = xpath_element(
1870 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1871 'bootstrap info', default=None)
1872
1873 vcodec = None
1874 mime_type = xpath_text(
1875 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1876 'mime type', default=None)
1877 if mime_type and mime_type.startswith('audio/'):
1878 vcodec = 'none'
1879
1880 for i, media_el in enumerate(media_nodes):
1881 tbr = int_or_none(media_el.attrib.get('bitrate'))
1882 width = int_or_none(media_el.attrib.get('width'))
1883 height = int_or_none(media_el.attrib.get('height'))
1884 format_id = join_nonempty(f4m_id, tbr or i)
1885 # If <bootstrapInfo> is present, the specified f4m is a
1886 # stream-level manifest, and only set-level manifests may refer to
1887 # external resources. See section 11.4 and section 4 of F4M spec
1888 if bootstrap_info is None:
1889 media_url = None
1890 # @href is introduced in 2.0, see section 11.6 of F4M spec
1891 if manifest_version == '2.0':
1892 media_url = media_el.attrib.get('href')
1893 if media_url is None:
1894 media_url = media_el.attrib.get('url')
1895 if not media_url:
1896 continue
1897 manifest_url = (
1898 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1899 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1900 # If media_url is itself an f4m manifest, do the recursive extraction,
1901 # since bitrates in the parent manifest (this one) and the media_url manifest
1902 # may differ, leading to an inability to resolve the format by the requested
1903 # bitrate in the f4m downloader
1904 ext = determine_ext(manifest_url)
1905 if ext == 'f4m':
1906 f4m_formats = self._extract_f4m_formats(
1907 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1908 transform_source=transform_source, fatal=fatal)
1909 # Sometimes stream-level manifest contains single media entry that
1910 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1911 # At the same time parent's media entry in set-level manifest may
1912 # contain it. We will copy it from parent in such cases.
1913 if len(f4m_formats) == 1:
1914 f = f4m_formats[0]
1915 f.update({
1916 'tbr': f.get('tbr') or tbr,
1917 'width': f.get('width') or width,
1918 'height': f.get('height') or height,
1919 'format_id': f.get('format_id') if not tbr else format_id,
1920 'vcodec': vcodec,
1921 })
1922 formats.extend(f4m_formats)
1923 continue
1924 elif ext == 'm3u8':
1925 formats.extend(self._extract_m3u8_formats(
1926 manifest_url, video_id, 'mp4', preference=preference,
1927 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1928 continue
1929 formats.append({
1930 'format_id': format_id,
1931 'url': manifest_url,
1932 'manifest_url': manifest_url,
1933 'ext': 'flv' if bootstrap_info is not None else None,
1934 'protocol': 'f4m',
1935 'tbr': tbr,
1936 'width': width,
1937 'height': height,
1938 'vcodec': vcodec,
1939 'preference': preference,
1940 'quality': quality,
1941 })
1942 return formats
1943
1944 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1945 return {
1946 'format_id': join_nonempty(m3u8_id, 'meta'),
1947 'url': m3u8_url,
1948 'ext': ext,
1949 'protocol': 'm3u8',
1950 'preference': preference - 100 if preference else -100,
1951 'quality': quality,
1952 'resolution': 'multiple',
1953 'format_note': 'Quality selection URL',
1954 }
1955
1956 def _report_ignoring_subs(self, name):
1957 self.report_warning(bug_reports_message(
1958 f'Ignoring subtitle tracks found in the {name} manifest; '
1959 'if any subtitle tracks are missing,'
1960 ), only_once=True)
1961
1962 def _extract_m3u8_formats(self, *args, **kwargs):
1963 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1964 if subs:
1965 self._report_ignoring_subs('HLS')
1966 return fmts
1967
1968 def _extract_m3u8_formats_and_subtitles(
1969 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1970 preference=None, quality=None, m3u8_id=None, note=None,
1971 errnote=None, fatal=True, live=False, data=None, headers={},
1972 query={}):
1973
1974 if self.get_param('ignore_no_formats_error'):
1975 fatal = False
1976
1977 if not m3u8_url:
1978 if errnote is not False:
1979 errnote = errnote or 'Failed to obtain m3u8 URL'
1980 if fatal:
1981 raise ExtractorError(errnote, video_id=video_id)
1982 self.report_warning(f'{errnote}{bug_reports_message()}')
1983 return [], {}
1984
1985 res = self._download_webpage_handle(
1986 m3u8_url, video_id,
1987 note='Downloading m3u8 information' if note is None else note,
1988 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1989 fatal=fatal, data=data, headers=headers, query=query)
1990
1991 if res is False:
1992 return [], {}
1993
1994 m3u8_doc, urlh = res
1995 m3u8_url = urlh.url
1996
1997 return self._parse_m3u8_formats_and_subtitles(
1998 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1999 preference=preference, quality=quality, m3u8_id=m3u8_id,
2000 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
2001 headers=headers, query=query, video_id=video_id)
2002
2003 def _parse_m3u8_formats_and_subtitles(
2004 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
2005 preference=None, quality=None, m3u8_id=None, live=False, note=None,
2006 errnote=None, fatal=True, data=None, headers={}, query={},
2007 video_id=None):
2008 formats, subtitles = [], {}
2009 has_drm = HlsFD._has_drm(m3u8_doc)
2010
2011 def format_url(url):
2012 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
2013
2014 if self.get_param('hls_split_discontinuity', False):
2015 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2016 if not m3u8_doc:
2017 if not manifest_url:
2018 return []
2019 m3u8_doc = self._download_webpage(
2020 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2021 note=False, errnote='Failed to download m3u8 playlist information')
2022 if m3u8_doc is False:
2023 return []
2024 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2025
2026 else:
2027 def _extract_m3u8_playlist_indices(*args, **kwargs):
2028 return [None]
2029
2030 # References:
2031 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2032 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2033 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2034
2035 # We should try extracting formats only from master playlists [1, 4.3.4],
2036 # i.e. playlists that describe the available qualities. On the other hand,
2037 # media playlists [1, 4.3.3] should be returned as is since they contain
2038 # just the media without quality renditions.
2039 # Fortunately, a master playlist can easily be distinguished from a media
2040 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2041 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2042 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2043 # media playlist and MUST NOT appear in a master playlist, thus we can
2044 # reliably detect a media playlist with this criterion.
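# For illustration, a minimal master playlist looks like
#   #EXTM3U
#   #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#   low/index.m3u8
# whereas a media playlist lists the segments themselves:
#   #EXTM3U
#   #EXT-X-TARGETDURATION:10
#   #EXTINF:9.009,
#   seg0.ts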
2045
2046 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2047 formats = [{
2048 'format_id': join_nonempty(m3u8_id, idx),
2049 'format_index': idx,
2050 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2051 'ext': ext,
2052 'protocol': entry_protocol,
2053 'preference': preference,
2054 'quality': quality,
2055 'has_drm': has_drm,
2056 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2057
2058 return formats, subtitles
2059
2060 groups = {}
2061 last_stream_inf = {}
2062
2063 def extract_media(x_media_line):
2064 media = parse_m3u8_attributes(x_media_line)
2065 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2066 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2067 if not (media_type and group_id and name):
2068 return
2069 groups.setdefault(group_id, []).append(media)
2070 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2071 if media_type == 'SUBTITLES':
2072 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2073 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2074 # However, lack of URI has been spotted in the wild.
2075 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2076 if not media.get('URI'):
2077 return
2078 url = format_url(media['URI'])
2079 sub_info = {
2080 'url': url,
2081 'ext': determine_ext(url),
2082 }
2083 if sub_info['ext'] == 'm3u8':
2084 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2085 # files may contain is WebVTT:
2086 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2087 sub_info['ext'] = 'vtt'
2088 sub_info['protocol'] = 'm3u8_native'
2089 lang = media.get('LANGUAGE') or 'und'
2090 subtitles.setdefault(lang, []).append(sub_info)
2091 if media_type not in ('VIDEO', 'AUDIO'):
2092 return
2093 media_url = media.get('URI')
2094 if media_url:
2095 manifest_url = format_url(media_url)
2096 formats.extend({
2097 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2098 'format_note': name,
2099 'format_index': idx,
2100 'url': manifest_url,
2101 'manifest_url': m3u8_url,
2102 'language': media.get('LANGUAGE'),
2103 'ext': ext,
2104 'protocol': entry_protocol,
2105 'preference': preference,
2106 'quality': quality,
2107 'has_drm': has_drm,
2108 'vcodec': 'none' if media_type == 'AUDIO' else None,
2109 } for idx in _extract_m3u8_playlist_indices(manifest_url))
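# e.g. extract_media() handles lines such as
#   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",LANGUAGE="en",URI="audio/en.m3u8"
# registering the rendition under groups['aud'] and emitting audio-only
# formats for its URI.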
2110
2111 def build_stream_name():
2112 # Although the specification does not mention a NAME attribute for
2113 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2114 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2115 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2116 stream_name = last_stream_inf.get('NAME')
2117 if stream_name:
2118 return stream_name
2119 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
2120 # from corresponding rendition group
2121 stream_group_id = last_stream_inf.get('VIDEO')
2122 if not stream_group_id:
2123 return
2124 stream_group = groups.get(stream_group_id)
2125 if not stream_group:
2126 return stream_group_id
2127 rendition = stream_group[0]
2128 return rendition.get('NAME') or stream_group_id
2129
2130 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
2131 # chance to detect video only formats when EXT-X-STREAM-INF tags
2132 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
2133 for line in m3u8_doc.splitlines():
2134 if line.startswith('#EXT-X-MEDIA:'):
2135 extract_media(line)
2136
2137 for line in m3u8_doc.splitlines():
2138 if line.startswith('#EXT-X-STREAM-INF:'):
2139 last_stream_inf = parse_m3u8_attributes(line)
2140 elif line.startswith('#') or not line.strip():
2141 continue
2142 else:
2143 tbr = float_or_none(
2144 last_stream_inf.get('AVERAGE-BANDWIDTH')
2145 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2146 manifest_url = format_url(line.strip())
2147
2148 for idx in _extract_m3u8_playlist_indices(manifest_url):
2149 format_id = [m3u8_id, None, idx]
2150 # Bandwidth of live streams may differ over time thus making
2151 # format_id unpredictable. So it's better to keep provided
2152 # format_id intact.
2153 if not live:
2154 stream_name = build_stream_name()
2155 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2156 f = {
2157 'format_id': join_nonempty(*format_id),
2158 'format_index': idx,
2159 'url': manifest_url,
2160 'manifest_url': m3u8_url,
2161 'tbr': tbr,
2162 'ext': ext,
2163 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2164 'protocol': entry_protocol,
2165 'preference': preference,
2166 'quality': quality,
2167 'has_drm': has_drm,
2168 }
2169 resolution = last_stream_inf.get('RESOLUTION')
2170 if resolution:
2171 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2172 if mobj:
2173 f['width'] = int(mobj.group('width'))
2174 f['height'] = int(mobj.group('height'))
2175 # Unified Streaming Platform
2176 mobj = re.search(
2177 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2178 if mobj:
2179 abr, vbr = mobj.groups()
2180 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2181 f.update({
2182 'vbr': vbr,
2183 'abr': abr,
2184 })
2185 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2186 f.update(codecs)
2187 audio_group_id = last_stream_inf.get('AUDIO')
2188 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2189 # references a rendition group MUST have a CODECS attribute.
2190 # However, this is not always respected. E.g. [2]
2191 # contains an EXT-X-STREAM-INF tag which references an AUDIO
2192 # rendition group but does not have CODECS; despite
2193 # referencing an audio group, it represents a complete
2194 # format (with audio and video). So, for such cases we will
2195 # ignore references to rendition groups and treat them
2196 # as complete formats.
2197 if audio_group_id and codecs and f.get('vcodec') != 'none':
2198 audio_group = groups.get(audio_group_id)
2199 if audio_group and audio_group[0].get('URI'):
2200 # TODO: update acodec for audio only formats with
2201 # the same GROUP-ID
2202 f['acodec'] = 'none'
2203 if not f.get('ext'):
2204 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2205 formats.append(f)
2206
2207 # for DailyMotion
2208 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2209 if progressive_uri:
2210 http_f = f.copy()
2211 del http_f['manifest_url']
2212 http_f.update({
2213 'format_id': f['format_id'].replace('hls-', 'http-'),
2214 'protocol': 'http',
2215 'url': progressive_uri,
2216 })
2217 formats.append(http_f)
2218
2219 last_stream_inf = {}
2220 return formats, subtitles
2221
2222 def _extract_m3u8_vod_duration(
2223 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2224
2225 m3u8_vod = self._download_webpage(
2226 m3u8_vod_url, video_id,
2227 note='Downloading m3u8 VOD manifest' if note is None else note,
2228 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2229 fatal=False, data=data, headers=headers, query=query)
2230
2231 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2232
2233 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2234 if '#EXT-X-ENDLIST' not in m3u8_vod:
2235 return None
2236
2237 return int(sum(
2238 float(line[len('#EXTINF:'):].split(',')[0])
2239 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
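# e.g. a VOD playlist containing '#EXTINF:9.009,' and '#EXTINF:3.003,'
# (plus #EXT-X-ENDLIST) yields int(9.009 + 3.003) == 12.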
2240
2241 def _extract_mpd_vod_duration(
2242 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2243
2244 mpd_doc = self._download_xml(
2245 mpd_url, video_id,
2246 note='Downloading MPD VOD manifest' if note is None else note,
2247 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2248 fatal=False, data=data, headers=headers, query=query)
2249 if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2250 return None
2251 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2252
2253 @staticmethod
2254 def _xpath_ns(path, namespace=None):
2255 if not namespace:
2256 return path
2257 out = []
2258 for c in path.split('/'):
2259 if not c or c == '.':
2260 out.append(c)
2261 else:
2262 out.append('{%s}%s' % (namespace, c))
2263 return '/'.join(out)
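# e.g. _xpath_ns('./head/meta', 'urn:example') returns
# './{urn:example}head/{urn:example}meta' (namespace URI illustrative).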
2264
2265 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2266 if self.get_param('ignore_no_formats_error'):
2267 fatal = False
2268
2269 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2270 if res is False:
2271 assert not fatal
2272 return [], {}
2273 smil, urlh = res
2274
2275 return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2276 namespace=self._parse_smil_namespace(smil))
2277
2278 def _extract_smil_formats(self, *args, **kwargs):
2279 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2280 if subs:
2281 self._report_ignoring_subs('SMIL')
2282 return fmts
2283
2284 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2285 res = self._download_smil(smil_url, video_id, fatal=fatal)
2286 if res is False:
2287 return {}
2288
2289 smil, urlh = res
2290 smil_url = urlh.url
2291
2292 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2293
2294 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2295 return self._download_xml_handle(
2296 smil_url, video_id, 'Downloading SMIL file',
2297 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2298
2299 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2300 namespace = self._parse_smil_namespace(smil)
2301
2302 formats, subtitles = self._parse_smil_formats_and_subtitles(
2303 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2304
2305 video_id = os.path.splitext(url_basename(smil_url))[0]
2306 title = None
2307 description = None
2308 upload_date = None
2309 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2310 name = meta.attrib.get('name')
2311 content = meta.attrib.get('content')
2312 if not name or not content:
2313 continue
2314 if not title and name == 'title':
2315 title = content
2316 elif not description and name in ('description', 'abstract'):
2317 description = content
2318 elif not upload_date and name == 'date':
2319 upload_date = unified_strdate(content)
2320
2321 thumbnails = [{
2322 'id': image.get('type'),
2323 'url': image.get('src'),
2324 'width': int_or_none(image.get('width')),
2325 'height': int_or_none(image.get('height')),
2326 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2327
2328 return {
2329 'id': video_id,
2330 'title': title or video_id,
2331 'description': description,
2332 'upload_date': upload_date,
2333 'thumbnails': thumbnails,
2334 'formats': formats,
2335 'subtitles': subtitles,
2336 }
2337
2338 def _parse_smil_namespace(self, smil):
2339 return self._search_regex(
2340 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2341
2342 def _parse_smil_formats(self, *args, **kwargs):
2343 fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2344 if subs:
2345 self._report_ignoring_subs('SMIL')
2346 return fmts
2347
2348 def _parse_smil_formats_and_subtitles(
2349 self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2350 base = smil_url
2351 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2352 b = meta.get('base') or meta.get('httpBase')
2353 if b:
2354 base = b
2355 break
2356
2357 formats, subtitles = [], {}
2358 rtmp_count = 0
2359 http_count = 0
2360 m3u8_count = 0
2361 imgs_count = 0
2362
2363 srcs = set()
2364 media = itertools.chain.from_iterable(
2365 smil.findall(self._xpath_ns(arg, namespace))
2366 for arg in ['.//video', './/audio', './/media'])
2367 for medium in media:
2368 src = medium.get('src')
2369 if not src or src in srcs:
2370 continue
2371 srcs.add(src)
2372
2373 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2374 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2375 width = int_or_none(medium.get('width'))
2376 height = int_or_none(medium.get('height'))
2377 proto = medium.get('proto')
2378 ext = medium.get('ext')
2379 src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2380 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2381 streamer = medium.get('streamer') or base
2382
2383 if proto == 'rtmp' or streamer.startswith('rtmp'):
2384 rtmp_count += 1
2385 formats.append({
2386 'url': streamer,
2387 'play_path': src,
2388 'ext': 'flv',
2389 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2390 'tbr': bitrate,
2391 'filesize': filesize,
2392 'width': width,
2393 'height': height,
2394 })
2395 if transform_rtmp_url:
2396 streamer, src = transform_rtmp_url(streamer, src)
2397 formats[-1].update({
2398 'url': streamer,
2399 'play_path': src,
2400 })
2401 continue
2402
2403 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2404 src_url = src_url.strip()
2405
2406 if proto == 'm3u8' or src_ext == 'm3u8':
2407 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2408 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2409 self._merge_subtitles(m3u8_subs, target=subtitles)
2410 if len(m3u8_formats) == 1:
2411 m3u8_count += 1
2412 m3u8_formats[0].update({
2413 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2414 'tbr': bitrate,
2415 'width': width,
2416 'height': height,
2417 })
2418 formats.extend(m3u8_formats)
2419 elif src_ext == 'f4m':
2420 f4m_url = src_url
2421 if not f4m_params:
2422 f4m_params = {
2423 'hdcore': '3.2.0',
2424 'plugin': 'flowplayer-3.2.0.1',
2425 }
2426 f4m_url += '&' if '?' in f4m_url else '?'
2427 f4m_url += urllib.parse.urlencode(f4m_params)
2428 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2429 elif src_ext == 'mpd':
2430 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2431 src_url, video_id, mpd_id='dash', fatal=False)
2432 formats.extend(mpd_formats)
2433 self._merge_subtitles(mpd_subs, target=subtitles)
2434 elif re.search(r'\.ism/[Mm]anifest', src_url):
2435 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2436 src_url, video_id, ism_id='mss', fatal=False)
2437 formats.extend(ism_formats)
2438 self._merge_subtitles(ism_subs, target=subtitles)
2439 elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
2440 http_count += 1
2441 formats.append({
2442 'url': src_url,
2443 'ext': ext or src_ext or 'flv',
2444 'format_id': 'http-%d' % (bitrate or http_count),
2445 'tbr': bitrate,
2446 'filesize': filesize,
2447 'width': width,
2448 'height': height,
2449 })
2450
2451 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2452 src = medium.get('src')
2453 if not src or src in srcs:
2454 continue
2455 srcs.add(src)
2456
2457 imgs_count += 1
2458 formats.append({
2459 'format_id': 'imagestream-%d' % (imgs_count),
2460 'url': src,
2461 'ext': mimetype2ext(medium.get('type')),
2462 'acodec': 'none',
2463 'vcodec': 'none',
2464 'width': int_or_none(medium.get('width')),
2465 'height': int_or_none(medium.get('height')),
2466 'format_note': 'SMIL storyboards',
2467 })
2468
2469 smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2470 self._merge_subtitles(smil_subs, target=subtitles)
2471
2472 return formats, subtitles
2473
2474 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2475 urls = []
2476 subtitles = {}
2477 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2478 src = textstream.get('src')
2479 if not src or src in urls:
2480 continue
2481 urls.append(src)
2482 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2483 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2484 subtitles.setdefault(lang, []).append({
2485 'url': src,
2486 'ext': ext,
2487 })
2488 return subtitles
2489
2490 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2491 res = self._download_xml_handle(
2492 xspf_url, playlist_id, 'Downloading xspf playlist',
2493 'Unable to download xspf manifest', fatal=fatal)
2494 if res is False:
2495 return []
2496
2497 xspf, urlh = res
2498 xspf_url = urlh.url
2499
2500 return self._parse_xspf(
2501 xspf, playlist_id, xspf_url=xspf_url,
2502 xspf_base_url=base_url(xspf_url))
2503
2504 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2505 NS_MAP = {
2506 'xspf': 'http://xspf.org/ns/0/',
2507 's1': 'http://static.streamone.nl/player/ns/0',
2508 }
2509
2510 entries = []
2511 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2512 title = xpath_text(
2513 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2514 description = xpath_text(
2515 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2516 thumbnail = xpath_text(
2517 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2518 duration = float_or_none(
2519 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2520
2521 formats = []
2522 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2523 format_url = urljoin(xspf_base_url, location.text)
2524 if not format_url:
2525 continue
2526 formats.append({
2527 'url': format_url,
2528 'manifest_url': xspf_url,
2529 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2530 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2531 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2532 })
2533
2534 entries.append({
2535 'id': playlist_id,
2536 'title': title,
2537 'description': description,
2538 'thumbnail': thumbnail,
2539 'duration': duration,
2540 'formats': formats,
2541 })
2542 return entries
2543
2544 def _extract_mpd_formats(self, *args, **kwargs):
2545 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2546 if subs:
2547 self._report_ignoring_subs('DASH')
2548 return fmts
2549
2550 def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2551 periods = self._extract_mpd_periods(*args, **kwargs)
2552 return self._merge_mpd_periods(periods)
2553
2554 def _extract_mpd_periods(
2555 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2556 fatal=True, data=None, headers={}, query={}):
2557
2558 if self.get_param('ignore_no_formats_error'):
2559 fatal = False
2560
2561 res = self._download_xml_handle(
2562 mpd_url, video_id,
2563 note='Downloading MPD manifest' if note is None else note,
2564 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2565 fatal=fatal, data=data, headers=headers, query=query)
2566 if res is False:
2567 return []
2568 mpd_doc, urlh = res
2569 if mpd_doc is None:
2570 return []
2571
2572 # We could have been redirected to a new url when we retrieved our mpd file.
2573 mpd_url = urlh.url
2574 mpd_base_url = base_url(mpd_url)
2575
2576 return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2577
2578 def _parse_mpd_formats(self, *args, **kwargs):
2579 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2580 if subs:
2581 self._report_ignoring_subs('DASH')
2582 return fmts
2583
2584 def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2585 periods = self._parse_mpd_periods(*args, **kwargs)
2586 return self._merge_mpd_periods(periods)
2587
2588 def _merge_mpd_periods(self, periods):
2589 """
2590 Combine all formats and subtitles from an MPD manifest into a single list,
2591 by concatenating streams with similar formats.
2592 """
2593 formats, subtitles = {}, {}
2594 for period in periods:
2595 for f in period['formats']:
2596 assert 'is_dash_periods' not in f, 'format already processed'
2597 f['is_dash_periods'] = True
2598 format_key = tuple(v for k, v in f.items() if k not in (
2599 'format_id', 'fragments', 'manifest_stream_number'))
2600 if format_key not in formats:
2601 formats[format_key] = f
2602 elif 'fragments' in f:
2603 formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2604
2605 if subtitles and period['subtitles']:
2606 self.report_warning(bug_reports_message(
2607 'Found subtitles in multiple periods in the DASH manifest; '
2608 'if part of the subtitles are missing,'
2609 ), only_once=True)
2610
2611 for sub_lang, sub_info in period['subtitles'].items():
2612 subtitles.setdefault(sub_lang, []).extend(sub_info)
2613
2614 return list(formats.values()), subtitles
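# e.g. if two periods each carry a 1080p stream whose attributes (other
# than format_id, fragments and manifest_stream_number) are identical,
# they are merged into a single format whose 'fragments' list is the
# concatenation of both periods' fragments.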
2615
2616 def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2617 """
2618 Parse formats from MPD manifest.
2619 References:
2620 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2621 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2622 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2623 """
2624 if not self.get_param('dynamic_mpd', True):
2625 if mpd_doc.get('type') == 'dynamic':
2626 return [], {}
2627
2628 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2629
2630 def _add_ns(path):
2631 return self._xpath_ns(path, namespace)
2632
2633 def is_drm_protected(element):
2634 return element.find(_add_ns('ContentProtection')) is not None
2635
2636 def extract_multisegment_info(element, ms_parent_info):
2637 ms_info = ms_parent_info.copy()
2638
2639 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2640 # common attributes and elements. We will only extract those
2641 # relevant for us.
2642 def extract_common(source):
2643 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2644 if segment_timeline is not None:
2645 s_e = segment_timeline.findall(_add_ns('S'))
2646 if s_e:
2647 ms_info['total_number'] = 0
2648 ms_info['s'] = []
2649 for s in s_e:
2650 r = int(s.get('r', 0))
2651 ms_info['total_number'] += 1 + r
2652 ms_info['s'].append({
2653 't': int(s.get('t', 0)),
2654 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2655 'd': int(s.attrib['d']),
2656 'r': r,
2657 })
2658 start_number = source.get('startNumber')
2659 if start_number:
2660 ms_info['start_number'] = int(start_number)
2661 timescale = source.get('timescale')
2662 if timescale:
2663 ms_info['timescale'] = int(timescale)
2664 segment_duration = source.get('duration')
2665 if segment_duration:
2666 ms_info['segment_duration'] = float(segment_duration)
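# e.g. <S t="0" d="90000" r="2"/> with timescale=90000 describes three
# consecutive one-second segments: the segment itself plus r=2 repeats.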
2667
2668 def extract_Initialization(source):
2669 initialization = source.find(_add_ns('Initialization'))
2670 if initialization is not None:
2671 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2672
2673 segment_list = element.find(_add_ns('SegmentList'))
2674 if segment_list is not None:
2675 extract_common(segment_list)
2676 extract_Initialization(segment_list)
2677 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2678 if segment_urls_e:
2679 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2680 else:
2681 segment_template = element.find(_add_ns('SegmentTemplate'))
2682 if segment_template is not None:
2683 extract_common(segment_template)
2684 media = segment_template.get('media')
2685 if media:
2686 ms_info['media'] = media
2687 initialization = segment_template.get('initialization')
2688 if initialization:
2689 ms_info['initialization'] = initialization
2690 else:
2691 extract_Initialization(segment_template)
2692 return ms_info
2693
2694 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2695 stream_numbers = collections.defaultdict(int)
2696 for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2697 period_entry = {
2698 'id': period.get('id', f'period-{period_idx}'),
2699 'formats': [],
2700 'subtitles': collections.defaultdict(list),
2701 }
2702 period_duration = parse_duration(period.get('duration')) or mpd_duration
2703 period_ms_info = extract_multisegment_info(period, {
2704 'start_number': 1,
2705 'timescale': 1,
2706 })
2707 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2708 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2709 for representation in adaptation_set.findall(_add_ns('Representation')):
2710 representation_attrib = adaptation_set.attrib.copy()
2711 representation_attrib.update(representation.attrib)
2712 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2713 mime_type = representation_attrib['mimeType']
2714 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2715
2716 codec_str = representation_attrib.get('codecs', '')
2717 # Some kind of binary subtitle found in some youtube livestreams
2718 if mime_type == 'application/x-rawcc':
2719 codecs = {'scodec': codec_str}
2720 else:
2721 codecs = parse_codecs(codec_str)
2722 if content_type not in ('video', 'audio', 'text'):
2723 if mime_type == 'image/jpeg':
2724 content_type = mime_type
2725 elif codecs.get('vcodec', 'none') != 'none':
2726 content_type = 'video'
2727 elif codecs.get('acodec', 'none') != 'none':
2728 content_type = 'audio'
2729 elif codecs.get('scodec', 'none') != 'none':
2730 content_type = 'text'
2731 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2732 content_type = 'text'
2733 else:
2734 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2735 continue
2736
2737 base_url = ''
2738 for element in (representation, adaptation_set, period, mpd_doc):
2739 base_url_e = element.find(_add_ns('BaseURL'))
2740 if try_call(lambda: base_url_e.text) is not None:
2741 base_url = base_url_e.text + base_url
2742 if re.match(r'^https?://', base_url):
2743 break
2744 if mpd_base_url and base_url.startswith('/'):
2745 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2746 elif mpd_base_url and not re.match(r'^https?://', base_url):
2747 if not mpd_base_url.endswith('/'):
2748 mpd_base_url += '/'
2749 base_url = mpd_base_url + base_url
2750 representation_id = representation_attrib.get('id')
2751 lang = representation_attrib.get('lang')
2752 url_el = representation.find(_add_ns('BaseURL'))
2753 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2754 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2755 if representation_id is not None:
2756 format_id = representation_id
2757 else:
2758 format_id = content_type
2759 if mpd_id:
2760 format_id = mpd_id + '-' + format_id
2761 if content_type in ('video', 'audio'):
2762 f = {
2763 'format_id': format_id,
2764 'manifest_url': mpd_url,
2765 'ext': mimetype2ext(mime_type),
2766 'width': int_or_none(representation_attrib.get('width')),
2767 'height': int_or_none(representation_attrib.get('height')),
2768 'tbr': float_or_none(bandwidth, 1000),
2769 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2770 'fps': int_or_none(representation_attrib.get('frameRate')),
2771 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2772 'format_note': 'DASH %s' % content_type,
2773 'filesize': filesize,
2774 'container': mimetype2ext(mime_type) + '_dash',
2775 **codecs
2776 }
2777 elif content_type == 'text':
2778 f = {
2779 'ext': mimetype2ext(mime_type),
2780 'manifest_url': mpd_url,
2781 'filesize': filesize,
2782 }
2783 elif content_type == 'image/jpeg':
2784 # See test case in VikiIE
2785 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2786 f = {
2787 'format_id': format_id,
2788 'ext': 'mhtml',
2789 'manifest_url': mpd_url,
2790 'format_note': 'DASH storyboards (jpeg)',
2791 'acodec': 'none',
2792 'vcodec': 'none',
2793 }
2794 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2795 f['has_drm'] = True
2796 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2797
2798 def prepare_template(template_name, identifiers):
2799 tmpl = representation_ms_info[template_name]
2800 if representation_id is not None:
2801 tmpl = tmpl.replace('$RepresentationID$', representation_id)
2802 # First off, % characters outside $...$ templates
2803 # must be escaped by doubling for proper processing
2804 # by % operator string formatting used further (see
2805 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2806 t = ''
2807 in_template = False
2808 for c in tmpl:
2809 t += c
2810 if c == '$':
2811 in_template = not in_template
2812 elif c == '%' and not in_template:
2813 t += c
2814 # Next, $...$ templates are translated to their
2815 # %(...) counterparts to be used with % operator
2816 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2817 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2818 t = t.replace('$$', '$')
2819 return t
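# e.g. a media template 'seg-$Number%05d$.m4s' becomes
# 'seg-%(Number)05d.m4s', so that
#   'seg-%(Number)05d.m4s' % {'Number': 7}
# evaluates to 'seg-00007.m4s' (template string illustrative).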
2820
2821 # @initialization is a regular template like the @media one,
2822 # so it should be handled in just the same way (see
2823 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2824 if 'initialization' in representation_ms_info:
2825 initialization_template = prepare_template(
2826 'initialization',
2827 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2828 # $Time$ shall not be included for @initialization thus
2829 # only $Bandwidth$ remains
2830 ('Bandwidth', ))
2831 representation_ms_info['initialization_url'] = initialization_template % {
2832 'Bandwidth': bandwidth,
2833 }
2834
2835 def location_key(location):
2836 return 'url' if re.match(r'^https?://', location) else 'path'
2837
2838 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2839
2840 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2841 media_location_key = location_key(media_template)
2842
2843 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2844 # can't be used at the same time
2845 if '%(Number' in media_template and 's' not in representation_ms_info:
2846 segment_duration = None
2847 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2848 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2849 representation_ms_info['total_number'] = int(math.ceil(
2850 float_or_none(period_duration, segment_duration, default=0)))
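# e.g. (hypothetical numbers) a 60s period with a segment_duration
# of 360000 at timescale 60000 gives 6s segments, hence
# ceil(60 / 6) = 10 fragments in total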
2851 representation_ms_info['fragments'] = [{
2852 media_location_key: media_template % {
2853 'Number': segment_number,
2854 'Bandwidth': bandwidth,
2855 },
2856 'duration': segment_duration,
2857 } for segment_number in range(
2858 representation_ms_info['start_number'],
2859 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2860 else:
2861 # $Number*$ or $Time$ in media template with S list available
2862 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2863 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
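# For instance, a hypothetical SegmentTimeline entry
# <S t="0" d="90000" r="2"/> at timescale 90000 expands below into
# three 1-second fragments with $Time$ values 0, 90000 and 180000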
2864 representation_ms_info['fragments'] = []
2865 segment_time = 0
2866 segment_d = None
2867 segment_number = representation_ms_info['start_number']
2868
2869 def add_segment_url():
2870 segment_url = media_template % {
2871 'Time': segment_time,
2872 'Bandwidth': bandwidth,
2873 'Number': segment_number,
2874 }
2875 representation_ms_info['fragments'].append({
2876 media_location_key: segment_url,
2877 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2878 })
2879
2880 for num, s in enumerate(representation_ms_info['s']):
2881 segment_time = s.get('t') or segment_time
2882 segment_d = s['d']
2883 add_segment_url()
2884 segment_number += 1
2885 for r in range(s.get('r', 0)):
2886 segment_time += segment_d
2887 add_segment_url()
2888 segment_number += 1
2889 segment_time += segment_d
2890 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2891 # No media template,
2892 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2893 # or any YouTube dashsegments video
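# Each <S> entry is consumed s['r'] + 1 times here, pairing every
# repetition with the next URL from segment_urls in order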
2894 fragments = []
2895 segment_index = 0
2896 timescale = representation_ms_info['timescale']
2897 for s in representation_ms_info['s']:
2898 duration = float_or_none(s['d'], timescale)
2899 for r in range(s.get('r', 0) + 1):
2900 segment_uri = representation_ms_info['segment_urls'][segment_index]
2901 fragments.append({
2902 location_key(segment_uri): segment_uri,
2903 'duration': duration,
2904 })
2905 segment_index += 1
2906 representation_ms_info['fragments'] = fragments
2907 elif 'segment_urls' in representation_ms_info:
2908 # Segment URLs with no SegmentTimeline
2909 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2910 # https://github.com/ytdl-org/youtube-dl/pull/14844
2911 fragments = []
2912 segment_duration = float_or_none(
2913 representation_ms_info['segment_duration'],
2914 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2915 for segment_url in representation_ms_info['segment_urls']:
2916 fragment = {
2917 location_key(segment_url): segment_url,
2918 }
2919 if segment_duration:
2920 fragment['duration'] = segment_duration
2921 fragments.append(fragment)
2922 representation_ms_info['fragments'] = fragments
2923 # If a 'fragments' key is present, we have correctly recognized fragmented media.
2924 # Otherwise we assume unfragmented media with direct access. Technically, this
2925 # assumption is not necessarily correct, since we may simply not support some
2926 # forms of fragmented media renditions yet, but for now we use this fallback.
2927 if 'fragments' in representation_ms_info:
2928 f.update({
2929 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2930 'url': mpd_url or base_url,
2931 'fragment_base_url': base_url,
2932 'fragments': [],
2933 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2934 })
2935 if 'initialization_url' in representation_ms_info:
2936 initialization_url = representation_ms_info['initialization_url']
2937 if not f.get('url'):
2938 f['url'] = initialization_url
2939 f['fragments'].append({location_key(initialization_url): initialization_url})
2940 f['fragments'].extend(representation_ms_info['fragments'])
2941 if not period_duration:
2942 period_duration = try_get(
2943 representation_ms_info,
2944 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2945 else:
2946 # Assuming direct URL to unfragmented media.
2947 f['url'] = base_url
2948 if content_type in ('video', 'audio', 'image/jpeg'):
2949 f['manifest_stream_number'] = stream_numbers[f['url']]
2950 stream_numbers[f['url']] += 1
2951 period_entry['formats'].append(f)
2952 elif content_type == 'text':
2953 period_entry['subtitles'][lang or 'und'].append(f)
2954 yield period_entry
2955
2956 def _extract_ism_formats(self, *args, **kwargs):
2957 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2958 if subs:
2959 self._report_ignoring_subs('ISM')
2960 return fmts
2961
2962 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2963 if self.get_param('ignore_no_formats_error'):
2964 fatal = False
2965
2966 res = self._download_xml_handle(
2967 ism_url, video_id,
2968 note='Downloading ISM manifest' if note is None else note,
2969 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2970 fatal=fatal, data=data, headers=headers, query=query)
2971 if res is False:
2972 return [], {}
2973 ism_doc, urlh = res
2974 if ism_doc is None:
2975 return [], {}
2976
2977 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2978
2979 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2980 """
2981 Parse formats from ISM manifest.
2982 References:
2983 1. [MS-SSTR]: Smooth Streaming Protocol,
2984 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2985 """
2986 if ism_doc.get('IsLive') == 'TRUE':
2987 return [], {}
2988
2989 duration = int(ism_doc.attrib['Duration'])
2990 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2991
2992 formats = []
2993 subtitles = {}
2994 for stream in ism_doc.findall('StreamIndex'):
2995 stream_type = stream.get('Type')
2996 if stream_type not in ('video', 'audio', 'text'):
2997 continue
2998 url_pattern = stream.attrib['Url']
2999 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
3000 stream_name = stream.get('Name')
3001 stream_language = stream.get('Language', 'und')
3002 for track in stream.findall('QualityLevel'):
3003 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
3004 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
3005 # TODO: add support for WVC1 and WMAP
3006 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
3007 self.report_warning('%s is not a supported codec' % fourcc)
3008 continue
3009 tbr = int(track.attrib['Bitrate']) // 1000
3010 # [1] does not mention Width and Height attributes. However,
3011 # they're often present while MaxWidth and MaxHeight are
3012 # missing, so should be used as fallbacks
3013 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
3014 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3015 sampling_rate = int_or_none(track.get('SamplingRate'))
3016
3017 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3018 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
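# e.g. the common [MS-SSTR] pattern
# 'QualityLevels({bitrate})/Fragments(video={start time})' becomes
# 'QualityLevels(1500000)/Fragments(video={start time})' for a
# 1500000 bps track; {start time} is then filled in per fragment below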
3019
3020 fragments = []
3021 fragment_ctx = {
3022 'time': 0,
3023 }
3024 stream_fragments = stream.findall('c')
3025 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3026 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3027 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3028 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3029 if not fragment_ctx['duration']:
3030 try:
3031 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
3032 except IndexError:
3033 next_fragment_time = duration
3034 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3035 for _ in range(fragment_repeat):
3036 fragments.append({
3037 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3038 'duration': fragment_ctx['duration'] / stream_timescale,
3039 })
3040 fragment_ctx['time'] += fragment_ctx['duration']
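# e.g. a hypothetical <c t="0" d="20000000"/> at the default
# timescale of 10000000 yields a single 2.0s fragment; when @d is
# missing, the duration is inferred from the next fragment's @t
# (or from the total duration for the last fragment)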
3041
3042 if stream_type == 'text':
3043 subtitles.setdefault(stream_language, []).append({
3044 'ext': 'ismt',
3045 'protocol': 'ism',
3046 'url': ism_url,
3047 'manifest_url': ism_url,
3048 'fragments': fragments,
3049 '_download_params': {
3050 'stream_type': stream_type,
3051 'duration': duration,
3052 'timescale': stream_timescale,
3053 'fourcc': fourcc,
3054 'language': stream_language,
3055 'codec_private_data': track.get('CodecPrivateData'),
3056 }
3057 })
3058 elif stream_type in ('video', 'audio'):
3059 formats.append({
3060 'format_id': join_nonempty(ism_id, stream_name, tbr),
3061 'url': ism_url,
3062 'manifest_url': ism_url,
3063 'ext': 'ismv' if stream_type == 'video' else 'isma',
3064 'width': width,
3065 'height': height,
3066 'tbr': tbr,
3067 'asr': sampling_rate,
3068 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3069 'acodec': 'none' if stream_type == 'video' else fourcc,
3070 'protocol': 'ism',
3071 'fragments': fragments,
3072 'has_drm': ism_doc.find('Protection') is not None,
3073 'language': stream_language,
3074 'audio_channels': int_or_none(track.get('Channels')),
3075 '_download_params': {
3076 'stream_type': stream_type,
3077 'duration': duration,
3078 'timescale': stream_timescale,
3079 'width': width or 0,
3080 'height': height or 0,
3081 'fourcc': fourcc,
3082 'language': stream_language,
3083 'codec_private_data': track.get('CodecPrivateData'),
3084 'sampling_rate': sampling_rate,
3085 'channels': int_or_none(track.get('Channels', 2)),
3086 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3087 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3088 },
3089 })
3090 return formats, subtitles
3091
3092 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3093 def absolute_url(item_url):
3094 return urljoin(base_url, item_url)
3095
3096 def parse_content_type(content_type):
3097 if not content_type:
3098 return {}
3099 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3100 if ctr:
3101 mimetype, codecs = ctr.groups()
3102 f = parse_codecs(codecs)
3103 f['ext'] = mimetype2ext(mimetype)
3104 return f
3105 return {}
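# e.g. parse_content_type('video/mp4; codecs="avc1.64001F, mp4a.40.2"')
# returns roughly {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2',
# 'ext': 'mp4'} (parse_codecs may add further keys)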
3106
3107 def _media_formats(src, cur_media_type, type_info=None):
3108 type_info = type_info or {}
3109 full_url = absolute_url(src)
3110 ext = type_info.get('ext') or determine_ext(full_url)
3111 if ext == 'm3u8':
3112 is_plain_url = False
3113 formats = self._extract_m3u8_formats(
3114 full_url, video_id, ext='mp4',
3115 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3116 preference=preference, quality=quality, fatal=False)
3117 elif ext == 'mpd':
3118 is_plain_url = False
3119 formats = self._extract_mpd_formats(
3120 full_url, video_id, mpd_id=mpd_id, fatal=False)
3121 else:
3122 is_plain_url = True
3123 formats = [{
3124 'url': full_url,
3125 'vcodec': 'none' if cur_media_type == 'audio' else None,
3126 'ext': ext,
3127 }]
3128 return is_plain_url, formats
3129
3130 entries = []
3131 # amp-video and amp-audio are very similar to their HTML5 counterparts
3132 # so we will include them right here (see
3133 # https://www.ampproject.org/docs/reference/components/amp-video)
3134 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3135 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3136 media_tags = [(media_tag, media_tag_name, media_type, '')
3137 for media_tag, media_tag_name, media_type
3138 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3139 media_tags.extend(re.findall(
3140 # We only allow video|audio followed by a whitespace or '>'.
3141 # Allowing more characters may result in a significant slowdown (see
3142 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3143 # e.g. http://www.porntrex.com/maps/videositemap.xml).
3144 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
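# e.g. this picks up both self-closing tags like <amp-video src="..."/>
# and paired ones like <video controls><source src="..."></video>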
3145 for media_tag, _, media_type, media_content in media_tags:
3146 media_info = {
3147 'formats': [],
3148 'subtitles': {},
3149 }
3150 media_attributes = extract_attributes(media_tag)
3151 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3152 if src:
3153 f = parse_content_type(media_attributes.get('type'))
3154 _, formats = _media_formats(src, media_type, f)
3155 media_info['formats'].extend(formats)
3156 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3157 if media_content:
3158 for source_tag in re.findall(r'<source[^>]+>', media_content):
3159 s_attr = extract_attributes(source_tag)
3160 # data-video-src and data-src are non-standard but seen
3161 # several times in the wild
3162 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3163 if not src:
3164 continue
3165 f = parse_content_type(s_attr.get('type'))
3166 is_plain_url, formats = _media_formats(src, media_type, f)
3167 if is_plain_url:
3168 # width, height, res, label and title attributes are
3169 # all non-standard but seen several times in the wild
3170 labels = [
3171 s_attr.get(lbl)
3172 for lbl in ('label', 'title')
3173 if str_or_none(s_attr.get(lbl))
3174 ]
3175 width = int_or_none(s_attr.get('width'))
3176 height = (int_or_none(s_attr.get('height'))
3177 or int_or_none(s_attr.get('res')))
3178 if not width or not height:
3179 for lbl in labels:
3180 resolution = parse_resolution(lbl)
3181 if not resolution:
3182 continue
3183 width = width or resolution.get('width')
3184 height = height or resolution.get('height')
3185 for lbl in labels:
3186 tbr = parse_bitrate(lbl)
3187 if tbr:
3188 break
3189 else:
3190 tbr = None
3191 f.update({
3192 'width': width,
3193 'height': height,
3194 'tbr': tbr,
3195 'format_id': s_attr.get('label') or s_attr.get('title'),
3196 })
3197 f.update(formats[0])
3198 media_info['formats'].append(f)
3199 else:
3200 media_info['formats'].extend(formats)
3201 for track_tag in re.findall(r'<track[^>]+>', media_content):
3202 track_attributes = extract_attributes(track_tag)
3203 kind = track_attributes.get('kind')
3204 if not kind or kind in ('subtitles', 'captions'):
3205 src = strip_or_none(track_attributes.get('src'))
3206 if not src:
3207 continue
3208 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3209 media_info['subtitles'].setdefault(lang, []).append({
3210 'url': absolute_url(src),
3211 })
3212 for f in media_info['formats']:
3213 f.setdefault('http_headers', {})['Referer'] = base_url
3214 if media_info['formats'] or media_info['subtitles']:
3215 entries.append(media_info)
3216 return entries
3217
3218 def _extract_akamai_formats(self, *args, **kwargs):
3219 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3220 if subs:
3221 self._report_ignoring_subs('akamai')
3222 return fmts
3223
3224 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3225 signed = 'hdnea=' in manifest_url
3226 if not signed:
3227 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3228 manifest_url = re.sub(
3229 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3230 '', manifest_url).strip('?')
3231
3232 formats = []
3233 subtitles = {}
3234
3235 hdcore_sign = 'hdcore=3.7.0'
3236 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3237 hds_host = hosts.get('hds')
3238 if hds_host:
3239 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3240 if 'hdcore=' not in f4m_url:
3241 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3242 f4m_formats = self._extract_f4m_formats(
3243 f4m_url, video_id, f4m_id='hds', fatal=False)
3244 for entry in f4m_formats:
3245 entry.update({'extra_param_to_segment_url': hdcore_sign})
3246 formats.extend(f4m_formats)
3247
3248 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3249 hls_host = hosts.get('hls')
3250 if hls_host:
3251 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3252 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3253 m3u8_url, video_id, 'mp4', 'm3u8_native',
3254 m3u8_id='hls', fatal=False)
3255 formats.extend(m3u8_formats)
3256 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3257
3258 http_host = hosts.get('http')
3259 if http_host and m3u8_formats and not signed:
3260 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3261 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
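# e.g. for a hypothetical m3u8_url of
# 'https://host/i/dir/video_,300,600,.mp4.csmil/master.m3u8' this
# yields qualities ['300', '600'], and each HLS format below is
# remapped to a progressive URL like
# 'https://<http_host>/dir/video_300.mp4'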
3262 qualities_length = len(qualities)
3263 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3264 i = 0
3265 for f in m3u8_formats:
3266 if f['vcodec'] != 'none':
3267 for protocol in ('http', 'https'):
3268 http_f = f.copy()
3269 del http_f['manifest_url']
3270 http_url = re.sub(
3271 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3272 http_f.update({
3273 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3274 'url': http_url,
3275 'protocol': protocol,
3276 })
3277 formats.append(http_f)
3278 i += 1
3279
3280 return formats, subtitles
3281
3282 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3283 query = urllib.parse.urlparse(url).query
3284 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3285 mobj = re.search(
3286 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3287 url_base = mobj.group('url')
3288 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
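# e.g. a hypothetical 'https://host/vod/mp4:video.mp4/playlist.m3u8'
# is reduced to url_base '//host/vod/mp4:video.mp4' and
# http_base_url 'https://host/vod/mp4:video.mp4', from which the
# per-protocol manifest URLs below are rebuilt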
3289 formats = []
3290
3291 def manifest_url(manifest):
3292 m_url = f'{http_base_url}/{manifest}'
3293 if query:
3294 m_url += '?%s' % query
3295 return m_url
3296
3297 if 'm3u8' not in skip_protocols:
3298 formats.extend(self._extract_m3u8_formats(
3299 manifest_url('playlist.m3u8'), video_id, 'mp4',
3300 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3301 if 'f4m' not in skip_protocols:
3302 formats.extend(self._extract_f4m_formats(
3303 manifest_url('manifest.f4m'),
3304 video_id, f4m_id='hds', fatal=False))
3305 if 'dash' not in skip_protocols:
3306 formats.extend(self._extract_mpd_formats(
3307 manifest_url('manifest.mpd'),
3308 video_id, mpd_id='dash', fatal=False))
3309 if re.search(r'(?:/smil:|\.smil)', url_base):
3310 if 'smil' not in skip_protocols:
3311 rtmp_formats = self._extract_smil_formats(
3312 manifest_url('jwplayer.smil'),
3313 video_id, fatal=False)
3314 for rtmp_format in rtmp_formats:
3315 rtsp_format = rtmp_format.copy()
3316 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3317 del rtsp_format['play_path']
3318 del rtsp_format['ext']
3319 rtsp_format.update({
3320 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3321 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3322 'protocol': 'rtsp',
3323 })
3324 formats.extend([rtmp_format, rtsp_format])
3325 else:
3326 for protocol in ('rtmp', 'rtsp'):
3327 if protocol not in skip_protocols:
3328 formats.append({
3329 'url': f'{protocol}:{url_base}',
3330 'format_id': protocol,
3331 'protocol': protocol,
3332 })
3333 return formats
3334
3335 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3336 mobj = re.search(
3337 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3338 webpage)
3339 if mobj:
3340 try:
3341 jwplayer_data = self._parse_json(mobj.group('options'),
3342 video_id=video_id,
3343 transform_source=transform_source)
3344 except ExtractorError:
3345 pass
3346 else:
3347 if isinstance(jwplayer_data, dict):
3348 return jwplayer_data
3349
3350 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3351 jwplayer_data = self._find_jwplayer_data(
3352 webpage, video_id, transform_source=js_to_json)
3353 return self._parse_jwplayer_data(
3354 jwplayer_data, video_id, *args, **kwargs)
3355
3356 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3357 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3358 entries = []
3359 if not isinstance(jwplayer_data, dict):
3360 return entries
3361
3362 playlist_items = jwplayer_data.get('playlist')
3363 # JWPlayer backward compatibility: single playlist item/flattened playlists
3364 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3365 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3366 if not isinstance(playlist_items, list):
3367 playlist_items = (playlist_items or jwplayer_data, )
3368
3369 for video_data in playlist_items:
3370 if not isinstance(video_data, dict):
3371 continue
3372 # JWPlayer backward compatibility: flattened sources
3373 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3374 if 'sources' not in video_data:
3375 video_data['sources'] = [video_data]
3376
3377 this_video_id = video_id or video_data['mediaid']
3378
3379 formats = self._parse_jwplayer_formats(
3380 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3381 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3382
3383 subtitles = {}
3384 tracks = video_data.get('tracks')
3385 if tracks and isinstance(tracks, list):
3386 for track in tracks:
3387 if not isinstance(track, dict):
3388 continue
3389 track_kind = track.get('kind')
3390 if not track_kind or not isinstance(track_kind, str):
3391 continue
3392 if track_kind.lower() not in ('captions', 'subtitles'):
3393 continue
3394 track_url = urljoin(base_url, track.get('file'))
3395 if not track_url:
3396 continue
3397 subtitles.setdefault(track.get('label') or 'en', []).append({
3398 'url': self._proto_relative_url(track_url)
3399 })
3400
3401 entry = {
3402 'id': this_video_id,
3403 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3404 'description': clean_html(video_data.get('description')),
3405 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3406 'timestamp': int_or_none(video_data.get('pubdate')),
3407 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3408 'subtitles': subtitles,
3409 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3410 'genre': clean_html(video_data.get('genre')),
3411 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3412 'season_number': int_or_none(video_data.get('season')),
3413 'episode_number': int_or_none(video_data.get('episode')),
3414 'release_year': int_or_none(video_data.get('releasedate')),
3415 'age_limit': int_or_none(video_data.get('age_restriction')),
3416 }
3417 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3418 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3419 entry.update({
3420 '_type': 'url_transparent',
3421 'url': formats[0]['url'],
3422 })
3423 else:
3424 entry['formats'] = formats
3425 entries.append(entry)
3426 if len(entries) == 1:
3427 return entries[0]
3428 else:
3429 return self.playlist_result(entries)
3430
3431 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3432 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3433 urls = set()
3434 formats = []
3435 for source in jwplayer_sources_data:
3436 if not isinstance(source, dict):
3437 continue
3438 source_url = urljoin(
3439 base_url, self._proto_relative_url(source.get('file')))
3440 if not source_url or source_url in urls:
3441 continue
3442 urls.add(source_url)
3443 source_type = source.get('type') or ''
3444 ext = mimetype2ext(source_type) or determine_ext(source_url)
3445 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3446 formats.extend(self._extract_m3u8_formats(
3447 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3448 m3u8_id=m3u8_id, fatal=False))
3449 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3450 formats.extend(self._extract_mpd_formats(
3451 source_url, video_id, mpd_id=mpd_id, fatal=False))
3452 elif ext == 'smil':
3453 formats.extend(self._extract_smil_formats(
3454 source_url, video_id, fatal=False))
3455 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3456 elif source_type.startswith('audio') or ext in (
3457 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3458 formats.append({
3459 'url': source_url,
3460 'vcodec': 'none',
3461 'ext': ext,
3462 })
3463 else:
3464 format_id = str_or_none(source.get('label'))
3465 height = int_or_none(source.get('height'))
3466 if height is None and format_id:
3467 # Often no height is provided but there is a label in
3468 # a format like "1080p", "720p SD", or 1080.
3469 height = parse_resolution(format_id).get('height')
3470 a_format = {
3471 'url': source_url,
3472 'width': int_or_none(source.get('width')),
3473 'height': height,
3474 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3475 'filesize': int_or_none(source.get('filesize')),
3476 'ext': ext,
3477 'format_id': format_id
3478 }
3479 if source_url.startswith('rtmp'):
3480 a_format['ext'] = 'flv'
3481 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3482 # of jwplayer.flash.swf
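# e.g. a hypothetical 'rtmp://host/app/mp4:videos/clip.mp4' is split
# into url 'rtmp://host/app/' and play_path 'mp4:videos/clip.mp4'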
3483 rtmp_url_parts = re.split(
3484 r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3485 if len(rtmp_url_parts) == 3:
3486 rtmp_url, prefix, play_path = rtmp_url_parts
3487 a_format.update({
3488 'url': rtmp_url,
3489 'play_path': prefix + play_path,
3490 })
3491 if rtmp_params:
3492 a_format.update(rtmp_params)
3493 formats.append(a_format)
3494 return formats
3495
3496 def _live_title(self, name):
3497 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3498 return name
3499
3500 def _int(self, v, name, fatal=False, **kwargs):
3501 res = int_or_none(v, **kwargs)
3502 if res is None:
3503 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3504 if fatal:
3505 raise ExtractorError(msg)
3506 else:
3507 self.report_warning(msg)
3508 return res
3509
3510 def _float(self, v, name, fatal=False, **kwargs):
3511 res = float_or_none(v, **kwargs)
3512 if res is None:
3513 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3514 if fatal:
3515 raise ExtractorError(msg)
3516 else:
3517 self.report_warning(msg)
3518 return res
3519
3520 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3521 path='/', secure=False, discard=False, rest={}, **kwargs):
3522 cookie = http.cookiejar.Cookie(
3523 0, name, value, port, port is not None, domain, True,
3524 domain.startswith('.'), path, True, secure, expire_time,
3525 discard, None, None, rest)
3526 self.cookiejar.set_cookie(cookie)
3527
3528 def _get_cookies(self, url):
3529 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3530 return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3531
3532 def _apply_first_set_cookie_header(self, url_handle, cookie):
3533 """
3534 Apply first Set-Cookie header instead of the last. Experimental.
3535
3536 Some sites (e.g. [1-3]) may serve two cookies under the same name
3537 in the Set-Cookie header and expect the first (old) one to be set
3538 rather than the second (new) one. However, per RFC 6265, the newer
3539 cookie should be set into the cookie store, which is what actually
3540 happens. We work around this issue by manually resetting the cookie
3541 to the first one.
3542 1. https://new.vk.com/
3543 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3544 3. https://learning.oreilly.com/
3545 """
3546 for header, cookies in url_handle.headers.items():
3547 if header.lower() != 'set-cookie':
3548 continue
3549 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3550 cookie_value = re.search(
3551 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3552 if cookie_value:
3553 value, domain = cookie_value.groups()
3554 self._set_cookie(domain, cookie, value)
3555 break
3556
3557 @classmethod
3558 def get_testcases(cls, include_onlymatching=False):
3559 # Do not look in super classes
3560 t = vars(cls).get('_TEST')
3561 if t:
3562 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3563 tests = [t]
3564 else:
3565 tests = vars(cls).get('_TESTS', [])
3566 for t in tests:
3567 if not include_onlymatching and t.get('only_matching', False):
3568 continue
3569 t['name'] = cls.ie_key()
3570 yield t
3571 if getattr(cls, '__wrapped__', None):
3572 yield from cls.__wrapped__.get_testcases(include_onlymatching)
3573
3574 @classmethod
3575 def get_webpage_testcases(cls):
3576 tests = vars(cls).get('_WEBPAGE_TESTS', [])
3577 for t in tests:
3578 t['name'] = cls.ie_key()
3579 yield t
3580 if getattr(cls, '__wrapped__', None):
3581 yield from cls.__wrapped__.get_webpage_testcases()
3582
3583 @classproperty(cache=True)
3584 def age_limit(cls):
3585 """Get age limit from the testcases"""
3586 return max(traverse_obj(
3587 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3588 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3589
3590 @classproperty(cache=True)
3591 def _RETURN_TYPE(cls):
3592 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3593 tests = tuple(cls.get_testcases(include_onlymatching=False))
3594 if not tests:
3595 return None
3596 elif not any(k.startswith('playlist') for test in tests for k in test):
3597 return 'video'
3598 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3599 return 'playlist'
3600 return 'any'
3601
3602 @classmethod
3603 def is_single_video(cls, url):
3604 """Returns whether the URL is of a single video, None if unknown"""
3605 if cls.suitable(url):
3606 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3607
3608 @classmethod
3609 def is_suitable(cls, age_limit):
3610 """Test whether the extractor is generally suitable for the given age limit"""
3611 return not age_restricted(cls.age_limit, age_limit)
3612
3613 @classmethod
3614 def description(cls, *, markdown=True, search_examples=None):
3615 """Description of the extractor"""
3616 desc = ''
3617 if cls._NETRC_MACHINE:
3618 if markdown:
3619 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3620 else:
3621 desc += f' [{cls._NETRC_MACHINE}]'
3622 if cls.IE_DESC is False:
3623 desc += ' [HIDDEN]'
3624 elif cls.IE_DESC:
3625 desc += f' {cls.IE_DESC}'
3626 if cls.SEARCH_KEY:
3627 desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3628 if search_examples:
3629 _COUNTS = ('', '5', '10', 'all')
3630 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3631 if not cls.working():
3632 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3633
3634 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3635 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3636 return f'{name}:{desc}' if desc else name
3637
3638 def extract_subtitles(self, *args, **kwargs):
3639 if (self.get_param('writesubtitles', False)
3640 or self.get_param('listsubtitles')):
3641 return self._get_subtitles(*args, **kwargs)
3642 return {}
3643
3644 def _get_subtitles(self, *args, **kwargs):
3645 raise NotImplementedError('This method must be implemented by subclasses')
3646
3647 class CommentsDisabled(Exception):
3648 """Raise in _get_comments if comments are disabled for the video"""
3649
3650 def extract_comments(self, *args, **kwargs):
3651 if not self.get_param('getcomments'):
3652 return None
3653 generator = self._get_comments(*args, **kwargs)
3654
3655 def extractor():
3656 comments = []
3657 interrupted = True
3658 try:
3659 while True:
3660 comments.append(next(generator))
3661 except StopIteration:
3662 interrupted = False
3663 except KeyboardInterrupt:
3664 self.to_screen('Interrupted by user')
3665 except self.CommentsDisabled:
3666 return {'comments': None, 'comment_count': None}
3667 except Exception as e:
3668 if self.get_param('ignoreerrors') is not True:
3669 raise
3670 self._downloader.report_error(e)
3671 comment_count = len(comments)
3672 self.to_screen(f'Extracted {comment_count} comments')
3673 return {
3674 'comments': comments,
3675 'comment_count': None if interrupted else comment_count
3676 }
3677 return extractor
3678
3679 def _get_comments(self, *args, **kwargs):
3680 raise NotImplementedError('This method must be implemented by subclasses')
3681
3682 @staticmethod
3683 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3684 """ Merge subtitle items for one language. Items with duplicated URLs/data
3685 will be dropped. """
3686 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3687 ret = list(subtitle_list1)
3688 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3689 return ret
3690
3691 @classmethod
3692 def _merge_subtitles(cls, *dicts, target=None):
3693 """ Merge subtitle dictionaries, language by language. """
3694 if target is None:
3695 target = {}
3696 for d in dicts:
3697 for lang, subs in d.items():
3698 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3699 return target
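# Schematically, _merge_subtitles({'en': [a]}, {'en': [b], 'fr': [c]})
# returns {'en': [a, b], 'fr': [c]}, with b dropped if it carries the
# same URL/data as a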
3700
3701 def extract_automatic_captions(self, *args, **kwargs):
3702 if (self.get_param('writeautomaticsub', False)
3703 or self.get_param('listsubtitles')):
3704 return self._get_automatic_captions(*args, **kwargs)
3705 return {}
3706
3707 def _get_automatic_captions(self, *args, **kwargs):
3708 raise NotImplementedError('This method must be implemented by subclasses')
3709
3710 @functools.cached_property
3711 def _cookies_passed(self):
3712 """Whether cookies have been passed to YoutubeDL"""
3713 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3714
3715 def mark_watched(self, *args, **kwargs):
3716 if not self.get_param('mark_watched', False):
3717 return
3718 if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed:
3719 self._mark_watched(*args, **kwargs)
3720
3721 def _mark_watched(self, *args, **kwargs):
3722 raise NotImplementedError('This method must be implemented by subclasses')
3723
3724 def geo_verification_headers(self):
3725 headers = {}
3726 geo_verification_proxy = self.get_param('geo_verification_proxy')
3727 if geo_verification_proxy:
3728 headers['Ytdl-request-proxy'] = geo_verification_proxy
3729 return headers
3730
3731 @staticmethod
3732 def _generic_id(url):
3733 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3734
3735 def _generic_title(self, url='', webpage='', *, default=None):
3736 return (self._og_search_title(webpage, default=None)
3737 or self._html_extract_title(webpage, default=None)
3738 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3739 or default)
3740
3741 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3742 if not duration:
3743 return
3744 chapter_list = [{
3745 'start_time': start_function(chapter),
3746 'title': title_function(chapter),
3747 } for chapter in chapter_list or []]
3748 if strict:
3749 warn = self.report_warning
3750 else:
3751 warn = self.write_debug
3752 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3753
3754 chapters = [{'start_time': 0}]
3755 for idx, chapter in enumerate(chapter_list):
3756 if chapter['start_time'] is None:
3757 warn(f'Incomplete chapter {idx}')
3758 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3759 chapters.append(chapter)
3760 elif chapter not in chapters:
3761 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3762 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3763 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3764 return chapters[1:]
3765
3766 def _extract_chapters_from_description(self, description, duration):
3767 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3768 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3769 return self._extract_chapters_helper(
3770 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3771 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3772 duration=duration, strict=False) or self._extract_chapters_helper(
3773 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3774 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3775 duration=duration, strict=False)
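# e.g. a description containing the lines '00:00 Intro' and
# '01:23 Main topic' produces (given a sufficient total duration)
# chapters starting at 0s and 83s titled 'Intro' and 'Main topic'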
3776
3777 @staticmethod
3778 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3779 all_known = all(map(
3780 lambda x: x is not None,
3781 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3782 return (
3783 'private' if is_private
3784 else 'premium_only' if needs_premium
3785 else 'subscriber_only' if needs_subscription
3786 else 'needs_auth' if needs_auth
3787 else 'unlisted' if is_unlisted
3788 else 'public' if all_known
3789 else None)
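# e.g. _availability(is_unlisted=True) -> 'unlisted', while
# _availability() -> None (nothing known) and
# _availability(False, False, False, False, False) -> 'public'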
3790
3791 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3792 '''
3793 @returns A list of values for the extractor argument given by "key"
3794 or "default" if no such key is present
3795 @param default The default value to return when the key is not present (default: [])
3796 @param casesense When false, the values are converted to lower case
3797 '''
3798 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3799 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3800 if val is None:
3801 return [] if default is NO_DEFAULT else default
3802 return list(val) if casesense else [x.lower() for x in val]
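# e.g. with '--extractor-args "youtube:player_client=android"' passed
# on the command line, self._configuration_arg('player_client') called
# from a youtube extractor returns ['android']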
3803
3804 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3805 if not playlist_id or not video_id:
3806 return not video_id
3807
3808 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3809 if no_playlist is not None:
3810 return not no_playlist
3811
3812 video_id = '' if video_id is True else f' {video_id}'
3813 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3814 if self.get_param('noplaylist'):
3815 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3816 return False
3817 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3818 return True
3819
3820 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3821 RetryManager.report_retry(
3822 err, _count or int(fatal), _retries,
3823 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3824 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3825
3826 def RetryManager(self, **kwargs):
3827 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3828
3829 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3830 display_id = traverse_obj(info_dict, 'display_id', 'id')
3831 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3832 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3833 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3834
3835 @classmethod
3836 def extract_from_webpage(cls, ydl, url, webpage):
3837 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3838 else ydl.get_info_extractor(cls.ie_key()))
3839 for info in ie._extract_from_webpage(url, webpage) or []:
3840 # url = None since we do not want to set (webpage/original)_url
3841 ydl.add_default_extra_info(info, ie, None)
3842 yield info
3843
3844 @classmethod
3845 def _extract_from_webpage(cls, url, webpage):
3846 for embed_url in orderedSet(
3847 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3848 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3849
3850 @classmethod
3851 def _extract_embed_urls(cls, url, webpage):
3852 """@returns all the embed urls on the webpage"""
3853 if '_EMBED_URL_RE' not in cls.__dict__:
3854 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3855 for idx, regex in enumerate(cls._EMBED_REGEX):
3856 assert regex.count('(?P<url>') == 1, \
3857 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3858 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3859
3860 for regex in cls._EMBED_URL_RE:
3861 for mobj in regex.finditer(webpage):
3862 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3863 if cls._VALID_URL is False or cls.suitable(embed_url):
3864 yield embed_url
3865
3866 class StopExtraction(Exception):
3867 pass
3868
3869 @classmethod
3870 def _extract_url(cls, webpage): # TODO: Remove
3871 """Only for compatibility with some older extractors"""
3872 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3873
3874 @classmethod
3875 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3876 if plugin_name:
3877 mro = inspect.getmro(cls)
3878 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3879 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3880 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3881 while getattr(super_class, '__wrapped__', None):
3882 super_class = super_class.__wrapped__
3883 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3884 _PLUGIN_OVERRIDES[super_class].append(cls)
3885
3886 return super().__init_subclass__(**kwargs)
3887
3888
3889 class SearchInfoExtractor(InfoExtractor):
3890 """
3891 Base class for paged search queries extractors.
3892 They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
3893 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3894 """
3895
3896 _MAX_RESULTS = float('inf')
3897 _RETURN_TYPE = 'playlist'
3898
3899 @classproperty
3900 def _VALID_URL(cls):
3901 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
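# e.g. with a hypothetical _SEARCH_KEY of 'examplesearch', this matches
# 'examplesearch:cats' (1 result), 'examplesearch5:cats' (5 results)
# and 'examplesearchall:cats' (all results)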
3902
3903 def _real_extract(self, query):
3904 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3905 if prefix == '':
3906 return self._get_n_results(query, 1)
3907 elif prefix == 'all':
3908 return self._get_n_results(query, self._MAX_RESULTS)
3909 else:
3910 n = int(prefix)
3911 if n <= 0:
3912 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3913 elif n > self._MAX_RESULTS:
3914 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3915 n = self._MAX_RESULTS
3916 return self._get_n_results(query, n)
3917
3918 def _get_n_results(self, query, n):
3919 """Get a specified number of results for a query.
3920 Either this function or _search_results must be overridden by subclasses """
3921 return self.playlist_result(
3922 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3923 query, query)
3924
3925 def _search_results(self, query):
3926 """Returns an iterator of search results"""
3927 raise NotImplementedError('This method must be implemented by subclasses')
3928
3929 @classproperty
3930 def SEARCH_KEY(cls):
3931 return cls._SEARCH_KEY
3932
3933
3934 class UnsupportedURLIE(InfoExtractor):
3935 _VALID_URL = '.*'
3936 _ENABLED = False
3937 IE_DESC = False
3938
3939 def _real_extract(self, url):
3940 raise UnsupportedError(url)
3941
3942
3943 _PLUGIN_OVERRIDES = collections.defaultdict(list)