import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import subprocess
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_os_name,
    urllib_req_to_req,
)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..downloader.hls import HlsFD
from ..networking import HEADRequest, Request
from ..networking.exceptions import (
    HTTPError,
    IncompleteRead,
    network_exceptions,
)
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    LenientJSONDecoder,
    Popen,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    netrc_from_content,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
                                       is parsed from a string (in case of
                                       fragmented media)
                                   for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
                                 "SDR" (None), "HDR10", "HDR10+", "HDR12", "HLG", "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a URL
                                 or a path. If a URL is present, it should be
                                 considered by a client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    True if the format has DRM and cannot be downloaded.
                                 'maybe' if the format may have DRM and has to be tested before download.
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. Only applied by the native HLS/DASH downloaders.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args  Extra arguments for ffmpeg downloader
                    * is_dash_periods  Whether the format is a result of merging
                                 multiple DASH periods.
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    release_year:   Year (YYYY) as integer when the video or album was released.
                    To be used if no exact release date is known.
                    If not explicitly set, calculated from release_date.
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    channel_is_verified: Whether the channel is verified on the platform.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users; the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of
                    "text" or "html" must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "author_url" - The url to the comment author's page
                        * "author_is_verified" - Whether the author is verified
                          on the platform
                        * "author_is_uploader" - Whether the comment is made by
                          the video uploader
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                          Set to "root" to indicate that this is a
                          comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                          favorite by the video uploader
                        * "is_pinned" - Whether the comment is pinned to
                          the top of the comments
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp, it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    heatmap:        A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the data point in seconds
                        * "end_time" - The end time of the data point in seconds
                        * "value" - The normalized value of the data point (float between 0 and 1)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    media_type:     The type of media as classified by the site, e.g. "episode", "clip", "trailer"
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc.).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
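
    For illustration only (this dict is a sketch, not the output of any
    particular extractor), a minimal "video" result could look like:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/video.mp4',
            'ext': 'mp4',
        }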


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
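
    For illustration, a minimal "playlist" result could look like (the
    values are made up):

        {
            '_type': 'playlist',
            'id': 'album-4321',
            'title': 'Some album',
            'entries': [video_result, another_video_result],
        }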


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
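
    For illustration, a "url" result for a hypothetical site could look like:

        {
            '_type': 'url',
            'url': 'https://example.com/watch/4234987',
            'ie_key': 'Example',
            'title': 'Dancing naked mole rats',
        }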


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this should also be added to the list of extractors and
    should define _VALID_URL as a regexp or a Sequence of regexps, and
    re-define the _real_extract() and (optionally) _real_initialize() methods.
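
    A minimal subclass could look like the following sketch (the site, the
    URL pattern and the helper choices are illustrative, not a real extractor):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }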

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.
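
    For illustration, assuming _NETRC_MACHINE = 'example', a matching
    ~/.netrc entry would look like:

        machine example login myuser password mypass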

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, provided the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by the geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.
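
    For example (illustrative values):

        _GEO_COUNTRIES = ['US', 'GB']
        _GEO_IP_BLOCKS = ['203.0.113.0/24']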

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
        return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize the geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as the X-Forwarded-For
        HTTP header in all subsequent HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from the extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in the geo bypass context passed as the first
        argument. It may contain the following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
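
        Example (hypothetical values):

            self._initialize_geo_bypass({
                'countries': ['DE', 'FR'],
                'ip_blocks': ['203.0.113.0/24'],
            })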
        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.status) is True
        else:
            return err.status in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            self._downloader.deprecation_warning(
                'Passing a urllib.request.Request to _create_request() is deprecated. '
                'Use yt_dlp.networking.common.Request instead.')
            url_or_request = urllib_req_to_req(url_or_request)
        elif not isinstance(url_or_request, Request):
            url_or_request = Request(url_or_request)

        url_or_request.update(data=data, headers=headers, query=query)
        return url_or_request

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    return err.response

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non-2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx),
            which are always accepted.
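
        For illustration, accepting 404 responses while still failing on
        other HTTP errors could look like:

            page, urlh = self._download_webpage_handle(
                url, video_id, expected_status=404)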
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, str):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def __check_blocked(self, content):
        first_block = content[:512]
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

    def _request_dump_filename(self, url, video_id):
        basen = f'{video_id}_{url}'
        trim_length = self.get_param('trim_file_name') or 240
        if len(basen) > trim_length:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        filename = sanitize_filename(f'{basen}.dump', restricted=True)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = fR'\\?\{absfilepath}'
        return filename

    def __decode_webpage(self, webpage_bytes, encoding, headers):
        if not encoding:
            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            return webpage_bytes.decode('utf-8', 'replace')

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if self.get_param('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self.get_param('write_pages'):
            filename = self._request_dump_filename(urlh.url, video_id)
            self.to_screen(f'Saving request to {filename}')
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
        self.__check_blocked(content)

        return content

    def __print_error(self, errnote, fatal, video_id, err):
        if fatal:
            raise ExtractorError(f'{video_id}: {errnote}', cause=err)
        elif errnote:
            self.report_warning(f'{video_id}: {errnote}: {err}')

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
        if transform_source:
            xml_string = transform_source(xml_string)
        try:
            return compat_etree_fromstring(xml_string.encode('utf-8'))
        except xml.etree.ElementTree.ParseError as ve:
            self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
        try:
            return json.loads(
                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)

    def _parse_socket_response_as_json(self, data, *args, **kwargs):
        return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)

    def __create_download_methods(name, parser, note, errnote, return_value):

        def parse(ie, content, *args, errnote=errnote, **kwargs):
            if parser is None:
                return content
            if errnote is False:
                kwargs['errnote'] = errnote
            # parser is fetched by name so subclasses can override it
            return getattr(ie, parser)(content, *args, **kwargs)

        def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                            fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            res = self._download_webpage_handle(
                url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
                data=data, headers=headers, query=query, expected_status=expected_status)
            if res is False:
                return res
            content, urlh = res
            return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh

        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
                             fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
            if self.get_param('load_pages'):
                url_or_request = self._create_request(url_or_request, data, headers, query)
                filename = self._request_dump_filename(url_or_request.url, video_id)
                self.to_screen(f'Loading request from {filename}')
                try:
                    with open(filename, 'rb') as dumpf:
                        webpage_bytes = dumpf.read()
                except OSError as e:
                    self.report_warning(f'Unable to load request from disk: {e}')
                else:
                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
                    return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
            kwargs = {
                'note': note,
                'errnote': errnote,
                'transform_source': transform_source,
                'fatal': fatal,
                'encoding': encoding,
                'data': data,
                'headers': headers,
                'query': query,
                'expected_status': expected_status,
            }
            if parser is None:
                kwargs.pop('transform_source')
            # The method is fetched by name so subclasses can override _download_..._handle
            res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
            return res if res is False else res[0]

        def impersonate(func, name, return_value):
            func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
            func.__doc__ = f'''
                @param transform_source     Apply this transformation before parsing
                @returns                    {return_value}

                See _download_webpage_handle docstring for other arguments specification
            '''

        impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
        impersonate(download_content, f'_download_{name}', f'{return_value}')
        return download_handle, download_content

    _download_xml_handle, _download_xml = __create_download_methods(
        'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
    _download_json_handle, _download_json = __create_download_methods(
        'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
    _download_socket_json_handle, _download_socket_json = __create_download_methods(
        'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
    __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
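
    # For illustration, the generated methods are used like this
    # (hypothetical URL and video_id):
    #   data = self._download_json(
    #       'https://example.com/api/video/123', '123',
    #       note='Downloading video metadata')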

    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
        """
        Return the data of the page as a string.

        Keyword arguments:
        tries -- number of tries
        timeout -- sleep interval between tries

        See _download_webpage_handle docstring for other arguments specification.
        """

        R''' # NB: These are unused; should they be deprecated?
        if tries != 1:
            self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
        if timeout is NO_DEFAULT:
            timeout = 5
        else:
            self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
        '''

        try_count = 0
        while True:
            try:
                return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
            except IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)

    def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
        idstr = format_field(video_id, None, '%s: ')
        msg = f'[{self.IE_NAME}] {idstr}{msg}'
        if only_once:
            if f'WARNING: {msg}' in self._printed_messages:
                return
            self._printed_messages.add(f'WARNING: {msg}')
        self._downloader.report_warning(msg, *args, **kwargs)

    def to_screen(self, msg, *args, **kwargs):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def write_debug(self, msg, *args, **kwargs):
        self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)

    def get_param(self, name, default=None, *args, **kwargs):
        if self._downloader:
            return self._downloader.params.get(name, default, *args, **kwargs)
        return default

    def report_drm(self, video_id, partial=NO_DEFAULT):
        if partial is not NO_DEFAULT:
            self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
        self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    def raise_login_required(
            self, msg='This video is only available for registered users',
            metadata_available=False, method=NO_DEFAULT):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
            return
        msg += format_field(self._login_hint(method), None, '. %s')
        raise ExtractorError(msg, expected=True)

    def raise_geo_restricted(
            self, msg='This video is not available from your location due to geo restriction',
            countries=None, metadata_available=False):
        if metadata_available and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

    def raise_no_formats(self, msg, expected=False, video_id=None):
        if expected and (
                self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
            self.report_warning(msg, video_id)
        elif isinstance(msg, ExtractorError):
            raise msg
        else:
            raise ExtractorError(msg, expected=expected, video_id=video_id)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
        """Returns a URL that points to a page that should be processed"""
        if ie is not None:
            kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
        if video_id is not None:
            kwargs['id'] = video_id
        if video_title is not None:
            kwargs['title'] = video_title
        return {
            **kwargs,
            '_type': 'url_transparent' if url_transparent else 'url',
            'url': url,
        }

    @classmethod
    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
        return cls.playlist_result(
            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
            playlist_id, playlist_title, **kwargs)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
        """Returns a playlist"""
        if playlist_id:
            kwargs['id'] = playlist_id
        if playlist_title:
            kwargs['title'] = playlist_title
        if playlist_description is not None:
            kwargs['description'] = playlist_description
        return {
            **kwargs,
            '_type': 'multi_video' if multi_video else 'playlist',
            'entries': entries,
        }

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure, return a default value, report a warning, or raise a
        RegexNotFoundError, depending on fatal, specifying the field name.
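
        For illustration (hypothetical pattern and webpage):

            title = self._search_regex(
                r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
                default=None)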
        """
        if string is None:
            mobj = None
        elif isinstance(pattern, (str, re.Pattern)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            elif isinstance(group, (list, tuple)):
                return tuple(mobj.group(g) for g in group)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                     contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
        """Searches string for the JSON object specified by start_pattern"""
        # NB: end_pattern is only used to reduce the size of the initial match
        if default is NO_DEFAULT:
            default, has_default = {}, False
        else:
            fatal, has_default = False, True

        json_string = self._search_regex(
            rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
            string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
        if not json_string:
            return default

        _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
        try:
            return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
        except ExtractorError as e:
            if fatal:
                raise ExtractorError(
                    f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
            elif not has_default:
                self.report_warning(
                    f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
        return default

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if isinstance(res, tuple):
            return tuple(map(clean_html, res))
        return clean_html(res)

    def _get_netrc_login_info(self, netrc_machine=None):
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        cmd = self.get_param('netrc_cmd')
        if cmd:
            cmd = cmd.replace('{}', netrc_machine)
            self.to_screen(f'Executing command: {cmd}')
            stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
            if ret != 0:
                raise OSError(f'Command returned error code {ret}')
            info = netrc_from_content(stdout).authenticators(netrc_machine)

        elif self.get_param('usenetrc', False):
            netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
            if os.path.isdir(netrc_file):
                netrc_file = os.path.join(netrc_file, '.netrc')
            info = netrc.netrc(netrc_file).authenticators(netrc_machine)

        else:
            return None, None
        if not info:
            raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
        return info[0], info[2]

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password).
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, try the netrc_cmd if it is defined, or look
        in the netrc file using the netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
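
        For illustration, a typical use inside an extractor:

            username, password = self._get_login_info()
            if not username:
                self.raise_login_required()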
        """

        username = self.get_param(username_option)
        if username is not None:
            password = self.get_param(password_option)
        else:
            try:
                username, password = self._get_netrc_login_info(netrc_machine)
            except (OSError, netrc.NetrcParseError) as err:
                self.report_warning(f'Failed to parse .netrc: {err}')
                return None, None
        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info.
        TODO - asking the user will be required for sms/phone verify;
        currently this just uses the command line option.
        If there's no info available, return None
        """
1361
1362 tfa = self.get_param('twofactor')
1363 if tfa is not None:
1364 return tfa
1365
1366 return getpass.getpass('Type %s and press [Return]: ' % note)
1367
1368 # Helper functions for extracting OpenGraph info
1369 @staticmethod
1370 def _og_regexes(prop):
1371 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1372 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1373 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1374 template = r'<meta[^>]+?%s[^>]+?%s'
1375 return [
1376 template % (property_re, content_re),
1377 template % (content_re, property_re),
1378 ]
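# For illustration (not part of the original source), _og_regexes('title')
# yields patterns that match either attribute order, e.g. both
#     <meta property="og:title" content="Some title">
#     <meta content="Some title" name="og:title">
# capturing the value of the content attribute in either case.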
1379
1380 @staticmethod
1381 def _meta_regex(prop):
1382 return r'''(?isx)<meta
1383 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1384 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1385
1386 def _og_search_property(self, prop, html, name=None, **kargs):
1387 prop = variadic(prop)
1388 if name is None:
1389 name = 'OpenGraph %s' % prop[0]
1390 og_regexes = []
1391 for p in prop:
1392 og_regexes.extend(self._og_regexes(p))
1393 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1394 if escaped is None:
1395 return None
1396 return unescapeHTML(escaped)
1397
1398 def _og_search_thumbnail(self, html, **kargs):
1399 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1400
1401 def _og_search_description(self, html, **kargs):
1402 return self._og_search_property('description', html, fatal=False, **kargs)
1403
1404 def _og_search_title(self, html, *, fatal=False, **kargs):
1405 return self._og_search_property('title', html, fatal=fatal, **kargs)
1406
1407 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1408 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1409 if secure:
1410 regexes = self._og_regexes('video:secure_url') + regexes
1411 return self._html_search_regex(regexes, html, name, **kargs)
1412
1413 def _og_search_url(self, html, **kargs):
1414 return self._og_search_property('url', html, **kargs)
1415
1416 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1417 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1418
1419 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1420 name = variadic(name)
1421 if display_name is None:
1422 display_name = name[0]
1423 return self._html_search_regex(
1424 [self._meta_regex(n) for n in name],
1425 html, display_name, fatal=fatal, group='content', **kwargs)
1426
1427 def _dc_search_uploader(self, html):
1428 return self._html_search_meta('dc.creator', html, 'uploader')
1429
1430 @staticmethod
1431 def _rta_search(html):
1432 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1433 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1434 r' content="RTA-5042-1996-1400-1577-RTA"',
1435 html):
1436 return 18
1437
1438 # And then there are the jokers who advertise that they use RTA, but actually don't.
1439 AGE_LIMIT_MARKERS = [
1440 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1441 r'>[^<]*you acknowledge you are at least (\d+) years old',
1442 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
1443 ]
1444
1445 age_limit = 0
1446 for marker in AGE_LIMIT_MARKERS:
1447 mobj = re.search(marker, html)
1448 if mobj:
1449 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1450 return age_limit
1451
1452 def _media_rating_search(self, html):
1453 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1454 rating = self._html_search_meta('rating', html)
1455
1456 if not rating:
1457 return None
1458
1459 RATING_TABLE = {
1460 'safe for kids': 0,
1461 'general': 8,
1462 '14 years': 14,
1463 'mature': 17,
1464 'restricted': 19,
1465 }
1466 return RATING_TABLE.get(rating.lower())
1467
1468 def _family_friendly_search(self, html):
1469 # See http://schema.org/VideoObject
1470 family_friendly = self._html_search_meta(
1471 'isFamilyFriendly', html, default=None)
1472
1473 if not family_friendly:
1474 return None
1475
1476 RATING_TABLE = {
1477 '1': 0,
1478 'true': 0,
1479 '0': 18,
1480 'false': 18,
1481 }
1482 return RATING_TABLE.get(family_friendly.lower())
1483
1484 def _twitter_search_player(self, html):
1485 return self._html_search_meta('twitter:player', html,
1486 'twitter card player')
1487
1488 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1489 """Yield all json ld objects in the html"""
1490 if default is not NO_DEFAULT:
1491 fatal = False
1492 for mobj in re.finditer(JSON_LD_RE, html):
1493 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1494 for json_ld in variadic(json_ld_item):
1495 if isinstance(json_ld, dict):
1496 yield json_ld
1497
1498 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1499 """Search for a video in any json ld in the html"""
1500 if default is not NO_DEFAULT:
1501 fatal = False
1502 info = self._json_ld(
1503 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1504 video_id, fatal=fatal, expected_type=expected_type)
1505 if info:
1506 return info
1507 if default is not NO_DEFAULT:
1508 return default
1509 elif fatal:
1510 raise RegexNotFoundError('Unable to extract JSON-LD')
1511 else:
1512 self.report_warning('Unable to extract JSON-LD %s' % bug_reports_message())
1513 return {}
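# Example (an illustrative sketch, not part of the original source):
# given a page containing
#     <script type="application/ld+json">
#     {"@context": "https://schema.org", "@type": "VideoObject",
#      "name": "Clip", "duration": "PT1M30S"}
#     </script>
# a call such as
#     info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
# would yield {'title': 'Clip', 'duration': 90.0, ...} via _json_ld() below.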
1514
1515 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1516 if isinstance(json_ld, str):
1517 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1518 if not json_ld:
1519 return {}
1520 info = {}
1521
1522 INTERACTION_TYPE_MAP = {
1523 'CommentAction': 'comment',
1524 'AgreeAction': 'like',
1525 'DisagreeAction': 'dislike',
1526 'LikeAction': 'like',
1527 'DislikeAction': 'dislike',
1528 'ListenAction': 'view',
1529 'WatchAction': 'view',
1530 'ViewAction': 'view',
1531 }
1532
1533 def is_type(e, *expected_types):
1534 type = variadic(traverse_obj(e, '@type'))
1535 return any(x in type for x in expected_types)
1536
1537 def extract_interaction_type(e):
1538 interaction_type = e.get('interactionType')
1539 if isinstance(interaction_type, dict):
1540 interaction_type = interaction_type.get('@type')
1541 return str_or_none(interaction_type)
1542
1543 def extract_interaction_statistic(e):
1544 interaction_statistic = e.get('interactionStatistic')
1545 if isinstance(interaction_statistic, dict):
1546 interaction_statistic = [interaction_statistic]
1547 if not isinstance(interaction_statistic, list):
1548 return
1549 for is_e in interaction_statistic:
1550 if not is_type(is_e, 'InteractionCounter'):
1551 continue
1552 interaction_type = extract_interaction_type(is_e)
1553 if not interaction_type:
1554 continue
1555 # Some sites provide the interaction count as a string with
1556 # non-digit characters (e.g. ",") instead of an integer (as per
1557 # spec), so extract the count with the more relaxed str_to_int
1558 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1559 if interaction_count is None:
1560 continue
1561 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1562 if not count_kind:
1563 continue
1564 count_key = '%s_count' % count_kind
1565 if info.get(count_key) is not None:
1566 continue
1567 info[count_key] = interaction_count
1568
1569 def extract_chapter_information(e):
1570 chapters = [{
1571 'title': part.get('name'),
1572 'start_time': part.get('startOffset'),
1573 'end_time': part.get('endOffset'),
1574 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
1575 for idx, (last_c, current_c, next_c) in enumerate(zip(
1576 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1577 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1578 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1579 if None in current_c.values():
1580 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1581 return
1582 if chapters:
1583 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1584 info['chapters'] = chapters
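# Illustrative example (not part of the original source): for hasPart
# clips [{'name': 'Intro', 'startOffset': 0}, {'name': 'Main', 'startOffset': 60}],
# the zip above fills each missing end_time from the next chapter's
# start_time (and the last chapter's end_time from info['duration']).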
1585
1586 def extract_video_object(e):
1587 author = e.get('author')
1588 info.update({
1589 'url': url_or_none(e.get('contentUrl')),
1590 'ext': mimetype2ext(e.get('encodingFormat')),
1591 'title': unescapeHTML(e.get('name')),
1592 'description': unescapeHTML(e.get('description')),
1593 'thumbnails': [{'url': unescapeHTML(url)}
1594 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1595 if url_or_none(url)],
1596 'duration': parse_duration(e.get('duration')),
1597 'timestamp': unified_timestamp(e.get('uploadDate')),
1598 # author can be an instance of the 'Organization' or 'Person' types;
1599 # both have a 'name' property (inherited from the 'Thing' type). [1]
1600 # However, some websites use a plain 'Text' value instead.
1601 # 1. https://schema.org/VideoObject
1602 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1603 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1604 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1605 'tbr': int_or_none(e.get('bitrate')),
1606 'width': int_or_none(e.get('width')),
1607 'height': int_or_none(e.get('height')),
1608 'view_count': int_or_none(e.get('interactionCount')),
1609 'tags': try_call(lambda: e.get('keywords').split(',')),
1610 })
1611 if is_type(e, 'AudioObject'):
1612 info.update({
1613 'vcodec': 'none',
1614 'abr': int_or_none(e.get('bitrate')),
1615 })
1616 extract_interaction_statistic(e)
1617 extract_chapter_information(e)
1618
1619 def traverse_json_ld(json_ld, at_top_level=True):
1620 for e in variadic(json_ld):
1621 if not isinstance(e, dict):
1622 continue
1623 if at_top_level and '@context' not in e:
1624 continue
1625 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1626 traverse_json_ld(e['@graph'], at_top_level=False)
1627 continue
1628 if expected_type is not None and not is_type(e, expected_type):
1629 continue
1630 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1631 if rating is not None:
1632 info['average_rating'] = rating
1633 if is_type(e, 'TVEpisode', 'Episode'):
1634 episode_name = unescapeHTML(e.get('name'))
1635 info.update({
1636 'episode': episode_name,
1637 'episode_number': int_or_none(e.get('episodeNumber')),
1638 'description': unescapeHTML(e.get('description')),
1639 })
1640 if not info.get('title') and episode_name:
1641 info['title'] = episode_name
1642 part_of_season = e.get('partOfSeason')
1643 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1644 info.update({
1645 'season': unescapeHTML(part_of_season.get('name')),
1646 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1647 })
1648 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1649 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1650 info['series'] = unescapeHTML(part_of_series.get('name'))
1651 elif is_type(e, 'Movie'):
1652 info.update({
1653 'title': unescapeHTML(e.get('name')),
1654 'description': unescapeHTML(e.get('description')),
1655 'duration': parse_duration(e.get('duration')),
1656 'timestamp': unified_timestamp(e.get('dateCreated')),
1657 })
1658 elif is_type(e, 'Article', 'NewsArticle'):
1659 info.update({
1660 'timestamp': parse_iso8601(e.get('datePublished')),
1661 'title': unescapeHTML(e.get('headline')),
1662 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1663 })
1664 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1665 extract_video_object(e['video'][0])
1666 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1667 extract_video_object(e['subjectOf'][0])
1668 elif is_type(e, 'VideoObject', 'AudioObject'):
1669 extract_video_object(e)
1670 if expected_type is None:
1671 continue
1672 else:
1673 break
1674 video = e.get('video')
1675 if is_type(video, 'VideoObject'):
1676 extract_video_object(video)
1677 if expected_type is None:
1678 continue
1679 else:
1680 break
1681
1682 traverse_json_ld(json_ld)
1683 return filter_dict(info)
1684
1685 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1686 return self._parse_json(
1687 self._search_regex(
1688 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1689 webpage, 'next.js data', fatal=fatal, **kw),
1690 video_id, transform_source=transform_source, fatal=fatal)
1691
1692 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1693 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1694 rectx = re.escape(context_name)
1695 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1696 js, arg_keys, arg_vals = self._search_regex(
1697 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1698 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1699 default=NO_DEFAULT if fatal else (None, None, None))
1700 if js is None:
1701 return {}
1702
1703 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1704 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1705
1706 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1707 return traverse_obj(ret, traverse) or {}
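# Example (an illustrative sketch, not part of the original source): for a
# page embedding
#     <script>window.__NUXT__=(function(a,b){return {data:[{video:{url:a,id:b}}]}}("https://cdn.example.com/v.mp4","123"));</script>
# the regex captures the function body and its arguments, the argument
# values are substituted into the returned object literal via
# js_to_json(vars=...), and the default traverse=('data', 0) yields
# {'video': {'url': 'https://cdn.example.com/v.mp4', 'id': '123'}}.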
1708
1709 @staticmethod
1710 def _hidden_inputs(html):
1711 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1712 hidden_inputs = {}
1713 for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
1714 attrs = extract_attributes(input_el)
1715 if not attrs:
1716 continue
1717 if attrs.get('type') not in ('hidden', 'submit'):
1718 continue
1719 name = attrs.get('name') or attrs.get('id')
1720 value = attrs.get('value')
1721 if name and value is not None:
1722 hidden_inputs[name] = value
1723 return hidden_inputs
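# For illustration (not part of the original source):
#     _hidden_inputs('<input type="hidden" name="csrf" value="abc123">')
# returns {'csrf': 'abc123'}, while visible inputs and inputs lacking a
# name/id or value are skipped.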
1724
1725 def _form_hidden_inputs(self, form_id, html):
1726 form = self._search_regex(
1727 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1728 html, '%s form' % form_id, group='form')
1729 return self._hidden_inputs(form)
1730
1731 @classproperty(cache=True)
1732 def FormatSort(cls):
1733 class FormatSort(FormatSorter):
1734 def __init__(ie, *args, **kwargs):
1735 super().__init__(ie._downloader, *args, **kwargs)
1736
1737 deprecation_warning(
1738 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1739 'Use yt_dlp.utils.FormatSorter instead')
1740 return FormatSort
1741
1742 def _sort_formats(self, formats, field_preference=[]):
1743 if not field_preference:
1744 self._downloader.deprecation_warning(
1745 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1746 return
1747 self._downloader.deprecation_warning(
1748 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1749 'Return _format_sort_fields in the info_dict instead')
1750 if formats:
1751 formats[0]['__sort_fields'] = field_preference
1752
1753 def _check_formats(self, formats, video_id):
1754 if formats:
1755 formats[:] = filter(
1756 lambda f: self._is_valid_url(
1757 f['url'], video_id,
1758 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1759 formats)
1760
1761 @staticmethod
1762 def _remove_duplicate_formats(formats):
1763 format_urls = set()
1764 unique_formats = []
1765 for f in formats:
1766 if f['url'] not in format_urls:
1767 format_urls.add(f['url'])
1768 unique_formats.append(f)
1769 formats[:] = unique_formats
1770
1771 def _is_valid_url(self, url, video_id, item='video', headers={}):
1772 url = self._proto_relative_url(url, scheme='http:')
1773 # For now assume non HTTP(S) URLs always valid
1774 if not (url.startswith('http://') or url.startswith('https://')):
1775 return True
1776 try:
1777 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1778 return True
1779 except ExtractorError as e:
1780 self.to_screen(
1781 '%s: %s URL is invalid, skipping: %s'
1782 % (video_id, item, error_to_compat_str(e.cause)))
1783 return False
1784
1785 def http_scheme(self):
1786 """ Either "http:" or "https:", depending on the user's preferences """
1787 return (
1788 'http:'
1789 if self.get_param('prefer_insecure', False)
1790 else 'https:')
1791
1792 def _proto_relative_url(self, url, scheme=None):
1793 scheme = scheme or self.http_scheme()
1794 assert scheme.endswith(':')
1795 return sanitize_url(url, scheme=scheme[:-1])
1796
1797 def _sleep(self, timeout, video_id, msg_template=None):
1798 if msg_template is None:
1799 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1800 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1801 self.to_screen(msg)
1802 time.sleep(timeout)
1803
1804 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1805 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1806 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1807 if self.get_param('ignore_no_formats_error'):
1808 fatal = False
1809
1810 res = self._download_xml_handle(
1811 manifest_url, video_id, 'Downloading f4m manifest',
1812 'Unable to download f4m manifest',
1813 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1814 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1815 transform_source=transform_source,
1816 fatal=fatal, data=data, headers=headers, query=query)
1817 if res is False:
1818 return []
1819
1820 manifest, urlh = res
1821 manifest_url = urlh.url
1822
1823 return self._parse_f4m_formats(
1824 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1825 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1826
1827 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1828 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1829 fatal=True, m3u8_id=None):
1830 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1831 return []
1832
1833 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1834 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1835 if akamai_pv is not None and ';' in akamai_pv.text:
1836 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1837 if playerVerificationChallenge.strip() != '':
1838 return []
1839
1840 formats = []
1841 manifest_version = '1.0'
1842 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1843 if not media_nodes:
1844 manifest_version = '2.0'
1845 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1846 # Remove unsupported DRM protected media from final formats
1847 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1848 media_nodes = remove_encrypted_media(media_nodes)
1849 if not media_nodes:
1850 return formats
1851
1852 manifest_base_url = get_base_url(manifest)
1853
1854 bootstrap_info = xpath_element(
1855 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1856 'bootstrap info', default=None)
1857
1858 vcodec = None
1859 mime_type = xpath_text(
1860 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1861 'base URL', default=None)
1862 if mime_type and mime_type.startswith('audio/'):
1863 vcodec = 'none'
1864
1865 for i, media_el in enumerate(media_nodes):
1866 tbr = int_or_none(media_el.attrib.get('bitrate'))
1867 width = int_or_none(media_el.attrib.get('width'))
1868 height = int_or_none(media_el.attrib.get('height'))
1869 format_id = join_nonempty(f4m_id, tbr or i)
1870 # If <bootstrapInfo> is present, the specified f4m is a
1871 # stream-level manifest, and only set-level manifests may refer to
1872 # external resources. See section 11.4 and section 4 of F4M spec
1873 if bootstrap_info is None:
1874 media_url = None
1875 # @href is introduced in 2.0, see section 11.6 of F4M spec
1876 if manifest_version == '2.0':
1877 media_url = media_el.attrib.get('href')
1878 if media_url is None:
1879 media_url = media_el.attrib.get('url')
1880 if not media_url:
1881 continue
1882 manifest_url = (
1883 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1884 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1885 # If media_url is itself an f4m manifest, do the recursive extraction,
1886 # since the bitrates in the parent manifest (this one) and the media_url
1887 # manifest may differ, making it impossible to resolve the format by the
1888 # requested bitrate in the f4m downloader
1889 ext = determine_ext(manifest_url)
1890 if ext == 'f4m':
1891 f4m_formats = self._extract_f4m_formats(
1892 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1893 transform_source=transform_source, fatal=fatal)
1894 # Sometimes a stream-level manifest contains a single media entry that
1895 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player),
1896 # while the parent's media entry in the set-level manifest may
1897 # contain it. We copy it from the parent in such cases.
1898 if len(f4m_formats) == 1:
1899 f = f4m_formats[0]
1900 f.update({
1901 'tbr': f.get('tbr') or tbr,
1902 'width': f.get('width') or width,
1903 'height': f.get('height') or height,
1904 'format_id': f.get('format_id') if not tbr else format_id,
1905 'vcodec': vcodec,
1906 })
1907 formats.extend(f4m_formats)
1908 continue
1909 elif ext == 'm3u8':
1910 formats.extend(self._extract_m3u8_formats(
1911 manifest_url, video_id, 'mp4', preference=preference,
1912 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1913 continue
1914 formats.append({
1915 'format_id': format_id,
1916 'url': manifest_url,
1917 'manifest_url': manifest_url,
1918 'ext': 'flv' if bootstrap_info is not None else None,
1919 'protocol': 'f4m',
1920 'tbr': tbr,
1921 'width': width,
1922 'height': height,
1923 'vcodec': vcodec,
1924 'preference': preference,
1925 'quality': quality,
1926 })
1927 return formats
1928
1929 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1930 return {
1931 'format_id': join_nonempty(m3u8_id, 'meta'),
1932 'url': m3u8_url,
1933 'ext': ext,
1934 'protocol': 'm3u8',
1935 'preference': preference - 100 if preference else -100,
1936 'quality': quality,
1937 'resolution': 'multiple',
1938 'format_note': 'Quality selection URL',
1939 }
1940
1941 def _report_ignoring_subs(self, name):
1942 self.report_warning(bug_reports_message(
1943 f'Ignoring subtitle tracks found in the {name} manifest; '
1944 'if any subtitle tracks are missing,'
1945 ), only_once=True)
1946
1947 def _extract_m3u8_formats(self, *args, **kwargs):
1948 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1949 if subs:
1950 self._report_ignoring_subs('HLS')
1951 return fmts
1952
1953 def _extract_m3u8_formats_and_subtitles(
1954 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1955 preference=None, quality=None, m3u8_id=None, note=None,
1956 errnote=None, fatal=True, live=False, data=None, headers={},
1957 query={}):
1958
1959 if self.get_param('ignore_no_formats_error'):
1960 fatal = False
1961
1962 if not m3u8_url:
1963 if errnote is not False:
1964 errnote = errnote or 'Failed to obtain m3u8 URL'
1965 if fatal:
1966 raise ExtractorError(errnote, video_id=video_id)
1967 self.report_warning(f'{errnote}{bug_reports_message()}')
1968 return [], {}
1969
1970 res = self._download_webpage_handle(
1971 m3u8_url, video_id,
1972 note='Downloading m3u8 information' if note is None else note,
1973 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1974 fatal=fatal, data=data, headers=headers, query=query)
1975
1976 if res is False:
1977 return [], {}
1978
1979 m3u8_doc, urlh = res
1980 m3u8_url = urlh.url
1981
1982 return self._parse_m3u8_formats_and_subtitles(
1983 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1984 preference=preference, quality=quality, m3u8_id=m3u8_id,
1985 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1986 headers=headers, query=query, video_id=video_id)
1987
1988 def _parse_m3u8_formats_and_subtitles(
1989 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1990 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1991 errnote=None, fatal=True, data=None, headers={}, query={},
1992 video_id=None):
1993 formats, subtitles = [], {}
1994 has_drm = HlsFD._has_drm(m3u8_doc)
1995
1996 def format_url(url):
1997 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1998
1999 if self.get_param('hls_split_discontinuity', False):
2000 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
2001 if not m3u8_doc:
2002 if not manifest_url:
2003 return []
2004 m3u8_doc = self._download_webpage(
2005 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
2006 note=False, errnote='Failed to download m3u8 playlist information')
2007 if m3u8_doc is False:
2008 return []
2009 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
2010
2011 else:
2012 def _extract_m3u8_playlist_indices(*args, **kwargs):
2013 return [None]
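# For illustration (not part of the original source): with
# --hls-split-discontinuity, a media playlist containing two
# #EXT-X-DISCONTINUITY tags yields range(3), i.e. three formats with
# format_index 0, 1 and 2, one per discontinuity-delimited section;
# otherwise a single format with format_index=None is produced.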
2014
2015 # References:
2016 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
2017 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
2018 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
2019
2020 # We should try extracting formats only from master playlists [1, 4.3.4],
2021 # i.e. playlists that describe the available qualities. On the other hand,
2022 # media playlists [1, 4.3.3] should be returned as is, since they contain
2023 # just the media without quality renditions.
2024 # Fortunately, a master playlist can easily be distinguished from a media
2025 # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4],
2026 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2027 # As per [1, 4.3.3.1], the #EXT-X-TARGETDURATION tag is REQUIRED for every
2028 # media playlist and MUST NOT appear in a master playlist, so we can
2029 # reliably detect a media playlist with this criterion.
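# For illustration (not part of the original source), a media playlist
# looks like
#     #EXTM3U
#     #EXT-X-TARGETDURATION:10
#     #EXTINF:9.009,
#     segment0.ts
# whereas a master playlist enumerates variants instead:
#     #EXTM3U
#     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
#     low/index.m3u8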
2030
2031 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2032 formats = [{
2033 'format_id': join_nonempty(m3u8_id, idx),
2034 'format_index': idx,
2035 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2036 'ext': ext,
2037 'protocol': entry_protocol,
2038 'preference': preference,
2039 'quality': quality,
2040 'has_drm': has_drm,
2041 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2042
2043 return formats, subtitles
2044
2045 groups = {}
2046 last_stream_inf = {}
2047
2048 def extract_media(x_media_line):
2049 media = parse_m3u8_attributes(x_media_line)
2050 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2051 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2052 if not (media_type and group_id and name):
2053 return
2054 groups.setdefault(group_id, []).append(media)
2055 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2056 if media_type == 'SUBTITLES':
2057 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2058 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2059 # However, lack of URI has been spotted in the wild.
2060 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2061 if not media.get('URI'):
2062 return
2063 url = format_url(media['URI'])
2064 sub_info = {
2065 'url': url,
2066 'ext': determine_ext(url),
2067 }
2068 if sub_info['ext'] == 'm3u8':
2069 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2070 # files may contain is WebVTT:
2071 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2072 sub_info['ext'] = 'vtt'
2073 sub_info['protocol'] = 'm3u8_native'
2074 lang = media.get('LANGUAGE') or 'und'
2075 subtitles.setdefault(lang, []).append(sub_info)
2076 if media_type not in ('VIDEO', 'AUDIO'):
2077 return
2078 media_url = media.get('URI')
2079 if media_url:
2080 manifest_url = format_url(media_url)
2081 formats.extend({
2082 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2083 'format_note': name,
2084 'format_index': idx,
2085 'url': manifest_url,
2086 'manifest_url': m3u8_url,
2087 'language': media.get('LANGUAGE'),
2088 'ext': ext,
2089 'protocol': entry_protocol,
2090 'preference': preference,
2091 'quality': quality,
2092 'has_drm': has_drm,
2093 'vcodec': 'none' if media_type == 'AUDIO' else None,
2094 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2095
2096 def build_stream_name():
2097 # Although the specification does not mention a NAME attribute for
2098 # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
2099 # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2100 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2101 stream_name = last_stream_inf.get('NAME')
2102 if stream_name:
2103 return stream_name
2104 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2105 # from the corresponding rendition group
2106 stream_group_id = last_stream_inf.get('VIDEO')
2107 if not stream_group_id:
2108 return
2109 stream_group = groups.get(stream_group_id)
2110 if not stream_group:
2111 return stream_group_id
2112 rendition = stream_group[0]
2113 return rendition.get('NAME') or stream_group_id
2114
2115 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
2116 # formats can be detected even when EXT-X-STREAM-INF tags
2117 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2118 for line in m3u8_doc.splitlines():
2119 if line.startswith('#EXT-X-MEDIA:'):
2120 extract_media(line)
2121
2122 for line in m3u8_doc.splitlines():
2123 if line.startswith('#EXT-X-STREAM-INF:'):
2124 last_stream_inf = parse_m3u8_attributes(line)
2125 elif line.startswith('#') or not line.strip():
2126 continue
2127 else:
2128 tbr = float_or_none(
2129 last_stream_inf.get('AVERAGE-BANDWIDTH')
2130 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2131 manifest_url = format_url(line.strip())
2132
2133 for idx in _extract_m3u8_playlist_indices(manifest_url):
2134 format_id = [m3u8_id, None, idx]
2135 # The bandwidth of live streams may vary over time, making the
2136 # format_id unpredictable, so it's better to keep the provided
2137 # format_id intact.
2138 if not live:
2139 stream_name = build_stream_name()
2140 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2141 f = {
2142 'format_id': join_nonempty(*format_id),
2143 'format_index': idx,
2144 'url': manifest_url,
2145 'manifest_url': m3u8_url,
2146 'tbr': tbr,
2147 'ext': ext,
2148 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2149 'protocol': entry_protocol,
2150 'preference': preference,
2151 'quality': quality,
2152 'has_drm': has_drm,
2153 }
2154 resolution = last_stream_inf.get('RESOLUTION')
2155 if resolution:
2156 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2157 if mobj:
2158 f['width'] = int(mobj.group('width'))
2159 f['height'] = int(mobj.group('height'))
2160 # Unified Streaming Platform
2161 mobj = re.search(
2162 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2163 if mobj:
2164 abr, vbr = mobj.groups()
2165 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2166 f.update({
2167 'vbr': vbr,
2168 'abr': abr,
2169 })
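# For illustration (not part of the original source), a USP URL such as
# ...-audio%3D128000-video%3D1500000.m3u8 yields abr=128.0 and
# vbr=1500.0 (kbps) from the regex above.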
2170 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2171 f.update(codecs)
2172 audio_group_id = last_stream_inf.get('AUDIO')
2173 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2174 # references a rendition group MUST have a CODECS attribute.
2175 # However, this is not always respected. E.g. [2]
2176 # contains EXT-X-STREAM-INF tag which references AUDIO
2177 # rendition group but does not have CODECS and despite
2178 # referencing an audio group it represents a complete
2179 # (with audio and video) format. So, for such cases we will
2180 # ignore references to rendition groups and treat them
2181 # as complete formats.
2182 if audio_group_id and codecs and f.get('vcodec') != 'none':
2183 audio_group = groups.get(audio_group_id)
2184 if audio_group and audio_group[0].get('URI'):
2185 # TODO: update acodec for audio only formats with
2186 # the same GROUP-ID
2187 f['acodec'] = 'none'
2188 if not f.get('ext'):
2189 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2190 formats.append(f)
2191
2192 # for DailyMotion
2193 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2194 if progressive_uri:
2195 http_f = f.copy()
2196 del http_f['manifest_url']
2197 http_f.update({
2198 'format_id': f['format_id'].replace('hls-', 'http-'),
2199 'protocol': 'http',
2200 'url': progressive_uri,
2201 })
2202 formats.append(http_f)
2203
2204 last_stream_inf = {}
2205 return formats, subtitles
2206
2207 def _extract_m3u8_vod_duration(
2208 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2209
2210 m3u8_vod = self._download_webpage(
2211 m3u8_vod_url, video_id,
2212 note='Downloading m3u8 VOD manifest' if note is None else note,
2213 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2214 fatal=False, data=data, headers=headers, query=query)
2215
2216 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2217
2218 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2219 if '#EXT-X-ENDLIST' not in m3u8_vod:
2220 return None
2221
2222 return int(sum(
2223 float(line[len('#EXTINF:'):].split(',')[0])
2224 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
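# For illustration (not part of the original source), a VOD playlist with
#     #EXTINF:10.0,
#     #EXTINF:9.5,
#     #EXT-X-ENDLIST
# yields int(10.0 + 9.5) == 19, while live playlists (no #EXT-X-ENDLIST)
# return None.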
2225
2226 def _extract_mpd_vod_duration(
2227 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2228
2229 mpd_doc = self._download_xml(
2230 mpd_url, video_id,
2231 note='Downloading MPD VOD manifest' if note is None else note,
2232 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2233 fatal=False, data=data, headers=headers, query=query)
2234 if not isinstance(mpd_doc, xml.etree.ElementTree.Element):
2235 return None
2236 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2237
2238 @staticmethod
2239 def _xpath_ns(path, namespace=None):
2240 if not namespace:
2241 return path
2242 out = []
2243 for c in path.split('/'):
2244 if not c or c == '.':
2245 out.append(c)
2246 else:
2247 out.append('{%s}%s' % (namespace, c))
2248 return '/'.join(out)
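# For illustration (not part of the original source):
#     _xpath_ns('./head/meta', 'http://www.w3.org/ns/SMIL')
# returns './{http://www.w3.org/ns/SMIL}head/{http://www.w3.org/ns/SMIL}meta',
# the Clark notation that ElementTree expects for namespaced lookups.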
2249
2250 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2251 if self.get_param('ignore_no_formats_error'):
2252 fatal = False
2253
2254 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2255 if res is False:
2256 assert not fatal
2257 return [], {}
2258 smil, urlh = res
2259
2260 return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
2261 namespace=self._parse_smil_namespace(smil))
2262
2263 def _extract_smil_formats(self, *args, **kwargs):
2264 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2265 if subs:
2266 self._report_ignoring_subs('SMIL')
2267 return fmts
2268
2269 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2270 res = self._download_smil(smil_url, video_id, fatal=fatal)
2271 if res is False:
2272 return {}
2273
2274 smil, urlh = res
2275 smil_url = urlh.url
2276
2277 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2278
2279 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2280 return self._download_xml_handle(
2281 smil_url, video_id, 'Downloading SMIL file',
2282 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2283
2284 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2285 namespace = self._parse_smil_namespace(smil)
2286
2287 formats, subtitles = self._parse_smil_formats_and_subtitles(
2288 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2289
2290 video_id = os.path.splitext(url_basename(smil_url))[0]
2291 title = None
2292 description = None
2293 upload_date = None
2294 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2295 name = meta.attrib.get('name')
2296 content = meta.attrib.get('content')
2297 if not name or not content:
2298 continue
2299 if not title and name == 'title':
2300 title = content
2301 elif not description and name in ('description', 'abstract'):
2302 description = content
2303 elif not upload_date and name == 'date':
2304 upload_date = unified_strdate(content)
2305
2306 thumbnails = [{
2307 'id': image.get('type'),
2308 'url': image.get('src'),
2309 'width': int_or_none(image.get('width')),
2310 'height': int_or_none(image.get('height')),
2311 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2312
2313 return {
2314 'id': video_id,
2315 'title': title or video_id,
2316 'description': description,
2317 'upload_date': upload_date,
2318 'thumbnails': thumbnails,
2319 'formats': formats,
2320 'subtitles': subtitles,
2321 }
2322
2323 def _parse_smil_namespace(self, smil):
2324 return self._search_regex(
2325 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2326
2327 def _parse_smil_formats(self, *args, **kwargs):
2328 fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
2329 if subs:
2330 self._report_ignoring_subs('SMIL')
2331 return fmts
2332
2333 def _parse_smil_formats_and_subtitles(
2334 self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2335 base = smil_url
2336 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2337 b = meta.get('base') or meta.get('httpBase')
2338 if b:
2339 base = b
2340 break
2341
2342 formats, subtitles = [], {}
2343 rtmp_count = 0
2344 http_count = 0
2345 m3u8_count = 0
2346 imgs_count = 0
2347
2348 srcs = set()
2349 media = itertools.chain.from_iterable(
2350 smil.findall(self._xpath_ns(arg, namespace))
2351 for arg in ['.//video', './/audio', './/media'])
2352 for medium in media:
2353 src = medium.get('src')
2354 if not src or src in srcs:
2355 continue
2356 srcs.add(src)
2357
2358 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2359 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2360 width = int_or_none(medium.get('width'))
2361 height = int_or_none(medium.get('height'))
2362 proto = medium.get('proto')
2363 ext = medium.get('ext')
2364 src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2365 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2366 streamer = medium.get('streamer') or base
2367
2368 if proto == 'rtmp' or streamer.startswith('rtmp'):
2369 rtmp_count += 1
2370 formats.append({
2371 'url': streamer,
2372 'play_path': src,
2373 'ext': 'flv',
2374 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2375 'tbr': bitrate,
2376 'filesize': filesize,
2377 'width': width,
2378 'height': height,
2379 })
2380 if transform_rtmp_url:
2381 streamer, src = transform_rtmp_url(streamer, src)
2382 formats[-1].update({
2383 'url': streamer,
2384 'play_path': src,
2385 })
2386 continue
2387
2388 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2389 src_url = src_url.strip()
2390
2391 if proto == 'm3u8' or src_ext == 'm3u8':
2392 m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
2393 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2394 self._merge_subtitles(m3u8_subs, target=subtitles)
2395 if len(m3u8_formats) == 1:
2396 m3u8_count += 1
2397 m3u8_formats[0].update({
2398 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2399 'tbr': bitrate,
2400 'width': width,
2401 'height': height,
2402 })
2403 formats.extend(m3u8_formats)
2404 elif src_ext == 'f4m':
2405 f4m_url = src_url
2406 if not f4m_params:
2407 f4m_params = {
2408 'hdcore': '3.2.0',
2409 'plugin': 'flowplayer-3.2.0.1',
2410 }
2411 f4m_url += '&' if '?' in f4m_url else '?'
2412 f4m_url += urllib.parse.urlencode(f4m_params)
2413 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2414 elif src_ext == 'mpd':
2415 mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
2416 src_url, video_id, mpd_id='dash', fatal=False)
2417 formats.extend(mpd_formats)
2418 self._merge_subtitles(mpd_subs, target=subtitles)
2419 elif re.search(r'\.ism/[Mm]anifest', src_url):
2420 ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
2421 src_url, video_id, ism_id='mss', fatal=False)
2422 formats.extend(ism_formats)
2423 self._merge_subtitles(ism_subs, target=subtitles)
2424 elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):  # validate the resolved URL, not the possibly relative src
2425 http_count += 1
2426 formats.append({
2427 'url': src_url,
2428 'ext': ext or src_ext or 'flv',
2429 'format_id': 'http-%d' % (bitrate or http_count),
2430 'tbr': bitrate,
2431 'filesize': filesize,
2432 'width': width,
2433 'height': height,
2434 })
2435
2436 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2437 src = medium.get('src')
2438 if not src or src in srcs:
2439 continue
2440 srcs.add(src)
2441
2442 imgs_count += 1
2443 formats.append({
2444 'format_id': 'imagestream-%d' % (imgs_count),
2445 'url': src,
2446 'ext': mimetype2ext(medium.get('type')),
2447 'acodec': 'none',
2448 'vcodec': 'none',
2449 'width': int_or_none(medium.get('width')),
2450 'height': int_or_none(medium.get('height')),
2451 'format_note': 'SMIL storyboards',
2452 })
2453
2454 smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
2455 self._merge_subtitles(smil_subs, target=subtitles)
2456
2457 return formats, subtitles
2458
2459 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2460 urls = []
2461 subtitles = {}
2462 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2463 src = textstream.get('src')
2464 if not src or src in urls:
2465 continue
2466 urls.append(src)
2467 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2468 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2469 subtitles.setdefault(lang, []).append({
2470 'url': src,
2471 'ext': ext,
2472 })
2473 return subtitles
2474
2475 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2476 res = self._download_xml_handle(
2477 xspf_url, playlist_id, 'Downloading xspf playlist',
2478 'Unable to download xspf manifest', fatal=fatal)
2479 if res is False:
2480 return []
2481
2482 xspf, urlh = res
2483 xspf_url = urlh.url
2484
2485 return self._parse_xspf(
2486 xspf, playlist_id, xspf_url=xspf_url,
2487 xspf_base_url=base_url(xspf_url))
2488
2489 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2490 NS_MAP = {
2491 'xspf': 'http://xspf.org/ns/0/',
2492 's1': 'http://static.streamone.nl/player/ns/0',
2493 }
2494
2495 entries = []
2496 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2497 title = xpath_text(
2498 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2499 description = xpath_text(
2500 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2501 thumbnail = xpath_text(
2502 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2503 duration = float_or_none(
2504 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2505
2506 formats = []
2507 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2508 format_url = urljoin(xspf_base_url, location.text)
2509 if not format_url:
2510 continue
2511 formats.append({
2512 'url': format_url,
2513 'manifest_url': xspf_url,
2514 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2515 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2516 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2517 })
2518
2519 entries.append({
2520 'id': playlist_id,
2521 'title': title,
2522 'description': description,
2523 'thumbnail': thumbnail,
2524 'duration': duration,
2525 'formats': formats,
2526 })
2527 return entries
2528
2529 def _extract_mpd_formats(self, *args, **kwargs):
2530 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2531 if subs:
2532 self._report_ignoring_subs('DASH')
2533 return fmts
2534
2535 def _extract_mpd_formats_and_subtitles(self, *args, **kwargs):
2536 periods = self._extract_mpd_periods(*args, **kwargs)
2537 return self._merge_mpd_periods(periods)
2538
2539 def _extract_mpd_periods(
2540 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2541 fatal=True, data=None, headers={}, query={}):
2542
2543 if self.get_param('ignore_no_formats_error'):
2544 fatal = False
2545
2546 res = self._download_xml_handle(
2547 mpd_url, video_id,
2548 note='Downloading MPD manifest' if note is None else note,
2549 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2550 fatal=fatal, data=data, headers=headers, query=query)
2551 if res is False:
2552 return []
2553 mpd_doc, urlh = res
2554 if mpd_doc is None:
2555 return []
2556
2557 # We may have been redirected to a new URL when we retrieved our MPD file.
2558 mpd_url = urlh.url
2559 mpd_base_url = base_url(mpd_url)
2560
2561 return self._parse_mpd_periods(mpd_doc, mpd_id, mpd_base_url, mpd_url)
2562
2563 def _parse_mpd_formats(self, *args, **kwargs):
2564 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2565 if subs:
2566 self._report_ignoring_subs('DASH')
2567 return fmts
2568
2569 def _parse_mpd_formats_and_subtitles(self, *args, **kwargs):
2570 periods = self._parse_mpd_periods(*args, **kwargs)
2571 return self._merge_mpd_periods(periods)
2572
2573 def _merge_mpd_periods(self, periods):
2574 """
2575 Combine all formats and subtitles from an MPD manifest into a single list,
2576 by concatenating streams with similar formats.
2577 """
2578 formats, subtitles = {}, {}
2579 for period in periods:
2580 for f in period['formats']:
2581 assert 'is_dash_periods' not in f, 'format already processed'
2582 f['is_dash_periods'] = True
2583 format_key = tuple(v for k, v in f.items() if k not in (
2584 'format_id', 'fragments', 'manifest_stream_number'))
2585 if format_key not in formats:
2586 formats[format_key] = f
2587 elif 'fragments' in f:
2588 formats[format_key].setdefault('fragments', []).extend(f['fragments'])
2589
2590 if subtitles and period['subtitles']:
2591 self.report_warning(bug_reports_message(
2592 'Found subtitles in multiple periods in the DASH manifest; '
2593 'if some of the subtitles are missing,'
2594 ), only_once=True)
2595
2596 for sub_lang, sub_info in period['subtitles'].items():
2597 subtitles.setdefault(sub_lang, []).extend(sub_info)
2598
2599 return list(formats.values()), subtitles
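# For illustration (not part of the original source): two periods that each
# carry a 720p video stream with identical fields (apart from format_id,
# fragments and manifest_stream_number) collapse into one format whose
# 'fragments' list is the concatenation of both periods' fragments.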
2600
2601 def _parse_mpd_periods(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2602 """
2603 Parse formats from MPD manifest.
2604 References:
2605 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2606 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2607 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2608 """
2609 if not self.get_param('dynamic_mpd', True):
2610 if mpd_doc.get('type') == 'dynamic':
2611 return [], {}
2612
2613 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2614
2615 def _add_ns(path):
2616 return self._xpath_ns(path, namespace)
2617
2618 def is_drm_protected(element):
2619 return element.find(_add_ns('ContentProtection')) is not None
2620
2621 def extract_multisegment_info(element, ms_parent_info):
2622 ms_info = ms_parent_info.copy()
2623
2624 # As per [1, 5.3.9.2.2], SegmentList and SegmentTemplate share some
2625 # common attributes and elements; we only extract the ones relevant
2626 # for us.
2627 def extract_common(source):
2628 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2629 if segment_timeline is not None:
2630 s_e = segment_timeline.findall(_add_ns('S'))
2631 if s_e:
2632 ms_info['total_number'] = 0
2633 ms_info['s'] = []
2634 for s in s_e:
2635 r = int(s.get('r', 0))
2636 ms_info['total_number'] += 1 + r
2637 ms_info['s'].append({
2638 't': int(s.get('t', 0)),
2639 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2640 'd': int(s.attrib['d']),
2641 'r': r,
2642 })
2643 start_number = source.get('startNumber')
2644 if start_number:
2645 ms_info['start_number'] = int(start_number)
2646 timescale = source.get('timescale')
2647 if timescale:
2648 ms_info['timescale'] = int(timescale)
2649 segment_duration = source.get('duration')
2650 if segment_duration:
2651 ms_info['segment_duration'] = float(segment_duration)
2652
2653 def extract_Initialization(source):
2654 initialization = source.find(_add_ns('Initialization'))
2655 if initialization is not None:
2656 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2657
2658 segment_list = element.find(_add_ns('SegmentList'))
2659 if segment_list is not None:
2660 extract_common(segment_list)
2661 extract_Initialization(segment_list)
2662 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2663 if segment_urls_e:
2664 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2665 else:
2666 segment_template = element.find(_add_ns('SegmentTemplate'))
2667 if segment_template is not None:
2668 extract_common(segment_template)
2669 media = segment_template.get('media')
2670 if media:
2671 ms_info['media'] = media
2672 initialization = segment_template.get('initialization')
2673 if initialization:
2674 ms_info['initialization'] = initialization
2675 else:
2676 extract_Initialization(segment_template)
2677 return ms_info
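# Illustrative example (not part of the original source): a
#     <SegmentTemplate timescale="90000" media="seg-$Number$.m4s" startNumber="1">
#       <SegmentTimeline><S t="0" d="180000" r="1"/></SegmentTimeline>
#     </SegmentTemplate>
# produces ms_info with timescale=90000, media='seg-$Number$.m4s',
# start_number=1, total_number=2 and s=[{'t': 0, 'd': 180000, 'r': 1}].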
2678
2679 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2680 stream_numbers = collections.defaultdict(int)
2681 for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))):
2682 period_entry = {
2683 'id': period.get('id', f'period-{period_idx}'),
2684 'formats': [],
2685 'subtitles': collections.defaultdict(list),
2686 }
2687 period_duration = parse_duration(period.get('duration')) or mpd_duration
2688 period_ms_info = extract_multisegment_info(period, {
2689 'start_number': 1,
2690 'timescale': 1,
2691 })
2692 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2693 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2694 for representation in adaptation_set.findall(_add_ns('Representation')):
2695 representation_attrib = adaptation_set.attrib.copy()
2696 representation_attrib.update(representation.attrib)
2697 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2698 mime_type = representation_attrib['mimeType']
2699 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2700
2701 codec_str = representation_attrib.get('codecs', '')
2702 # Some kind of binary subtitle found in some youtube livestreams
2703 if mime_type == 'application/x-rawcc':
2704 codecs = {'scodec': codec_str}
2705 else:
2706 codecs = parse_codecs(codec_str)
2707 if content_type not in ('video', 'audio', 'text'):
2708 if mime_type == 'image/jpeg':
2709 content_type = mime_type
2710 elif codecs.get('vcodec', 'none') != 'none':
2711 content_type = 'video'
2712 elif codecs.get('acodec', 'none') != 'none':
2713 content_type = 'audio'
2714 elif codecs.get('scodec', 'none') != 'none':
2715 content_type = 'text'
2716 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2717 content_type = 'text'
2718 else:
2719 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2720 continue
2721
2722 base_url = ''
2723 for element in (representation, adaptation_set, period, mpd_doc):
2724 base_url_e = element.find(_add_ns('BaseURL'))
2725 if try_call(lambda: base_url_e.text) is not None:
2726 base_url = base_url_e.text + base_url
2727 if re.match(r'^https?://', base_url):
2728 break
2729 if mpd_base_url and base_url.startswith('/'):
2730 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2731 elif mpd_base_url and not re.match(r'^https?://', base_url):
2732 if not mpd_base_url.endswith('/'):
2733 mpd_base_url += '/'
2734 base_url = mpd_base_url + base_url
2735 representation_id = representation_attrib.get('id')
2736 lang = representation_attrib.get('lang')
2737 url_el = representation.find(_add_ns('BaseURL'))
2738 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2739 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2740 if representation_id is not None:
2741 format_id = representation_id
2742 else:
2743 format_id = content_type
2744 if mpd_id:
2745 format_id = mpd_id + '-' + format_id
2746 if content_type in ('video', 'audio'):
2747 f = {
2748 'format_id': format_id,
2749 'manifest_url': mpd_url,
2750 'ext': mimetype2ext(mime_type),
2751 'width': int_or_none(representation_attrib.get('width')),
2752 'height': int_or_none(representation_attrib.get('height')),
2753 'tbr': float_or_none(bandwidth, 1000),
2754 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2755 'fps': int_or_none(representation_attrib.get('frameRate')),
2756 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2757 'format_note': 'DASH %s' % content_type,
2758 'filesize': filesize,
2759 'container': mimetype2ext(mime_type) + '_dash',
2760 **codecs
2761 }
2762 elif content_type == 'text':
2763 f = {
2764 'ext': mimetype2ext(mime_type),
2765 'manifest_url': mpd_url,
2766 'filesize': filesize,
2767 }
2768 elif content_type == 'image/jpeg':
2769 # See test case in VikiIE
2770 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2771 f = {
2772 'format_id': format_id,
2773 'ext': 'mhtml',
2774 'manifest_url': mpd_url,
2775 'format_note': 'DASH storyboards (jpeg)',
2776 'acodec': 'none',
2777 'vcodec': 'none',
2778 }
2779 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2780 f['has_drm'] = True
2781 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2782
2783 def prepare_template(template_name, identifiers):
2784 tmpl = representation_ms_info[template_name]
2785 if representation_id is not None:
2786 tmpl = tmpl.replace('$RepresentationID$', representation_id)
2787 # First off, % characters outside $...$ templates
2788 # must be escaped by doubling for proper processing
2789 # by the %-operator string formatting used below (see
2790 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2791 t = ''
2792 in_template = False
2793 for c in tmpl:
2794 t += c
2795 if c == '$':
2796 in_template = not in_template
2797 elif c == '%' and not in_template:
2798 t += c
2799 # Next, $...$ templates are translated to their
2800 # %(...) counterparts to be used with % operator
2801 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2802 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2803 t = t.replace('$$', '$')  # assign the result; str.replace does not modify in place
2804 return t
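# For illustration (not part of the original source), assuming
# representation_id == 'video1',
#     prepare_template('media', ('Number', 'Bandwidth', 'Time'))
# turns 'seg-$RepresentationID$-$Number%05d$.m4s' into
# 'seg-video1-%(Number)05d.m4s', ready for % substitution per fragment.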
2805
2806 # @initialization is a regular template like @media one
2807 # so it should be handled just the same way (see
2808 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2809 if 'initialization' in representation_ms_info:
2810 initialization_template = prepare_template(
2811 'initialization',
2812 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2813 # $Time$ shall not be included for @initialization thus
2814 # only $Bandwidth$ remains
2815 ('Bandwidth', ))
2816 representation_ms_info['initialization_url'] = initialization_template % {
2817 'Bandwidth': bandwidth,
2818 }
2819
2820 def location_key(location):
2821 return 'url' if re.match(r'^https?://', location) else 'path'
2822
2823 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2824
2825 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2826 media_location_key = location_key(media_template)
2827
2828 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2829 # can't be used at the same time
2830 if '%(Number' in media_template and 's' not in representation_ms_info:
2831 segment_duration = None
2832 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2833 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2834 representation_ms_info['total_number'] = int(math.ceil(
2835 float_or_none(period_duration, segment_duration, default=0)))
2836 representation_ms_info['fragments'] = [{
2837 media_location_key: media_template % {
2838 'Number': segment_number,
2839 'Bandwidth': bandwidth,
2840 },
2841 'duration': segment_duration,
2842 } for segment_number in range(
2843 representation_ms_info['start_number'],
2844 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2845 else:
2846 # $Number*$ or $Time$ in media template with S list available
2847 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2848 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2849 representation_ms_info['fragments'] = []
2850 segment_time = 0
2851 segment_d = None
2852 segment_number = representation_ms_info['start_number']
2853
2854 def add_segment_url():
2855 segment_url = media_template % {
2856 'Time': segment_time,
2857 'Bandwidth': bandwidth,
2858 'Number': segment_number,
2859 }
2860 representation_ms_info['fragments'].append({
2861 media_location_key: segment_url,
2862 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2863 })
2864
2865 for num, s in enumerate(representation_ms_info['s']):
2866 segment_time = s.get('t') or segment_time
2867 segment_d = s['d']
2868 add_segment_url()
2869 segment_number += 1
2870 for r in range(s.get('r', 0)):
2871 segment_time += segment_d
2872 add_segment_url()
2873 segment_number += 1
2874 segment_time += segment_d
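# Editor's note (hypothetical values, timescale 90000): an S element
#   <S t="0" d="180000" r="1"/>
# produces fragments at Time=0 and Time=180000, each lasting
# 180000/90000 = 2.0 s; @r counts *additional* repeats beyond the first.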
2875 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2876 # No media template,
2877 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2878 # or any YouTube dashsegments video
2879 fragments = []
2880 segment_index = 0
2881 timescale = representation_ms_info['timescale']
2882 for s in representation_ms_info['s']:
2883 duration = float_or_none(s['d'], timescale)
2884 for r in range(s.get('r', 0) + 1):
2885 segment_uri = representation_ms_info['segment_urls'][segment_index]
2886 fragments.append({
2887 location_key(segment_uri): segment_uri,
2888 'duration': duration,
2889 })
2890 segment_index += 1
2891 representation_ms_info['fragments'] = fragments
2892 elif 'segment_urls' in representation_ms_info:
2893 # Segment URLs with no SegmentTimeline
2894 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2895 # https://github.com/ytdl-org/youtube-dl/pull/14844
2896 fragments = []
2897 segment_duration = float_or_none(
2898 representation_ms_info['segment_duration'],
2899 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2900 for segment_url in representation_ms_info['segment_urls']:
2901 fragment = {
2902 location_key(segment_url): segment_url,
2903 }
2904 if segment_duration:
2905 fragment['duration'] = segment_duration
2906 fragments.append(fragment)
2907 representation_ms_info['fragments'] = fragments
2908 # If a fragments key is present, we correctly recognized fragmented media.
2909 # Otherwise we assume unfragmented media with direct access. Technically, this
2910 # assumption is not necessarily correct, since we may simply not yet support
2911 # some forms of fragmented media renditions; for now we'll use this fallback.
2912 if 'fragments' in representation_ms_info:
2913 f.update({
2914 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2915 'url': mpd_url or base_url,
2916 'fragment_base_url': base_url,
2917 'fragments': [],
2918 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2919 })
2920 if 'initialization_url' in representation_ms_info:
2921 initialization_url = representation_ms_info['initialization_url']
2922 if not f.get('url'):
2923 f['url'] = initialization_url
2924 f['fragments'].append({location_key(initialization_url): initialization_url})
2925 f['fragments'].extend(representation_ms_info['fragments'])
2926 if not period_duration:
2927 period_duration = try_get(
2928 representation_ms_info,
2929 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2930 else:
2931 # Assuming direct URL to unfragmented media.
2932 f['url'] = base_url
2933 if content_type in ('video', 'audio', 'image/jpeg'):
2934 f['manifest_stream_number'] = stream_numbers[f['url']]
2935 stream_numbers[f['url']] += 1
2936 period_entry['formats'].append(f)
2937 elif content_type == 'text':
2938 period_entry['subtitles'][lang or 'und'].append(f)
2939 yield period_entry
2940
2941 def _extract_ism_formats(self, *args, **kwargs):
2942 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2943 if subs:
2944 self._report_ignoring_subs('ISM')
2945 return fmts
2946
2947 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2948 if self.get_param('ignore_no_formats_error'):
2949 fatal = False
2950
2951 res = self._download_xml_handle(
2952 ism_url, video_id,
2953 note='Downloading ISM manifest' if note is None else note,
2954 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2955 fatal=fatal, data=data, headers=headers, query=query)
2956 if res is False:
2957 return [], {}
2958 ism_doc, urlh = res
2959 if ism_doc is None:
2960 return [], {}
2961
2962 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
2963
2964 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2965 """
2966 Parse formats from ISM manifest.
2967 References:
2968 1. [MS-SSTR]: Smooth Streaming Protocol,
2969 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2970 """
2971 if ism_doc.get('IsLive') == 'TRUE':
2972 return [], {}
2973
2974 duration = int(ism_doc.attrib['Duration'])
2975 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2976
2977 formats = []
2978 subtitles = {}
2979 for stream in ism_doc.findall('StreamIndex'):
2980 stream_type = stream.get('Type')
2981 if stream_type not in ('video', 'audio', 'text'):
2982 continue
2983 url_pattern = stream.attrib['Url']
2984 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2985 stream_name = stream.get('Name')
2986 stream_language = stream.get('Language', 'und')
2987 for track in stream.findall('QualityLevel'):
2988 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2989 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2990 # TODO: add support for WVC1 and WMAP
2991 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2992 self.report_warning('%s is not a supported codec' % fourcc)
2993 continue
2994 tbr = int(track.attrib['Bitrate']) // 1000
2995 # [1] does not mention Width and Height attributes. However,
2996 # they're often present while MaxWidth and MaxHeight are
2997 # missing, so they should be used as fallbacks
2998 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2999 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
3000 sampling_rate = int_or_none(track.get('SamplingRate'))
3001
3002 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
3003 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
3004
3005 fragments = []
3006 fragment_ctx = {
3007 'time': 0,
3008 }
3009 stream_fragments = stream.findall('c')
3010 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
3011 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
3012 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
3013 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
3014 if not fragment_ctx['duration']:
3015 try:
3016 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])  # index the fragment list, not the element's children
3017 except IndexError:
3018 next_fragment_time = duration
3019 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
3020 for _ in range(fragment_repeat):
3021 fragments.append({
3022 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
3023 'duration': fragment_ctx['duration'] / stream_timescale,
3024 })
3025 fragment_ctx['time'] += fragment_ctx['duration']
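# Editor's note (hypothetical values, stream_timescale 10000000): a
#   <c t="0" d="20000000" r="2"/>
# element produces two 2-second fragments whose {start_time}/{start time}
# placeholders are filled with 0 and 20000000; here @r is the total
# fragment count (defaulting to 1), unlike DASH's "additional repeats".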
3026
3027 if stream_type == 'text':
3028 subtitles.setdefault(stream_language, []).append({
3029 'ext': 'ismt',
3030 'protocol': 'ism',
3031 'url': ism_url,
3032 'manifest_url': ism_url,
3033 'fragments': fragments,
3034 '_download_params': {
3035 'stream_type': stream_type,
3036 'duration': duration,
3037 'timescale': stream_timescale,
3038 'fourcc': fourcc,
3039 'language': stream_language,
3040 'codec_private_data': track.get('CodecPrivateData'),
3041 }
3042 })
3043 elif stream_type in ('video', 'audio'):
3044 formats.append({
3045 'format_id': join_nonempty(ism_id, stream_name, tbr),
3046 'url': ism_url,
3047 'manifest_url': ism_url,
3048 'ext': 'ismv' if stream_type == 'video' else 'isma',
3049 'width': width,
3050 'height': height,
3051 'tbr': tbr,
3052 'asr': sampling_rate,
3053 'vcodec': 'none' if stream_type == 'audio' else fourcc,
3054 'acodec': 'none' if stream_type == 'video' else fourcc,
3055 'protocol': 'ism',
3056 'fragments': fragments,
3057 'has_drm': ism_doc.find('Protection') is not None,
3058 'language': stream_language,
3059 'audio_channels': int_or_none(track.get('Channels')),
3060 '_download_params': {
3061 'stream_type': stream_type,
3062 'duration': duration,
3063 'timescale': stream_timescale,
3064 'width': width or 0,
3065 'height': height or 0,
3066 'fourcc': fourcc,
3067 'language': stream_language,
3068 'codec_private_data': track.get('CodecPrivateData'),
3069 'sampling_rate': sampling_rate,
3070 'channels': int_or_none(track.get('Channels', 2)),
3071 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3072 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3073 },
3074 })
3075 return formats, subtitles
3076
3077 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3078 def absolute_url(item_url):
3079 return urljoin(base_url, item_url)
3080
3081 def parse_content_type(content_type):
3082 if not content_type:
3083 return {}
3084 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3085 if ctr:
3086 mimetype, codecs = ctr.groups()
3087 f = parse_codecs(codecs)
3088 f['ext'] = mimetype2ext(mimetype)
3089 return f
3090 return {}
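# Editor's illustration (hypothetical input; output abridged): for
#   parse_content_type('video/mp4; codecs="avc1.64001F, mp4a.40.2"')
# parse_codecs() yields roughly {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', ...}
# and mimetype2ext() adds 'ext': 'mp4'.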
3091
3092 def _media_formats(src, cur_media_type, type_info=None):
3093 type_info = type_info or {}
3094 full_url = absolute_url(src)
3095 ext = type_info.get('ext') or determine_ext(full_url)
3096 if ext == 'm3u8':
3097 is_plain_url = False
3098 formats = self._extract_m3u8_formats(
3099 full_url, video_id, ext='mp4',
3100 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3101 preference=preference, quality=quality, fatal=False)
3102 elif ext == 'mpd':
3103 is_plain_url = False
3104 formats = self._extract_mpd_formats(
3105 full_url, video_id, mpd_id=mpd_id, fatal=False)
3106 else:
3107 is_plain_url = True
3108 formats = [{
3109 'url': full_url,
3110 'vcodec': 'none' if cur_media_type == 'audio' else None,
3111 'ext': ext,
3112 }]
3113 return is_plain_url, formats
3114
3115 entries = []
3116 # amp-video and amp-audio are very similar to their HTML5 counterparts
3117 # so we will include them right here (see
3118 # https://www.ampproject.org/docs/reference/components/amp-video)
3119 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3120 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3121 media_tags = [(media_tag, media_tag_name, media_type, '')
3122 for media_tag, media_tag_name, media_type
3123 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3124 media_tags.extend(re.findall(
3125 # We only allow video|audio followed by a whitespace or '>'.
3126 # Allowing more characters may result in a significant slowdown (see
3127 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3128 # e.g. http://www.porntrex.com/maps/videositemap.xml).
3129 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
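# Editor's note: the two patterns above match, respectively, self-closing
# tags such as '<video src="a.mp4"/>' and paired tags such as
# '<amp-video width="640">...</amp-video>' (examples are hypothetical).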
3130 for media_tag, _, media_type, media_content in media_tags:
3131 media_info = {
3132 'formats': [],
3133 'subtitles': {},
3134 }
3135 media_attributes = extract_attributes(media_tag)
3136 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3137 if src:
3138 f = parse_content_type(media_attributes.get('type'))
3139 _, formats = _media_formats(src, media_type, f)
3140 media_info['formats'].extend(formats)
3141 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3142 if media_content:
3143 for source_tag in re.findall(r'<source[^>]+>', media_content):
3144 s_attr = extract_attributes(source_tag)
3145 # data-video-src and data-src are non-standard but seen
3146 # several times in the wild
3147 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3148 if not src:
3149 continue
3150 f = parse_content_type(s_attr.get('type'))
3151 is_plain_url, formats = _media_formats(src, media_type, f)
3152 if is_plain_url:
3153 # width, height, res, label and title attributes are
3154 # all non-standard but seen several times in the wild
3155 labels = [
3156 s_attr.get(lbl)
3157 for lbl in ('label', 'title')
3158 if str_or_none(s_attr.get(lbl))
3159 ]
3160 width = int_or_none(s_attr.get('width'))
3161 height = (int_or_none(s_attr.get('height'))
3162 or int_or_none(s_attr.get('res')))
3163 if not width or not height:
3164 for lbl in labels:
3165 resolution = parse_resolution(lbl)
3166 if not resolution:
3167 continue
3168 width = width or resolution.get('width')
3169 height = height or resolution.get('height')
3170 for lbl in labels:
3171 tbr = parse_bitrate(lbl)
3172 if tbr:
3173 break
3174 else:
3175 tbr = None
3176 f.update({
3177 'width': width,
3178 'height': height,
3179 'tbr': tbr,
3180 'format_id': s_attr.get('label') or s_attr.get('title'),
3181 })
3182 f.update(formats[0])
3183 media_info['formats'].append(f)
3184 else:
3185 media_info['formats'].extend(formats)
3186 for track_tag in re.findall(r'<track[^>]+>', media_content):
3187 track_attributes = extract_attributes(track_tag)
3188 kind = track_attributes.get('kind')
3189 if not kind or kind in ('subtitles', 'captions'):
3190 src = strip_or_none(track_attributes.get('src'))
3191 if not src:
3192 continue
3193 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3194 media_info['subtitles'].setdefault(lang, []).append({
3195 'url': absolute_url(src),
3196 })
3197 for f in media_info['formats']:
3198 f.setdefault('http_headers', {})['Referer'] = base_url
3199 if media_info['formats'] or media_info['subtitles']:
3200 entries.append(media_info)
3201 return entries
3202
3203 def _extract_akamai_formats(self, *args, **kwargs):
3204 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3205 if subs:
3206 self._report_ignoring_subs('akamai')
3207 return fmts
3208
3209 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3210 signed = 'hdnea=' in manifest_url
3211 if not signed:
3212 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3213 manifest_url = re.sub(
3214 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3215 '', manifest_url).strip('?')
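# Editor's illustration (hypothetical URL): the substitution above strips
# Akamai stream-packaging parameters, turning
#   'https://example.akamaihd.net/i/foo/master.m3u8?b=100-1000&__b__=450'
# into 'https://example.akamaihd.net/i/foo/master.m3u8' (the trailing '?'
# is removed by strip('?')).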
3216
3217 formats = []
3218 subtitles = {}
3219
3220 hdcore_sign = 'hdcore=3.7.0'
3221 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3222 hds_host = hosts.get('hds')
3223 if hds_host:
3224 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3225 if 'hdcore=' not in f4m_url:
3226 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3227 f4m_formats = self._extract_f4m_formats(
3228 f4m_url, video_id, f4m_id='hds', fatal=False)
3229 for entry in f4m_formats:
3230 entry.update({'extra_param_to_segment_url': hdcore_sign})
3231 formats.extend(f4m_formats)
3232
3233 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3234 hls_host = hosts.get('hls')
3235 if hls_host:
3236 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3237 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3238 m3u8_url, video_id, 'mp4', 'm3u8_native',
3239 m3u8_id='hls', fatal=False)
3240 formats.extend(m3u8_formats)
3241 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3242
3243 http_host = hosts.get('http')
3244 if http_host and m3u8_formats and not signed:
3245 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3246 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3247 qualities_length = len(qualities)
3248 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3249 i = 0
3250 for f in m3u8_formats:
3251 if f['vcodec'] != 'none':
3252 for protocol in ('http', 'https'):
3253 http_f = f.copy()
3254 del http_f['manifest_url']
3255 http_url = re.sub(
3256 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3257 http_f.update({
3258 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3259 'url': http_url,
3260 'protocol': protocol,
3261 })
3262 formats.append(http_f)
3263 i += 1
3264
3265 return formats, subtitles
3266
3267 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3268 query = urllib.parse.urlparse(url).query
3269 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3270 mobj = re.search(
3271 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3272 url_base = mobj.group('url')
3273 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3274 formats = []
3275
3276 def manifest_url(manifest):
3277 m_url = f'{http_base_url}/{manifest}'
3278 if query:
3279 m_url += '?%s' % query
3280 return m_url
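# Editor's illustration (hypothetical URL): for
#   'http://example.com/vod/mp4:clip.mp4/playlist.m3u8?token=x'
# url_base is '//example.com/vod/mp4:clip.mp4' and manifest_url('manifest.mpd')
# yields 'http://example.com/vod/mp4:clip.mp4/manifest.mpd?token=x'.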
3281
3282 if 'm3u8' not in skip_protocols:
3283 formats.extend(self._extract_m3u8_formats(
3284 manifest_url('playlist.m3u8'), video_id, 'mp4',
3285 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3286 if 'f4m' not in skip_protocols:
3287 formats.extend(self._extract_f4m_formats(
3288 manifest_url('manifest.f4m'),
3289 video_id, f4m_id='hds', fatal=False))
3290 if 'dash' not in skip_protocols:
3291 formats.extend(self._extract_mpd_formats(
3292 manifest_url('manifest.mpd'),
3293 video_id, mpd_id='dash', fatal=False))
3294 if re.search(r'(?:/smil:|\.smil)', url_base):
3295 if 'smil' not in skip_protocols:
3296 rtmp_formats = self._extract_smil_formats(
3297 manifest_url('jwplayer.smil'),
3298 video_id, fatal=False)
3299 for rtmp_format in rtmp_formats:
3300 rtsp_format = rtmp_format.copy()
3301 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3302 del rtsp_format['play_path']
3303 del rtsp_format['ext']
3304 rtsp_format.update({
3305 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3306 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3307 'protocol': 'rtsp',
3308 })
3309 formats.extend([rtmp_format, rtsp_format])
3310 else:
3311 for protocol in ('rtmp', 'rtsp'):
3312 if protocol not in skip_protocols:
3313 formats.append({
3314 'url': f'{protocol}:{url_base}',
3315 'format_id': protocol,
3316 'protocol': protocol,
3317 })
3318 return formats
3319
3320 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3321 mobj = re.search(
3322 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3323 webpage)
3324 if mobj:
3325 try:
3326 jwplayer_data = self._parse_json(mobj.group('options'),
3327 video_id=video_id,
3328 transform_source=transform_source)
3329 except ExtractorError:
3330 pass
3331 else:
3332 if isinstance(jwplayer_data, dict):
3333 return jwplayer_data
3334
3335 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3336 jwplayer_data = self._find_jwplayer_data(
3337 webpage, video_id, transform_source=js_to_json)
3338 return self._parse_jwplayer_data(
3339 jwplayer_data, video_id, *args, **kwargs)
3340
3341 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3342 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3343 entries = []
3344 if not isinstance(jwplayer_data, dict):
3345 return entries
3346
3347 playlist_items = jwplayer_data.get('playlist')
3348 # JWPlayer backward compatibility: single playlist item/flattened playlists
3349 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3350 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3351 if not isinstance(playlist_items, list):
3352 playlist_items = (playlist_items or jwplayer_data, )
3353
3354 for video_data in playlist_items:
3355 if not isinstance(video_data, dict):
3356 continue
3357 # JWPlayer backward compatibility: flattened sources
3358 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3359 if 'sources' not in video_data:
3360 video_data['sources'] = [video_data]
3361
3362 this_video_id = video_id or video_data['mediaid']
3363
3364 formats = self._parse_jwplayer_formats(
3365 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3366 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3367
3368 subtitles = {}
3369 tracks = video_data.get('tracks')
3370 if tracks and isinstance(tracks, list):
3371 for track in tracks:
3372 if not isinstance(track, dict):
3373 continue
3374 track_kind = track.get('kind')
3375 if not track_kind or not isinstance(track_kind, str):
3376 continue
3377 if track_kind.lower() not in ('captions', 'subtitles'):
3378 continue
3379 track_url = urljoin(base_url, track.get('file'))
3380 if not track_url:
3381 continue
3382 subtitles.setdefault(track.get('label') or 'en', []).append({
3383 'url': self._proto_relative_url(track_url)
3384 })
3385
3386 entry = {
3387 'id': this_video_id,
3388 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3389 'description': clean_html(video_data.get('description')),
3390 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3391 'timestamp': int_or_none(video_data.get('pubdate')),
3392 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3393 'subtitles': subtitles,
3394 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3395 'genre': clean_html(video_data.get('genre')),
3396 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3397 'season_number': int_or_none(video_data.get('season')),
3398 'episode_number': int_or_none(video_data.get('episode')),
3399 'release_year': int_or_none(video_data.get('releasedate')),
3400 'age_limit': int_or_none(video_data.get('age_restriction')),
3401 }
3402 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3403 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3404 entry.update({
3405 '_type': 'url_transparent',
3406 'url': formats[0]['url'],
3407 })
3408 else:
3409 entry['formats'] = formats
3410 entries.append(entry)
3411 if len(entries) == 1:
3412 return entries[0]
3413 else:
3414 return self.playlist_result(entries)
3415
3416 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3417 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3418 urls = set()
3419 formats = []
3420 for source in jwplayer_sources_data:
3421 if not isinstance(source, dict):
3422 continue
3423 source_url = urljoin(
3424 base_url, self._proto_relative_url(source.get('file')))
3425 if not source_url or source_url in urls:
3426 continue
3427 urls.add(source_url)
3428 source_type = source.get('type') or ''
3429 ext = mimetype2ext(source_type) or determine_ext(source_url)
3430 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3431 formats.extend(self._extract_m3u8_formats(
3432 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3433 m3u8_id=m3u8_id, fatal=False))
3434 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3435 formats.extend(self._extract_mpd_formats(
3436 source_url, video_id, mpd_id=mpd_id, fatal=False))
3437 elif ext == 'smil':
3438 formats.extend(self._extract_smil_formats(
3439 source_url, video_id, fatal=False))
3440 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3441 elif source_type.startswith('audio') or ext in (
3442 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3443 formats.append({
3444 'url': source_url,
3445 'vcodec': 'none',
3446 'ext': ext,
3447 })
3448 else:
3449 format_id = str_or_none(source.get('label'))
3450 height = int_or_none(source.get('height'))
3451 if height is None and format_id:
3452 # Often no height is provided, but there is a label in
3453 # a format like "1080p", "720p SD", or 1080.
3454 height = parse_resolution(format_id).get('height')
3455 a_format = {
3456 'url': source_url,
3457 'width': int_or_none(source.get('width')),
3458 'height': height,
3459 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3460 'filesize': int_or_none(source.get('filesize')),
3461 'ext': ext,
3462 'format_id': format_id
3463 }
3464 if source_url.startswith('rtmp'):
3465 a_format['ext'] = 'flv'
3466 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3467 # of jwplayer.flash.swf
3468 rtmp_url_parts = re.split(
3469 r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3470 if len(rtmp_url_parts) == 3:
3471 rtmp_url, prefix, play_path = rtmp_url_parts
3472 a_format.update({
3473 'url': rtmp_url,
3474 'play_path': prefix + play_path,
3475 })
3476 if rtmp_params:
3477 a_format.update(rtmp_params)
3478 formats.append(a_format)
3479 return formats
3480
3481 def _live_title(self, name):
3482 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3483 return name
3484
3485 def _int(self, v, name, fatal=False, **kwargs):
3486 res = int_or_none(v, **kwargs)
3487 if res is None:
3488 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3489 if fatal:
3490 raise ExtractorError(msg)
3491 else:
3492 self.report_warning(msg)
3493 return res
3494
3495 def _float(self, v, name, fatal=False, **kwargs):
3496 res = float_or_none(v, **kwargs)
3497 if res is None:
3498 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3499 if fatal:
3500 raise ExtractorError(msg)
3501 else:
3502 self.report_warning(msg)
3503 return res
3504
3505 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3506 path='/', secure=False, discard=False, rest={}, **kwargs):
3507 cookie = http.cookiejar.Cookie(
3508 0, name, value, port, port is not None, domain, True,
3509 domain.startswith('.'), path, True, secure, expire_time,
3510 discard, None, None, rest)
3511 self.cookiejar.set_cookie(cookie)
3512
3513 def _get_cookies(self, url):
3514 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3515 return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3516
3517 def _apply_first_set_cookie_header(self, url_handle, cookie):
3518 """
3519 Apply first Set-Cookie header instead of the last. Experimental.
3520
3521 Some sites (e.g. [1-3]) may serve two cookies under the same name
3522 in the Set-Cookie header and expect the first (old) one to be set
3523 rather than the second (new) one. However, per RFC 6265 the newer
3524 cookie should be set into the cookie store, which is what actually happens.
3525 We work around this issue by manually resetting the cookie to
3526 the first one.
3527 1. https://new.vk.com/
3528 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3529 3. https://learning.oreilly.com/
3530 """
3531 for header, cookies in url_handle.headers.items():
3532 if header.lower() != 'set-cookie':
3533 continue
3534 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3535 cookie_value = re.search(
3536 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3537 if cookie_value:
3538 value, domain = cookie_value.groups()
3539 self._set_cookie(domain, cookie, value)
3540 break
3541
3542 @classmethod
3543 def get_testcases(cls, include_onlymatching=False):
3544 # Do not look in super classes
3545 t = vars(cls).get('_TEST')
3546 if t:
3547 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3548 tests = [t]
3549 else:
3550 tests = vars(cls).get('_TESTS', [])
3551 for t in tests:
3552 if not include_onlymatching and t.get('only_matching', False):
3553 continue
3554 t['name'] = cls.ie_key()
3555 yield t
3556 if getattr(cls, '__wrapped__', None):
3557 yield from cls.__wrapped__.get_testcases(include_onlymatching)
3558
3559 @classmethod
3560 def get_webpage_testcases(cls):
3561 tests = vars(cls).get('_WEBPAGE_TESTS', [])
3562 for t in tests:
3563 t['name'] = cls.ie_key()
3564 yield t
3565 if getattr(cls, '__wrapped__', None):
3566 yield from cls.__wrapped__.get_webpage_testcases()
3567
3568 @classproperty(cache=True)
3569 def age_limit(cls):
3570 """Get age limit from the testcases"""
3571 return max(traverse_obj(
3572 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3573 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3574
3575 @classproperty(cache=True)
3576 def _RETURN_TYPE(cls):
3577 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3578 tests = tuple(cls.get_testcases(include_onlymatching=False))
3579 if not tests:
3580 return None
3581 elif not any(k.startswith('playlist') for test in tests for k in test):
3582 return 'video'
3583 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3584 return 'playlist'
3585 return 'any'
3586
3587 @classmethod
3588 def is_single_video(cls, url):
3589 """Returns whether the URL is of a single video, None if unknown"""
3590 if cls.suitable(url):
3591 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3592
3593 @classmethod
3594 def is_suitable(cls, age_limit):
3595 """Test whether the extractor is generally suitable for the given age limit"""
3596 return not age_restricted(cls.age_limit, age_limit)
3597
3598 @classmethod
3599 def description(cls, *, markdown=True, search_examples=None):
3600 """Description of the extractor"""
3601 desc = ''
3602 if cls._NETRC_MACHINE:
3603 if markdown:
3604 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3605 else:
3606 desc += f' [{cls._NETRC_MACHINE}]'
3607 if cls.IE_DESC is False:
3608 desc += ' [HIDDEN]'
3609 elif cls.IE_DESC:
3610 desc += f' {cls.IE_DESC}'
3611 if cls.SEARCH_KEY:
3612 desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3613 if search_examples:
3614 _COUNTS = ('', '5', '10', 'all')
3615 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3616 if not cls.working():
3617 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3618
3619 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3620 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3621 return f'{name}:{desc}' if desc else name
3622
3623 def extract_subtitles(self, *args, **kwargs):
3624 if (self.get_param('writesubtitles', False)
3625 or self.get_param('listsubtitles')):
3626 return self._get_subtitles(*args, **kwargs)
3627 return {}
3628
3629 def _get_subtitles(self, *args, **kwargs):
3630 raise NotImplementedError('This method must be implemented by subclasses')
3631
3632 class CommentsDisabled(Exception):
3633 """Raise in _get_comments if comments are disabled for the video"""
3634
3635 def extract_comments(self, *args, **kwargs):
3636 if not self.get_param('getcomments'):
3637 return None
3638 generator = self._get_comments(*args, **kwargs)
3639
3640 def extractor():
3641 comments = []
3642 interrupted = True
3643 try:
3644 while True:
3645 comments.append(next(generator))
3646 except StopIteration:
3647 interrupted = False
3648 except KeyboardInterrupt:
3649 self.to_screen('Interrupted by user')
3650 except self.CommentsDisabled:
3651 return {'comments': None, 'comment_count': None}
3652 except Exception as e:
3653 if self.get_param('ignoreerrors') is not True:
3654 raise
3655 self._downloader.report_error(e)
3656 comment_count = len(comments)
3657 self.to_screen(f'Extracted {comment_count} comments')
3658 return {
3659 'comments': comments,
3660 'comment_count': None if interrupted else comment_count
3661 }
3662 return extractor
3663
3664 def _get_comments(self, *args, **kwargs):
3665 raise NotImplementedError('This method must be implemented by subclasses')
3666
3667 @staticmethod
3668 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3669 """ Merge subtitle items for one language. Items with duplicated URLs/data
3670 will be dropped. """
3671 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3672 ret = list(subtitle_list1)
3673 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3674 return ret
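# Editor's illustration (hypothetical values):
#   _merge_subtitle_items([{'url': 'http://a/en.vtt'}],
#                         [{'url': 'http://a/en.vtt'}, {'url': 'http://b/en.srt'}])
# returns [{'url': 'http://a/en.vtt'}, {'url': 'http://b/en.srt'}] --
# the duplicate (same url/data pair) from the second list is dropped.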
3675
3676 @classmethod
3677 def _merge_subtitles(cls, *dicts, target=None):
3678 """ Merge subtitle dictionaries, language by language. """
3679 if target is None:
3680 target = {}
3681 for d in dicts:
3682 for lang, subs in d.items():
3683 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3684 return target
3685
3686 def extract_automatic_captions(self, *args, **kwargs):
3687 if (self.get_param('writeautomaticsub', False)
3688 or self.get_param('listsubtitles')):
3689 return self._get_automatic_captions(*args, **kwargs)
3690 return {}
3691
3692 def _get_automatic_captions(self, *args, **kwargs):
3693 raise NotImplementedError('This method must be implemented by subclasses')
3694
3695 @functools.cached_property
3696 def _cookies_passed(self):
3697 """Whether cookies have been passed to YoutubeDL"""
3698 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3699
3700 def mark_watched(self, *args, **kwargs):
3701 if not self.get_param('mark_watched', False):
3702 return
3703 if (self.supports_login() and self._get_login_info()[0] is not None) or self._cookies_passed:
3704 self._mark_watched(*args, **kwargs)
3705
3706 def _mark_watched(self, *args, **kwargs):
3707 raise NotImplementedError('This method must be implemented by subclasses')
3708
3709 def geo_verification_headers(self):
3710 headers = {}
3711 geo_verification_proxy = self.get_param('geo_verification_proxy')
3712 if geo_verification_proxy:
3713 headers['Ytdl-request-proxy'] = geo_verification_proxy
3714 return headers
3715
3716 @staticmethod
3717 def _generic_id(url):
3718 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3719
3720 def _generic_title(self, url='', webpage='', *, default=None):
3721 return (self._og_search_title(webpage, default=None)
3722 or self._html_extract_title(webpage, default=None)
3723 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3724 or default)
3725
3726 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3727 if not duration:
3728 return
3729 chapter_list = [{
3730 'start_time': start_function(chapter),
3731 'title': title_function(chapter),
3732 } for chapter in chapter_list or []]
3733 if strict:
3734 warn = self.report_warning
3735 else:
3736 warn = self.write_debug
3737 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3738
3739 chapters = [{'start_time': 0}]
3740 for idx, chapter in enumerate(chapter_list):
3741 if chapter['start_time'] is None:
3742 warn(f'Incomplete chapter {idx}')
3743 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3744 chapters.append(chapter)
3745 elif chapter not in chapters:
3746 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3747 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3748 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3749 return chapters[1:]
3750
3751 def _extract_chapters_from_description(self, description, duration):
3752 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3753 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3754 return self._extract_chapters_helper(
3755 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3756 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3757 duration=duration, strict=False) or self._extract_chapters_helper(
3758 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3759 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3760 duration=duration, strict=False)
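# Editor's illustration (hypothetical description, duration=300):
#   '0:00 Intro\n1:30 Main part\n4:00 Outro'
# yields chapters starting at 0, 90 and 240 seconds titled 'Intro',
# 'Main part' and 'Outro'; the second pattern covers the reversed
# 'Title 0:00' layout.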
3761
3762 @staticmethod
3763 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3764 all_known = all(map(
3765 lambda x: x is not None,
3766 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3767 return (
3768 'private' if is_private
3769 else 'premium_only' if needs_premium
3770 else 'subscriber_only' if needs_subscription
3771 else 'needs_auth' if needs_auth
3772 else 'unlisted' if is_unlisted
3773 else 'public' if all_known
3774 else None)
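# Editor's illustration (hypothetical flags):
#   _availability(is_private=False, needs_premium=False,
#                 needs_subscription=False, needs_auth=False, is_unlisted=True)
# returns 'unlisted'; with every flag False it returns 'public', and if
# any flag is still None (unknown) with no positive flag set, it returns None.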
3775
3776 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3777 '''
3778 @returns A list of values for the extractor argument given by "key"
3779 or "default" if no such key is present
3780 @param default The default value to return when the key is not present (default: [])
3781 @param casesense When false, the values are converted to lower case
3782 '''
3783 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3784 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3785 if val is None:
3786 return [] if default is NO_DEFAULT else default
3787 return list(val) if casesense else [x.lower() for x in val]
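# Editor's illustration (hypothetical CLI invocation): with
#   --extractor-args "youtube:player_client=android,web"
# a YouTube extractor calling self._configuration_arg('player_client')
# gets ['android', 'web'] (lower-cased unless casesense=True).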
3788
3789 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3790 if not playlist_id or not video_id:
3791 return not video_id
3792
3793 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3794 if no_playlist is not None:
3795 return not no_playlist
3796
3797 video_id = '' if video_id is True else f' {video_id}'
3798 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3799 if self.get_param('noplaylist'):
3800 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3801 return False
3802 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3803 return True
3804
3805 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3806 RetryManager.report_retry(
3807 err, _count or int(fatal), _retries,
3808 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3809 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3810
3811 def RetryManager(self, **kwargs):
3812 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3813
3814 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3815 display_id = traverse_obj(info_dict, 'display_id', 'id')
3816 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3817 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3818 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3819
3820 @classmethod
3821 def extract_from_webpage(cls, ydl, url, webpage):
3822 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3823 else ydl.get_info_extractor(cls.ie_key()))
3824 for info in ie._extract_from_webpage(url, webpage) or []:
3825 # url = None since we do not want to set (webpage/original)_url
3826 ydl.add_default_extra_info(info, ie, None)
3827 yield info
3828
3829 @classmethod
3830 def _extract_from_webpage(cls, url, webpage):
3831 for embed_url in orderedSet(
3832 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3833 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3834
3835 @classmethod
3836 def _extract_embed_urls(cls, url, webpage):
3837 """@returns all the embed urls on the webpage"""
3838 if '_EMBED_URL_RE' not in cls.__dict__:
3839 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3840 for idx, regex in enumerate(cls._EMBED_REGEX):
3841 assert regex.count('(?P<url>') == 1, \
3842 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3843 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3844
3845 for regex in cls._EMBED_URL_RE:
3846 for mobj in regex.finditer(webpage):
3847 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3848 if cls._VALID_URL is False or cls.suitable(embed_url):
3849 yield embed_url
3850
3851 class StopExtraction(Exception):
3852 pass
3853
3854 @classmethod
3855 def _extract_url(cls, webpage): # TODO: Remove
3856 """Only for compatibility with some older extractors"""
3857 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3858
3859 @classmethod
3860 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3861 if plugin_name:
3862 mro = inspect.getmro(cls)
3863 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3864 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3865 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3866 while getattr(super_class, '__wrapped__', None):
3867 super_class = super_class.__wrapped__
3868 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3869 _PLUGIN_OVERRIDES[super_class].append(cls)
3870
3871 return super().__init_subclass__(**kwargs)
3872
3873
3874class SearchInfoExtractor(InfoExtractor):
3875 """
3876 Base class for paged search queries extractors.
3877 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3878 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3879 """
3880
3881 _MAX_RESULTS = float('inf')
3882 _RETURN_TYPE = 'playlist'
3883
3884 @classproperty
3885 def _VALID_URL(cls):
3886 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
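# Editor's illustration: with a hypothetical _SEARCH_KEY = 'ytsearch',
# 'ytsearch5:cute cats' matches with prefix='5' and query='cute cats';
# an empty prefix means a single result and 'all' requests up to
# _MAX_RESULTS (see _real_extract below).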
3887
3888 def _real_extract(self, query):
3889 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3890 if prefix == '':
3891 return self._get_n_results(query, 1)
3892 elif prefix == 'all':
3893 return self._get_n_results(query, self._MAX_RESULTS)
3894 else:
3895 n = int(prefix)
3896 if n <= 0:
3897 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3898 elif n > self._MAX_RESULTS:
3899 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3900 n = self._MAX_RESULTS
3901 return self._get_n_results(query, n)
3902
3903 def _get_n_results(self, query, n):
3904 """Get a specified number of results for a query.
3905 Either this function or _search_results must be overridden by subclasses"""
3906 return self.playlist_result(
3907 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3908 query, query)
3909
3910 def _search_results(self, query):
3911 """Returns an iterator of search results"""
3912 raise NotImplementedError('This method must be implemented by subclasses')
3913
3914 @classproperty
3915 def SEARCH_KEY(cls):
3916 return cls._SEARCH_KEY
3917
3918
3919class UnsupportedURLIE(InfoExtractor):
3920 _VALID_URL = '.*'
3921 _ENABLED = False
3922 IE_DESC = False
3923
3924 def _real_extract(self, url):
3925 raise UnsupportedError(url)
3926
3927
3928_PLUGIN_OVERRIDES = collections.defaultdict(list)