import base64
import collections
import getpass
import hashlib
import http.client
import http.cookiejar
import http.cookies
import inspect
import itertools
import json
import math
import netrc
import os
import random
import re
import sys
import time
import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree

from ..compat import functools  # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
    IDENTITY,
    JSON_LD_RE,
    NO_DEFAULT,
    ExtractorError,
    FormatSorter,
    GeoRestrictedError,
    GeoUtils,
    HEADRequest,
    LenientJSONDecoder,
    RegexNotFoundError,
    RetryManager,
    UnsupportedError,
    age_restricted,
    base_url,
    bug_reports_message,
    classproperty,
    clean_html,
    deprecation_warning,
    determine_ext,
    dict_get,
    encode_data_uri,
    error_to_compat_str,
    extract_attributes,
    filter_dict,
    fix_xml_ampersands,
    float_or_none,
    format_field,
    int_or_none,
    join_nonempty,
    js_to_json,
    mimetype2ext,
    network_exceptions,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    sanitize_filename,
    sanitize_url,
    sanitized_Request,
    smuggle_url,
    str_or_none,
    str_to_int,
    strip_or_none,
    traverse_obj,
    truncate_string,
    try_call,
    try_get,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    url_basename,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor:
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped. Set to an empty string if video has
                    no title as opposed to "None" which signifies that the
                    extractor failed to obtain a title

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        The mandatory URL representing the media:
                                     for plain file media - HTTP URL of this file,
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH
                                         - HTTP URL to plain file media (in case of
                                           unfragmented media)
                                         - URL of the MPD manifest or base URL
                                           representing the media if MPD manifest
                                           is parsed from a string (in case of
                                           fragmented media)
                                     for MSS - URL of the ISM manifest.
                    * request_data  Data to send in POST request to the URL
                    * manifest_url
                                 The URL of the manifest file in case of
                                 fragmented media:
                                     for HLS - URL of the M3U8 master playlist,
                                     for HDS - URL of the F4M manifest,
                                     for DASH - URL of the MPD manifest,
                                     for MSS - URL of the ISM manifest.
                    * manifest_stream_number  (For internal use only)
                                 The index of the stream in the manifest file
                    * ext        Will be calculated from URL if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * aspect_ratio  Aspect ratio of the video, if known
                                 Automatically calculated from width and height
                    * resolution Textual description of width and height
                                 Automatically calculated from width and height
                    * dynamic_range  The dynamic range of the video. One of:
162 "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV"
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * audio_channels  Number of audio channels
                    * vbr        Average video bitrate in KBit/s
                    * fps        Frame rate
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case. One of "http", "https" or
                                 one of the protocols defined in downloader.PROTOCOL_MAP
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either a url
                                 or a path. If a url is present, it should be
                                 considered by a client. Otherwise, both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
                    * is_from_start  Is a live format that can be downloaded
                                 from the start. Boolean
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                                 < -1000 to hide the format (if there is
                                 another one which is strictly better)
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * source_preference  Order number for this video source
                                 (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
                    * stretched_ratio  If given and not 1, indicates that the
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
                    * has_drm    The format has DRM and cannot be downloaded. Boolean
                    * extra_param_to_segment_url  A query string to append to each
                                 fragment's URL, or to update each existing query string
                                 with. Only applied by the native HLS/DASH downloaders.
                    * hls_aes    A dictionary of HLS AES-128 decryption information
                                 used by the native HLS downloader to override the
                                 values in the media playlist when an '#EXT-X-KEY' tag
                                 is present in the playlist:
                                 * uri  The URI from which the key will be downloaded
                                 * key  The key (as hex) used to decrypt fragments.
                                        If `key` is given, any key URI will be ignored
                                 * iv   The IV (as hex) used to decrypt fragments
                    * downloader_options  A dictionary of downloader options
                                 (For internal use only)
                                 * http_chunk_size  Chunk size for HTTP downloads
                                 * ffmpeg_args  Extra arguments for ffmpeg downloader
                    RTMP formats can also have the additional fields: page_url,
                    app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn,
                    rtmp_protocol, rtmp_real_time
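
                    As an illustrative sketch only (all values below are
                    hypothetical, not taken from any real site), a single
                    progressive HTTP format entry could look like:

                        {
                            'url': 'https://example.com/video-720.mp4',
                            'format_id': '720p',
                            'ext': 'mp4',
                            'protocol': 'https',
                            'width': 1280,
                            'height': 720,
                            'vcodec': 'avc1.64001f',
                            'acodec': 'mp4a.40.2',
                            'tbr': 1500,
                        }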

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    direct:         True if a direct video file was given (must only be set by GenericIE)
    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
                        * "http_headers" (dict) - HTTP headers for the request
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    timestamp:      UNIX timestamp of the moment the video was uploaded
    upload_date:    Video upload date in UTC (YYYYMMDD).
                    If not explicitly set, calculated from timestamp
    release_timestamp: UNIX timestamp of the moment the video was released.
                    If it is not clear whether to use timestamp or this, use the former
    release_date:   The date (YYYYMMDD) when the video was released in UTC.
                    If not explicitly set, calculated from release_timestamp
    modified_timestamp: UNIX timestamp of the moment the video was last modified.
    modified_date:  The date (YYYYMMDD) when the video was last modified in UTC.
                    If not explicitly set, calculated from modified_timestamp
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    channel:        Full name of the channel the video is uploaded on.
                    Note that channel fields may or may not repeat uploader
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
    channel_follower_count: Number of followers of the channel.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    It can optionally also have:
                        * "name": Name or description of the subtitles
                        * "http_headers": A dictionary of additional HTTP headers
                          to add to the request.
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles'; contains automatically generated
                    captions instead of normal subtitles
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    concurrent_view_count: How many users are currently watching the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "author_thumbnail" - The thumbnail of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
                        * "like_count" - Number of positive ratings of the comment
                        * "dislike_count" - Number of negative ratings of the comment
                        * "is_favorited" - Whether the comment is marked as
                                           favorite by the video uploader
                        * "author_is_uploader" - Whether the comment is made by
                                                 the video uploader
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to yt-dlp it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    cast:           A list of the video cast
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    was_live:       True, False, or None (=unknown). Whether this video was
                    originally a live stream.
    live_status:    None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live',
                    or 'post_live' (was live, but VOD is not yet processed)
                    If absent, automatically set from is_live, was_live
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
    heatmap:        A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the data point in seconds
                        * "end_time" - The end time of the data point in seconds
                        * "value" - The normalized value of the data point (float between 0 and 1)
    playable_in_embed: Whether this video is allowed to play in embedded
                    players on other sites. Can be True (=always allowed),
                    False (=never allowed), None (=unknown), or a string
                    specifying the criteria for embedability; e.g. 'whitelist'
    availability:   Under what condition the video is available. One of
                    'private', 'premium_only', 'subscriber_only', 'needs_auth',
                    'unlisted' or 'public'. Use 'InfoExtractor._availability'
                    to set it
    _old_archive_ids: A list of old archive ids needed for backward compatibility
    _format_sort_fields: A list of fields to use for sorting formats
    __post_extractor: A function to be called just before the metadata is
                    written to either disk, logger or console. The function
                    must return a dict which will be added to the info_dict.
                    This is useful for additional information that is
                    time-consuming to extract. Note that the fields thus
                    extracted will not be available to the output template and
                    match_filter. So, only "comments" and "comment_count" are
                    currently allowed to be extracted via this method.
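
                    A minimal sketch (fetch_comments is a hypothetical helper,
                    not part of this class):

                        info_dict['__post_extractor'] = (
                            lambda: {'comments': fetch_comments(video_id)})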

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    series_id:      Id of the series or programme the video episode belongs to, as a unicode string.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.
    composer:       Composer of the piece

    The following fields should only be set for clips that should be cut from the original video:

    section_start:  Start time of the section in seconds
    section_end:    End time of the section in seconds

    The following fields should only be set for storyboards:
    rows:           Number of rows in each storyboard fragment, as an integer
    columns:        Number of columns in each storyboard fragment, as an integer

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "id", "title", and any other relevant
    attributes with the same semantics as videos (see above).

    It can also have the following optional fields:

    playlist_count: The total number of videos in a playlist. If not given,
                    YoutubeDL tries to calculate it from "entries"
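
    For illustration, a sketch of a playlist result (all values below are
    hypothetical):

        {
            '_type': 'playlist',
            'id': 'album-123',
            'title': 'Some Album',
            'entries': [<video info dicts as described above>],
        }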


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.
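
    For example (the URL is hypothetical), such an entry could be:

        {'_type': 'url', 'url': 'https://example.com/watch?v=abc123', 'ie_key': 'Generic'}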


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this should also be added to the list of extractors and
    should define a _VALID_URL regexp and re-define the _real_extract() and
    (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
    signature is preserved and that this function imports everything it needs
    (except other extractors), so that lazy_extractors works correctly.
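
    As a rough sketch (the name, URL pattern and page layout are hypothetical),
    a minimal subclass could look like:

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._html_extract_title(webpage),
                    'url': self._og_search_video_url(webpage),
                }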

    Subclasses can define a list of _EMBED_REGEX, which will be searched for in
    the HTML of Generic webpages. It may also override _extract_embed_urls
    or _extract_from_webpage as necessary. While these are normally classmethods,
    _extract_from_webpage is allowed to be an instance method.

    _extract_from_webpage may raise self.StopExtraction() to stop further
    processing of the webpage and obtain exclusive rights to it. This is useful
    when the extractor cannot reliably be matched using just the URL,
    e.g. invidious/peertube instances

    Embed-only extractors can be defined by setting _VALID_URL = False.

    To support username + password (or netrc) login, the extractor must define a
    _NETRC_MACHINE and re-define _perform_login(username, password) and
    (optionally) _initialize_pre_login() methods. The _perform_login method will
    be called between _initialize_pre_login and _real_initialize if credentials
    are passed by the user. In cases where it is necessary to have the login
    process as part of the extraction rather than initialization, _perform_login
    can be left undefined.
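
    A sketch of such a login hook (the endpoint and payload are hypothetical):

        class ExampleIE(InfoExtractor):
            _NETRC_MACHINE = 'example'

            def _perform_login(self, username, password):
                self._download_json(
                    'https://example.com/api/login', None, 'Logging in',
                    data=json.dumps({'user': username, 'pass': password}).encode())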

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country.

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled.

    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
    to _GEO_COUNTRIES.

    The _ENABLED attribute should be set to False for IEs that
    are disabled by default and must be explicitly enabled.

    The _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True
    _ENABLED = True
    _NETRC_MACHINE = None
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
        password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
        return {
            None: '',
            'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
            'password': f'Use {password_hint}',
            'cookies': (
                'Use --cookies-from-browser or --cookies for the authentication. '
                'See https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp for how to manually pass cookies'),
        }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies']

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (a YoutubeDL instance).
        If a downloader is not passed during initialization,
        it must be set using "set_downloader()" before "extract()" is called"""
        self._ready = False
        self._x_forwarded_for_ip = None
        self._printed_messages = set()
        self.set_downloader(downloader)

    @classmethod
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # This function must import everything it needs (except other extractors),
        # so that lazy_extractors works correctly
        return cls._match_valid_url(url) is not None

    @classmethod
    def _match_id(cls, url):
        return cls._match_valid_url(url).group('id')

    @classmethod
    def get_temp_id(cls, url):
        try:
            return cls._match_id(url)
        except (IndexError, AttributeError):
            return None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    @classmethod
    def supports_login(cls):
        return bool(cls._NETRC_MACHINE)

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._printed_messages = set()
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
        if not self._ready:
            self._initialize_pre_login()
            if self.supports_login():
                username, password = self._get_login_info()
                if username:
                    self._perform_login(username, password)
            elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE):
                self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}')
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:              List of geo unrestricted countries (similar
                                to _GEO_COUNTRIES)
        ip_blocks:              List of geo unrestricted IP blocks in CIDR notation
                                (similar to _GEO_IP_BLOCKS)
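
        For example, an extractor that only learns the unrestricted countries
        during extraction (country codes here are purely illustrative) could
        call:

            self._initialize_geo_bypass({'countries': ['DE', 'FR']})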

        """
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self.get_param('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self.get_param('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For')
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self.get_param('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                self._downloader.write_debug(
                    f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For')

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    self.to_screen('Extracting URL: %s' % (
                        url if self.get_param('verbose') else truncate_string(url, 100, 20)))
                    ie_result = self._real_extract(url)
                    if ie_result is None:
                        return None
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    subtitles = ie_result.get('subtitles') or {}
                    if 'no-live-chat' in self.get_param('compat_opts'):
                        for lang in ('live_chat', 'comments', 'danmaku'):
                            subtitles.pop(lang, None)
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except UnsupportedError:
            raise
        except ExtractorError as e:
            e.video_id = e.video_id or self.get_temp_id(url)
            e.ie = e.ie or self.IE_NAME
            e.traceback = e.traceback or sys.exc_info()[2]
            raise
        except http.client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self.get_param('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self.get_param('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets a YoutubeDL instance as the downloader for this IE."""
        self._downloader = downloader

    @property
    def cache(self):
        return self._downloader.cache

    @property
    def cookiejar(self):
        return self._downloader.cookiejar

    def _initialize_pre_login(self):
        """ Initialization before login. Redefine in subclasses."""
        pass

    def _perform_login(self, username, password):
        """ Login with username and password. Redefine in subclasses."""
        pass

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        raise NotImplementedError('This method must be implemented by subclasses')

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @classproperty
    def IE_NAME(cls):
        return cls.__name__[:-2]

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        assert isinstance(err, urllib.error.HTTPError)
        if expected_status is None:
            return False
        elif callable(expected_status):
            return expected_status(err.code) is True
        else:
            return err.code in variadic(expected_status)

    def _create_request(self, url_or_request, data=None, headers=None, query=None):
        if isinstance(url_or_request, urllib.request.Request):
            return update_Request(url_or_request, data=data, headers=headers, query=query)
        if query:
            url_or_request = update_url_query(url_or_request, query)
        return sanitized_Request(url_or_request, data, headers or {})

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        if not self._downloader._first_webpage_request:
            sleep_interval = self.get_param('sleep_interval_requests') or 0
            if sleep_interval > 0:
                self.to_screen('Sleeping %s seconds ...' % sleep_interval)
                time.sleep(sleep_interval)
        else:
            self._downloader._first_webpage_request = False

        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(str(note))
            else:
                self.to_screen(f'{video_id}: {note}')

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            headers = (headers or {}).copy()
            headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

        try:
            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
        except network_exceptions as err:
            if isinstance(err, urllib.error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = f'{errnote}: {error_to_compat_str(err)}'
            if fatal:
                raise ExtractorError(errmsg, cause=err)
            else:
                self.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True,
                                 encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        Arguments:
        url_or_request -- plain text URL as a string or
            a urllib.request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows accepting failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
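
            For example (a sketch): expected_status=404 accepts 404 responses,
            while expected_status=lambda c: c in (403, 404) accepts both
            403 and 404.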
880 """
881
882 # Strip hashes from the URL (#1038)
883 if isinstance(url_or_request, str):
884 url_or_request = url_or_request.partition('#')[0]
885
886 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
887 if urlh is False:
888 assert not fatal
889 return False
890 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
891 return (content, urlh)
892
893 @staticmethod
894 def _guess_encoding_from_content(content_type, webpage_bytes):
895 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
896 if m:
897 encoding = m.group(1)
898 else:
899 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
900 webpage_bytes[:1024])
901 if m:
902 encoding = m.group(1).decode('ascii')
903 elif webpage_bytes.startswith(b'\xff\xfe'):
904 encoding = 'utf-16'
905 else:
906 encoding = 'utf-8'
907
908 return encoding
909
910 def __check_blocked(self, content):
911 first_block = content[:512]
912 if ('<title>Access to this site is blocked</title>' in content
913 and 'Websense' in first_block):
914 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
915 blocked_iframe = self._html_search_regex(
916 r'<iframe src="([^"]+)"', content,
917 'Websense information URL', default=None)
918 if blocked_iframe:
919 msg += ' Visit %s for more details' % blocked_iframe
920 raise ExtractorError(msg, expected=True)
921 if '<title>The URL you requested has been blocked</title>' in first_block:
922 msg = (
923 'Access to this webpage has been blocked by Indian censorship. '
924 'Use a VPN or proxy server (with --proxy) to route around it.')
925 block_msg = self._html_search_regex(
926 r'</h1><p>(.*?)</p>',
927 content, 'block message', default=None)
928 if block_msg:
929 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
930 raise ExtractorError(msg, expected=True)
931 if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
932 and 'blocklist.rkn.gov.ru' in content):
933 raise ExtractorError(
934 'Access to this webpage has been blocked by decision of the Russian government. '
935 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
936 expected=True)
937
938 def _request_dump_filename(self, url, video_id):
939 basen = f'{video_id}_{url}'
940 trim_length = self.get_param('trim_file_name') or 240
941 if len(basen) > trim_length:
942 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
943 basen = basen[:trim_length - len(h)] + h
944 filename = sanitize_filename(f'{basen}.dump', restricted=True)
945 # Working around MAX_PATH limitation on Windows (see
946 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
947 if compat_os_name == 'nt':
948 absfilepath = os.path.abspath(filename)
949 if len(absfilepath) > 259:
950 filename = fR'\\?\{absfilepath}'
951 return filename
952
953 def __decode_webpage(self, webpage_bytes, encoding, headers):
954 if not encoding:
955 encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
956 try:
957 return webpage_bytes.decode(encoding, 'replace')
958 except LookupError:
959 return webpage_bytes.decode('utf-8', 'replace')
960
961 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
962 webpage_bytes = urlh.read()
963 if prefix is not None:
964 webpage_bytes = prefix + webpage_bytes
965 if self.get_param('dump_intermediate_pages', False):
966 self.to_screen('Dumping request to ' + urlh.geturl())
967 dump = base64.b64encode(webpage_bytes).decode('ascii')
968 self._downloader.to_screen(dump)
969 if self.get_param('write_pages'):
970 filename = self._request_dump_filename(urlh.geturl(), video_id)
971 self.to_screen(f'Saving request to {filename}')
972 with open(filename, 'wb') as outf:
973 outf.write(webpage_bytes)
974
975 content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
976 self.__check_blocked(content)
977
978 return content
979
980 def __print_error(self, errnote, fatal, video_id, err):
981 if fatal:
982 raise ExtractorError(f'{video_id}: {errnote}', cause=err)
983 elif errnote:
984 self.report_warning(f'{video_id}: {errnote}: {err}')
985
986 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None):
987 if transform_source:
988 xml_string = transform_source(xml_string)
989 try:
990 return compat_etree_fromstring(xml_string.encode('utf-8'))
991 except xml.etree.ElementTree.ParseError as ve:
992 self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve)
993
994 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs):
995 try:
996 return json.loads(
997 json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
998 except ValueError as ve:
999 self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve)
1000
1001 def _parse_socket_response_as_json(self, data, *args, **kwargs):
1002 return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs)
1003
1004 def __create_download_methods(name, parser, note, errnote, return_value):
1005
1006 def parse(ie, content, *args, errnote=errnote, **kwargs):
1007 if parser is None:
1008 return content
1009 if errnote is False:
1010 kwargs['errnote'] = errnote
1011 # parser is fetched by name so subclasses can override it
1012 return getattr(ie, parser)(content, *args, **kwargs)
1013
1014 def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1015 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1016 res = self._download_webpage_handle(
1017 url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding,
1018 data=data, headers=headers, query=query, expected_status=expected_status)
1019 if res is False:
1020 return res
1021 content, urlh = res
1022 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh
1023
1024 def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
1025 fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
1026 if self.get_param('load_pages'):
1027 url_or_request = self._create_request(url_or_request, data, headers, query)
1028 filename = self._request_dump_filename(url_or_request.full_url, video_id)
1029 self.to_screen(f'Loading request from {filename}')
1030 try:
1031 with open(filename, 'rb') as dumpf:
1032 webpage_bytes = dumpf.read()
1033 except OSError as e:
1034 self.report_warning(f'Unable to load request from disk: {e}')
1035 else:
1036 content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
1037 return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote)
1038 kwargs = {
1039 'note': note,
1040 'errnote': errnote,
1041 'transform_source': transform_source,
1042 'fatal': fatal,
1043 'encoding': encoding,
1044 'data': data,
1045 'headers': headers,
1046 'query': query,
1047 'expected_status': expected_status,
1048 }
1049 if parser is None:
1050 kwargs.pop('transform_source')
1051 # The method is fetched by name so subclasses can override _download_..._handle
1052 res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs)
1053 return res if res is False else res[0]
1054
1055 def impersonate(func, name, return_value):
1056 func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
1057 func.__doc__ = f'''
1058 @param transform_source Apply this transformation before parsing
1059 @returns {return_value}
1060
1061 See _download_webpage_handle docstring for other arguments specification
1062 '''
1063
1064 impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')
1065 impersonate(download_content, f'_download_{name}', f'{return_value}')
1066 return download_handle, download_content
1067
1068 _download_xml_handle, _download_xml = __create_download_methods(
1069 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
1070 _download_json_handle, _download_json = __create_download_methods(
1071 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
1072 _download_socket_json_handle, _download_socket_json = __create_download_methods(
1073 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
1074 __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]
1075
1076 def _download_webpage(
1077 self, url_or_request, video_id, note=None, errnote=None,
1078 fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
1079 """
1080 Return the data of the page as a string.
1081
1082 Keyword arguments:
1083 tries -- number of tries
1084 timeout -- sleep interval between tries
1085
1086 See _download_webpage_handle docstring for other arguments specification.
1087 """
1088
1089 R''' # NB: These are unused; should they be deprecated?
1090 if tries != 1:
1091 self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
1092 if timeout is NO_DEFAULT:
1093 timeout = 5
1094 else:
1095 self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
1096 '''
1097
1098 try_count = 0
1099 while True:
1100 try:
1101 return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
1102 except http.client.IncompleteRead as e:
1103 try_count += 1
1104 if try_count >= tries:
1105 raise e
1106 self._sleep(timeout, video_id)
1107
1108 def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
1109 idstr = format_field(video_id, None, '%s: ')
1110 msg = f'[{self.IE_NAME}] {idstr}{msg}'
1111 if only_once:
1112 if f'WARNING: {msg}' in self._printed_messages:
1113 return
1114 self._printed_messages.add(f'WARNING: {msg}')
1115 self._downloader.report_warning(msg, *args, **kwargs)
1116
1117 def to_screen(self, msg, *args, **kwargs):
1118 """Print msg to screen, prefixing it with '[ie_name]'"""
1119 self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1120
1121 def write_debug(self, msg, *args, **kwargs):
1122 self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs)
1123
1124 def get_param(self, name, default=None, *args, **kwargs):
1125 if self._downloader:
1126 return self._downloader.params.get(name, default, *args, **kwargs)
1127 return default
1128
1129 def report_drm(self, video_id, partial=NO_DEFAULT):
1130 if partial is not NO_DEFAULT:
1131 self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial')
1132 self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
1133
1134 def report_extraction(self, id_or_name):
1135 """Report information extraction."""
1136 self.to_screen('%s: Extracting information' % id_or_name)
1137
1138 def report_download_webpage(self, video_id):
1139 """Report webpage download."""
1140 self.to_screen('%s: Downloading webpage' % video_id)
1141
1142 def report_age_confirmation(self):
1143 """Report attempt to confirm age."""
1144 self.to_screen('Confirming age')
1145
1146 def report_login(self):
1147 """Report attempt to log in."""
1148 self.to_screen('Logging in')
1149
1150 def raise_login_required(
1151 self, msg='This video is only available for registered users',
1152 metadata_available=False, method=NO_DEFAULT):
1153 if metadata_available and (
1154 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1155 self.report_warning(msg)
1156 return
1157 msg += format_field(self._login_hint(method), None, '. %s')
1158 raise ExtractorError(msg, expected=True)
1159
1160 def raise_geo_restricted(
1161 self, msg='This video is not available from your location due to geo restriction',
1162 countries=None, metadata_available=False):
1163 if metadata_available and (
1164 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1165 self.report_warning(msg)
1166 else:
1167 raise GeoRestrictedError(msg, countries=countries)
1168
1169 def raise_no_formats(self, msg, expected=False, video_id=None):
1170 if expected and (
1171 self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
1172 self.report_warning(msg, video_id)
1173 elif isinstance(msg, ExtractorError):
1174 raise msg
1175 else:
1176 raise ExtractorError(msg, expected=expected, video_id=video_id)
1177
1178 # Methods for following #608
1179 @staticmethod
1180 def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
1181 """Returns a URL that points to a page that should be processed"""
1182 if ie is not None:
1183 kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
1184 if video_id is not None:
1185 kwargs['id'] = video_id
1186 if video_title is not None:
1187 kwargs['title'] = video_title
1188 return {
1189 **kwargs,
1190 '_type': 'url_transparent' if url_transparent else 'url',
1191 'url': url,
1192 }
1193
1194 @classmethod
1195 def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
1196 getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
1197 return cls.playlist_result(
1198 (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
1199 playlist_id, playlist_title, **kwargs)
1200
1201 @staticmethod
1202 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
1203 """Returns a playlist"""
1204 if playlist_id:
1205 kwargs['id'] = playlist_id
1206 if playlist_title:
1207 kwargs['title'] = playlist_title
1208 if playlist_description is not None:
1209 kwargs['description'] = playlist_description
1210 return {
1211 **kwargs,
1212 '_type': 'multi_video' if multi_video else 'playlist',
1213 'entries': entries,
1214 }
1215
1216 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1217 """
1218 Perform a regex search on the given string, using a single or a list of
1219 patterns returning the first matching group.
1220 In case of failure return a default value or raise a WARNING or a
1221 RegexNotFoundError, depending on fatal, specifying the field name.
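
        Example (a sketch; pattern and input are hypothetical):
            self._search_regex(r'data-id="(\d+)"', webpage, 'video id')
            returns '42' when webpage contains data-id="42".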
1222 """
1223 if string is None:
1224 mobj = None
1225 elif isinstance(pattern, (str, re.Pattern)):
1226 mobj = re.search(pattern, string, flags)
1227 else:
1228 for p in pattern:
1229 mobj = re.search(p, string, flags)
1230 if mobj:
1231 break
1232
1233 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1234
1235 if mobj:
1236 if group is None:
1237 # return the first matching group
1238 return next(g for g in mobj.groups() if g is not None)
1239 elif isinstance(group, (list, tuple)):
1240 return tuple(mobj.group(g) for g in group)
1241 else:
1242 return mobj.group(group)
1243 elif default is not NO_DEFAULT:
1244 return default
1245 elif fatal:
1246 raise RegexNotFoundError('Unable to extract %s' % _name)
1247 else:
1248 self.report_warning('unable to extract %s' % _name + bug_reports_message())
1249 return None
1250
1251 def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
1252 contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs):
1253 """Searches string for the JSON object specified by start_pattern"""
1254 # NB: end_pattern is only used to reduce the size of the initial match
1255 if default is NO_DEFAULT:
1256 default, has_default = {}, False
1257 else:
1258 fatal, has_default = False, True
1259
1260 json_string = self._search_regex(
1261 rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
1262 string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
1263 if not json_string:
1264 return default
1265
1266 _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
1267 try:
1268 return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
1269 except ExtractorError as e:
1270 if fatal:
1271 raise ExtractorError(
1272 f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id)
1273 elif not has_default:
1274 self.report_warning(
1275 f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id)
1276 return default
1277
1278 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1279 """
1280 Like _search_regex, but strips HTML tags and unescapes entities.
1281 """
1282 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1283 if isinstance(res, tuple):
1284 return tuple(map(clean_html, res))
1285 return clean_html(res)
1286
1287 def _get_netrc_login_info(self, netrc_machine=None):
1288 username = None
1289 password = None
1290 netrc_machine = netrc_machine or self._NETRC_MACHINE
1291
1292 if self.get_param('usenetrc', False):
1293 try:
1294 netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
1295 if os.path.isdir(netrc_file):
1296 netrc_file = os.path.join(netrc_file, '.netrc')
1297 info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
1298 if info is not None:
1299 username = info[0]
1300 password = info[2]
1301 else:
1302 raise netrc.NetrcParseError(
1303 'No authenticators for %s' % netrc_machine)
1304 except (OSError, netrc.NetrcParseError) as err:
1305 self.report_warning(
1306 'parsing .netrc: %s' % error_to_compat_str(err))
1307
1308 return username, password
1309
1310 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1311 """
1312 Get the login info as (username, password)
1313 First look for the manually specified credentials using username_option
1314 and password_option as keys in params dictionary. If no such credentials
1315 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1316 value.
1317 If there's no info available, return (None, None)
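
        For reference, a matching line in the netrc file (machine name and
        credentials here are illustrative) looks like:
            machine example login myusername password mypassword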
1318 """
1319
1320 # Attempt to use provided username and password or .netrc data
1321 username = self.get_param(username_option)
1322 if username is not None:
1323 password = self.get_param(password_option)
1324 else:
1325 username, password = self._get_netrc_login_info(netrc_machine)
1326
1327 return username, password
1328
1329 def _get_tfa_info(self, note='two-factor verification code'):
1330 """
1331 Get the two-factor authentication info
1332 TODO - asking the user will be required for sms/phone verify
1333 currently just uses the command line option
1334 If there's no info available, return None
1335 """
1336
1337 tfa = self.get_param('twofactor')
1338 if tfa is not None:
1339 return tfa
1340
1341 return getpass.getpass('Type %s and press [Return]: ' % note)
1342
1343 # Helper functions for extracting OpenGraph info
1344 @staticmethod
1345 def _og_regexes(prop):
1346 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
1347 property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
1348 % {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
1349 template = r'<meta[^>]+?%s[^>]+?%s'
1350 return [
1351 template % (property_re, content_re),
1352 template % (content_re, property_re),
1353 ]
1354
1355 @staticmethod
1356 def _meta_regex(prop):
1357 return r'''(?isx)<meta
1358 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1359 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1360
1361 def _og_search_property(self, prop, html, name=None, **kargs):
1362 prop = variadic(prop)
1363 if name is None:
1364 name = 'OpenGraph %s' % prop[0]
1365 og_regexes = []
1366 for p in prop:
1367 og_regexes.extend(self._og_regexes(p))
1368 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1369 if escaped is None:
1370 return None
1371 return unescapeHTML(escaped)
1372
1373 def _og_search_thumbnail(self, html, **kargs):
1374 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1375
1376 def _og_search_description(self, html, **kargs):
1377 return self._og_search_property('description', html, fatal=False, **kargs)
1378
1379 def _og_search_title(self, html, *, fatal=False, **kargs):
1380 return self._og_search_property('title', html, fatal=fatal, **kargs)
1381
1382 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1383 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1384 if secure:
1385 regexes = self._og_regexes('video:secure_url') + regexes
1386 return self._html_search_regex(regexes, html, name, **kargs)
1387
1388 def _og_search_url(self, html, **kargs):
1389 return self._og_search_property('url', html, **kargs)
1390
1391 def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
1392 return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
1393
1394 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1395 name = variadic(name)
1396 if display_name is None:
1397 display_name = name[0]
1398 return self._html_search_regex(
1399 [self._meta_regex(n) for n in name],
1400 html, display_name, fatal=fatal, group='content', **kwargs)
1401
1402 def _dc_search_uploader(self, html):
1403 return self._html_search_meta('dc.creator', html, 'uploader')
1404
1405 @staticmethod
1406 def _rta_search(html):
1407 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1408 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1409 r' content="RTA-5042-1996-1400-1577-RTA"',
1410 html):
1411 return 18
1412
1413 # And then there are the jokers who advertise that they use RTA, but actually don't.
1414 AGE_LIMIT_MARKERS = [
1415 r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
1416 r'>[^<]*you acknowledge you are at least (\d+) years old',
1417 r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
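# (18 U.S.C. §2257 is the US record-keeping statute cited on adult sites)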
1418 ]
1419
1420 age_limit = 0
1421 for marker in AGE_LIMIT_MARKERS:
1422 mobj = re.search(marker, html)
1423 if mobj:
1424 age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
1425 return age_limit
1426
1427 def _media_rating_search(self, html):
1428 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1429 rating = self._html_search_meta('rating', html)
1430
1431 if not rating:
1432 return None
1433
1434 RATING_TABLE = {
1435 'safe for kids': 0,
1436 'general': 8,
1437 '14 years': 14,
1438 'mature': 17,
1439 'restricted': 19,
1440 }
1441 return RATING_TABLE.get(rating.lower())
1442
1443 def _family_friendly_search(self, html):
1444 # See http://schema.org/VideoObject
1445 family_friendly = self._html_search_meta(
1446 'isFamilyFriendly', html, default=None)
1447
1448 if not family_friendly:
1449 return None
1450
1451 RATING_TABLE = {
1452 '1': 0,
1453 'true': 0,
1454 '0': 18,
1455 'false': 18,
1456 }
1457 return RATING_TABLE.get(family_friendly.lower())
1458
1459 def _twitter_search_player(self, html):
1460 return self._html_search_meta('twitter:player', html,
1461 'twitter card player')
1462
1463 def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT):
1464 """Yield all json ld objects in the html"""
1465 if default is not NO_DEFAULT:
1466 fatal = False
1467 for mobj in re.finditer(JSON_LD_RE, html):
1468 json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal)
1469 for json_ld in variadic(json_ld_item):
1470 if isinstance(json_ld, dict):
1471 yield json_ld
1472
1473 def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT):
1474 """Search for a video in any json ld in the html"""
1475 if default is not NO_DEFAULT:
1476 fatal = False
1477 info = self._json_ld(
1478 list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)),
1479 video_id, fatal=fatal, expected_type=expected_type)
1480 if info:
1481 return info
1482 if default is not NO_DEFAULT:
1483 return default
1484 elif fatal:
1485 raise RegexNotFoundError('Unable to extract JSON-LD')
1486 else:
1487 self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1488 return {}
1489
1490 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1491 if isinstance(json_ld, str):
1492 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1493 if not json_ld:
1494 return {}
1495 info = {}
1496
1497 INTERACTION_TYPE_MAP = {
1498 'CommentAction': 'comment',
1499 'AgreeAction': 'like',
1500 'DisagreeAction': 'dislike',
1501 'LikeAction': 'like',
1502 'DislikeAction': 'dislike',
1503 'ListenAction': 'view',
1504 'WatchAction': 'view',
1505 'ViewAction': 'view',
1506 }
1507
1508 def is_type(e, *expected_types):
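# @type may be a single string or a list of type names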
1509 type = variadic(traverse_obj(e, '@type'))
1510 return any(x in type for x in expected_types)
1511
1512 def extract_interaction_type(e):
1513 interaction_type = e.get('interactionType')
1514 if isinstance(interaction_type, dict):
1515 interaction_type = interaction_type.get('@type')
1516 return str_or_none(interaction_type)
1517
1518 def extract_interaction_statistic(e):
1519 interaction_statistic = e.get('interactionStatistic')
1520 if isinstance(interaction_statistic, dict):
1521 interaction_statistic = [interaction_statistic]
1522 if not isinstance(interaction_statistic, list):
1523 return
1524 for is_e in interaction_statistic:
1525 if not is_type(is_e, 'InteractionCounter'):
1526 continue
1527 interaction_type = extract_interaction_type(is_e)
1528 if not interaction_type:
1529 continue
1530 # Some sites provide the interaction count as a string instead of
1531 # an integer (as the spec requires), with non-digit characters (e.g. ","),
1532 # so extract the count with the more relaxed str_to_int
1533 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1534 if interaction_count is None:
1535 continue
1536 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1537 if not count_kind:
1538 continue
1539 count_key = '%s_count' % count_kind
1540 if info.get(count_key) is not None:
1541 continue
1542 info[count_key] = interaction_count
1543
1544 def extract_chapter_information(e):
1545 chapters = [{
1546 'title': part.get('name'),
1547 'start_time': part.get('startOffset'),
1548 'end_time': part.get('endOffset'),
1549 } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip']
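# Fill in any missing start/end time from the neighbouring chapters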
1550 for idx, (last_c, current_c, next_c) in enumerate(zip(
1551 [{'end_time': 0}] + chapters, chapters, chapters[1:])):
1552 current_c['end_time'] = current_c['end_time'] or next_c['start_time']
1553 current_c['start_time'] = current_c['start_time'] or last_c['end_time']
1554 if None in current_c.values():
1555 self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
1556 return
1557 if chapters:
1558 chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
1559 info['chapters'] = chapters
1560
1561 def extract_video_object(e):
1562 author = e.get('author')
1563 info.update({
1564 'url': url_or_none(e.get('contentUrl')),
1565 'ext': mimetype2ext(e.get('encodingFormat')),
1566 'title': unescapeHTML(e.get('name')),
1567 'description': unescapeHTML(e.get('description')),
1568 'thumbnails': [{'url': unescapeHTML(url)}
1569 for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
1570 if url_or_none(url)],
1571 'duration': parse_duration(e.get('duration')),
1572 'timestamp': unified_timestamp(e.get('uploadDate')),
1573 # author can be an instance of the 'Organization' or 'Person' types;
1574 # both can have a 'name' property (inherited from the 'Thing' type). [1]
1575 # However, some websites use the 'Text' type instead.
1576 # 1. https://schema.org/VideoObject
1577 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None,
1578 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str),
1579 'filesize': int_or_none(float_or_none(e.get('contentSize'))),
1580 'tbr': int_or_none(e.get('bitrate')),
1581 'width': int_or_none(e.get('width')),
1582 'height': int_or_none(e.get('height')),
1583 'view_count': int_or_none(e.get('interactionCount')),
1584 'tags': try_call(lambda: e.get('keywords').split(',')),
1585 })
1586 if is_type(e, 'AudioObject'):
1587 info.update({
1588 'vcodec': 'none',
1589 'abr': int_or_none(e.get('bitrate')),
1590 })
1591 extract_interaction_statistic(e)
1592 extract_chapter_information(e)
1593
1594 def traverse_json_ld(json_ld, at_top_level=True):
1595 for e in variadic(json_ld):
1596 if not isinstance(e, dict):
1597 continue
1598 if at_top_level and '@context' not in e:
1599 continue
1600 if at_top_level and set(e.keys()) == {'@context', '@graph'}:
1601 traverse_json_ld(e['@graph'], at_top_level=False)
1602 continue
1603 if expected_type is not None and not is_type(e, expected_type):
1604 continue
1605 rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
1606 if rating is not None:
1607 info['average_rating'] = rating
1608 if is_type(e, 'TVEpisode', 'Episode'):
1609 episode_name = unescapeHTML(e.get('name'))
1610 info.update({
1611 'episode': episode_name,
1612 'episode_number': int_or_none(e.get('episodeNumber')),
1613 'description': unescapeHTML(e.get('description')),
1614 })
1615 if not info.get('title') and episode_name:
1616 info['title'] = episode_name
1617 part_of_season = e.get('partOfSeason')
1618 if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'):
1619 info.update({
1620 'season': unescapeHTML(part_of_season.get('name')),
1621 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1622 })
1623 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1624 if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'):
1625 info['series'] = unescapeHTML(part_of_series.get('name'))
1626 elif is_type(e, 'Movie'):
1627 info.update({
1628 'title': unescapeHTML(e.get('name')),
1629 'description': unescapeHTML(e.get('description')),
1630 'duration': parse_duration(e.get('duration')),
1631 'timestamp': unified_timestamp(e.get('dateCreated')),
1632 })
1633 elif is_type(e, 'Article', 'NewsArticle'):
1634 info.update({
1635 'timestamp': parse_iso8601(e.get('datePublished')),
1636 'title': unescapeHTML(e.get('headline')),
1637 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
1638 })
1639 if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'):
1640 extract_video_object(e['video'][0])
1641 elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'):
1642 extract_video_object(e['subjectOf'][0])
1643 elif is_type(e, 'VideoObject', 'AudioObject'):
1644 extract_video_object(e)
1645 if expected_type is None:
1646 continue
1647 else:
1648 break
1649 video = e.get('video')
1650 if is_type(video, 'VideoObject'):
1651 extract_video_object(video)
1652 if expected_type is None:
1653 continue
1654 else:
1655 break
1656
1657 traverse_json_ld(json_ld)
1658 return filter_dict(info)
1659
1660 def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
1661 return self._parse_json(
1662 self._search_regex(
1663 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
1664 webpage, 'next.js data', fatal=fatal, **kw),
1665 video_id, transform_source=transform_source, fatal=fatal)
1666
1667 def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
1668 """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
1669 rectx = re.escape(context_name)
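# Nuxt typically emits e.g. window.__NUXT__=(function(a,b){return {data:[...]};}("x",1));
# capture the returned object literal plus the argument names and values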
1670 FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
1671 js, arg_keys, arg_vals = self._search_regex(
1672 (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
1673 webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
1674 default=NO_DEFAULT if fatal else (None, None, None))
1675 if js is None:
1676 return {}
1677
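# Map the function's parameter names to their JSON-encoded argument
# values so that js_to_json can substitute them into the object literal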
1678 args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
1679 f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
1680
1681 ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
1682 return traverse_obj(ret, traverse) or {}
1683
1684 @staticmethod
1685 def _hidden_inputs(html):
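# Strip HTML comments first so that commented-out inputs are ignored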
1686 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1687 hidden_inputs = {}
1688 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1689 attrs = extract_attributes(input)
1690 if not attrs:
1691 continue
1692 if attrs.get('type') not in ('hidden', 'submit'):
1693 continue
1694 name = attrs.get('name') or attrs.get('id')
1695 value = attrs.get('value')
1696 if name and value is not None:
1697 hidden_inputs[name] = value
1698 return hidden_inputs
1699
1700 def _form_hidden_inputs(self, form_id, html):
1701 form = self._search_regex(
1702 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1703 html, '%s form' % form_id, group='form')
1704 return self._hidden_inputs(form)
1705
1706 @classproperty(cache=True)
1707 def FormatSort(cls):
1708 class FormatSort(FormatSorter):
1709 def __init__(ie, *args, **kwargs):
1710 super().__init__(ie._downloader, *args, **kwargs)
1711
1712 deprecation_warning(
1713 'yt_dlp.InfoExtractor.FormatSort is deprecated and may be removed in the future. '
1714 'Use yt_dlp.utils.FormatSorter instead')
1715 return FormatSort
1716
1717 def _sort_formats(self, formats, field_preference=[]):
1718 if not field_preference:
1719 self._downloader.deprecation_warning(
1720 'yt_dlp.InfoExtractor._sort_formats is deprecated and is no longer required')
1721 return
1722 self._downloader.deprecation_warning(
1723 'yt_dlp.InfoExtractor._sort_formats is deprecated and no longer works as expected. '
1724 'Return _format_sort_fields in the info_dict instead')
1725 if formats:
1726 formats[0]['__sort_fields'] = field_preference
1727
1728 def _check_formats(self, formats, video_id):
1729 if formats:
1730 formats[:] = filter(
1731 lambda f: self._is_valid_url(
1732 f['url'], video_id,
1733 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1734 formats)
1735
1736 @staticmethod
1737 def _remove_duplicate_formats(formats):
1738 format_urls = set()
1739 unique_formats = []
1740 for f in formats:
1741 if f['url'] not in format_urls:
1742 format_urls.add(f['url'])
1743 unique_formats.append(f)
1744 formats[:] = unique_formats
1745
1746 def _is_valid_url(self, url, video_id, item='video', headers={}):
1747 url = self._proto_relative_url(url, scheme='http:')
1748 # For now, assume non-HTTP(S) URLs are always valid
1749 if not (url.startswith('http://') or url.startswith('https://')):
1750 return True
1751 try:
1752 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1753 return True
1754 except ExtractorError as e:
1755 self.to_screen(
1756 '%s: %s URL is invalid, skipping: %s'
1757 % (video_id, item, error_to_compat_str(e.cause)))
1758 return False
1759
1760 def http_scheme(self):
1761 """ Either "http:" or "https:", depending on the user's preferences """
1762 return (
1763 'http:'
1764 if self.get_param('prefer_insecure', False)
1765 else 'https:')
1766
1767 def _proto_relative_url(self, url, scheme=None):
1768 scheme = scheme or self.http_scheme()
1769 assert scheme.endswith(':')
1770 return sanitize_url(url, scheme=scheme[:-1])
1771
1772 def _sleep(self, timeout, video_id, msg_template=None):
1773 if msg_template is None:
1774 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1775 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1776 self.to_screen(msg)
1777 time.sleep(timeout)
1778
1779 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1780 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1781 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1782 if self.get_param('ignore_no_formats_error'):
1783 fatal = False
1784
1785 res = self._download_xml_handle(
1786 manifest_url, video_id, 'Downloading f4m manifest',
1787 'Unable to download f4m manifest',
1788 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1789 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1790 transform_source=transform_source,
1791 fatal=fatal, data=data, headers=headers, query=query)
1792 if res is False:
1793 return []
1794
1795 manifest, urlh = res
1796 manifest_url = urlh.geturl()
1797
1798 return self._parse_f4m_formats(
1799 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1800 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1801
1802 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1803 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1804 fatal=True, m3u8_id=None):
1805 if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
1806 return []
1807
1808 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1809 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1810 if akamai_pv is not None and ';' in akamai_pv.text:
1811 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1812 if playerVerificationChallenge.strip() != '':
1813 return []
1814
1815 formats = []
1816 manifest_version = '1.0'
1817 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1818 if not media_nodes:
1819 manifest_version = '2.0'
1820 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1821 # Remove unsupported DRM-protected media renditions from the final
1822 # formats (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1823 media_nodes = remove_encrypted_media(media_nodes)
1824 if not media_nodes:
1825 return formats
1826
1827 manifest_base_url = get_base_url(manifest)
1828
1829 bootstrap_info = xpath_element(
1830 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1831 'bootstrap info', default=None)
1832
1833 vcodec = None
1834 mime_type = xpath_text(
1835 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1836 'mime type', default=None)
1837 if mime_type and mime_type.startswith('audio/'):
1838 vcodec = 'none'
1839
1840 for i, media_el in enumerate(media_nodes):
1841 tbr = int_or_none(media_el.attrib.get('bitrate'))
1842 width = int_or_none(media_el.attrib.get('width'))
1843 height = int_or_none(media_el.attrib.get('height'))
1844 format_id = join_nonempty(f4m_id, tbr or i)
1845 # If <bootstrapInfo> is present, the specified f4m is a
1846 # stream-level manifest, and only set-level manifests may refer to
1847 # external resources. See section 11.4 and section 4 of F4M spec
1848 if bootstrap_info is None:
1849 media_url = None
1850 # @href is introduced in 2.0, see section 11.6 of F4M spec
1851 if manifest_version == '2.0':
1852 media_url = media_el.attrib.get('href')
1853 if media_url is None:
1854 media_url = media_el.attrib.get('url')
1855 if not media_url:
1856 continue
1857 manifest_url = (
1858 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1859 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1860 # If media_url is itself an f4m manifest, extract it recursively,
1861 # since bitrates in the parent manifest (this one) and the media_url
1862 # manifest may differ, making it impossible to resolve the format by
1863 # requested bitrate in the f4m downloader
1864 ext = determine_ext(manifest_url)
1865 if ext == 'f4m':
1866 f4m_formats = self._extract_f4m_formats(
1867 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1868 transform_source=transform_source, fatal=fatal)
1869 # Sometimes a stream-level manifest contains a single media entry that
1870 # lacks any quality metadata (e.g. http://matchtv.ru/#live-player),
1871 # while the parent's media entry in the set-level manifest may
1872 # contain it. Copy it from the parent in such cases.
1873 if len(f4m_formats) == 1:
1874 f = f4m_formats[0]
1875 f.update({
1876 'tbr': f.get('tbr') or tbr,
1877 'width': f.get('width') or width,
1878 'height': f.get('height') or height,
1879 'format_id': f.get('format_id') if not tbr else format_id,
1880 'vcodec': vcodec,
1881 })
1882 formats.extend(f4m_formats)
1883 continue
1884 elif ext == 'm3u8':
1885 formats.extend(self._extract_m3u8_formats(
1886 manifest_url, video_id, 'mp4', preference=preference,
1887 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1888 continue
1889 formats.append({
1890 'format_id': format_id,
1891 'url': manifest_url,
1892 'manifest_url': manifest_url,
1893 'ext': 'flv' if bootstrap_info is not None else None,
1894 'protocol': 'f4m',
1895 'tbr': tbr,
1896 'width': width,
1897 'height': height,
1898 'vcodec': vcodec,
1899 'preference': preference,
1900 'quality': quality,
1901 })
1902 return formats
1903
1904 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
1905 return {
1906 'format_id': join_nonempty(m3u8_id, 'meta'),
1907 'url': m3u8_url,
1908 'ext': ext,
1909 'protocol': 'm3u8',
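# Rank below any parsed format; this entry is only a quality-selection URL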
1910 'preference': preference - 100 if preference else -100,
1911 'quality': quality,
1912 'resolution': 'multiple',
1913 'format_note': 'Quality selection URL',
1914 }
1915
1916 def _report_ignoring_subs(self, name):
1917 self.report_warning(bug_reports_message(
1918 f'Ignoring subtitle tracks found in the {name} manifest; '
1919 'if any subtitle tracks are missing,'
1920 ), only_once=True)
1921
1922 def _extract_m3u8_formats(self, *args, **kwargs):
1923 fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
1924 if subs:
1925 self._report_ignoring_subs('HLS')
1926 return fmts
1927
1928 def _extract_m3u8_formats_and_subtitles(
1929 self, m3u8_url, video_id, ext=None, entry_protocol='m3u8_native',
1930 preference=None, quality=None, m3u8_id=None, note=None,
1931 errnote=None, fatal=True, live=False, data=None, headers={},
1932 query={}):
1933
1934 if self.get_param('ignore_no_formats_error'):
1935 fatal = False
1936
1937 if not m3u8_url:
1938 if errnote is not False:
1939 errnote = errnote or 'Failed to obtain m3u8 URL'
1940 if fatal:
1941 raise ExtractorError(errnote, video_id=video_id)
1942 self.report_warning(f'{errnote}{bug_reports_message()}')
1943 return [], {}
1944
1945 res = self._download_webpage_handle(
1946 m3u8_url, video_id,
1947 note='Downloading m3u8 information' if note is None else note,
1948 errnote='Failed to download m3u8 information' if errnote is None else errnote,
1949 fatal=fatal, data=data, headers=headers, query=query)
1950
1951 if res is False:
1952 return [], {}
1953
1954 m3u8_doc, urlh = res
1955 m3u8_url = urlh.geturl()
1956
1957 return self._parse_m3u8_formats_and_subtitles(
1958 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1959 preference=preference, quality=quality, m3u8_id=m3u8_id,
1960 note=note, errnote=errnote, fatal=fatal, live=live, data=data,
1961 headers=headers, query=query, video_id=video_id)
1962
1963 def _parse_m3u8_formats_and_subtitles(
1964 self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
1965 preference=None, quality=None, m3u8_id=None, live=False, note=None,
1966 errnote=None, fatal=True, data=None, headers={}, query={},
1967 video_id=None):
1968 formats, subtitles = [], {}
1969
1970 has_drm = re.search('|'.join([
1971 r'#EXT-X-FAXS-CM:', # Adobe Flash Access
1972 r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
1973 ]), m3u8_doc)
1974
1975 def format_url(url):
1976 return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
1977
1978 if self.get_param('hls_split_discontinuity', False):
1979 def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None):
1980 if not m3u8_doc:
1981 if not manifest_url:
1982 return []
1983 m3u8_doc = self._download_webpage(
1984 manifest_url, video_id, fatal=fatal, data=data, headers=headers,
1985 note=False, errnote='Failed to download m3u8 playlist information')
1986 if m3u8_doc is False:
1987 return []
1988 return range(1 + sum(line.startswith('#EXT-X-DISCONTINUITY') for line in m3u8_doc.splitlines()))
1989
1990 else:
1991 def _extract_m3u8_playlist_indices(*args, **kwargs):
1992 return [None]
1993
1994 # References:
1995 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1996 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1997 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1998
1999 # We should try extracting formats only from master playlists [1, 4.3.4],
2000 # i.e. playlists that describe the available qualities. On the other hand,
2001 # media playlists [1, 4.3.3] should be returned as is since they contain
2002 # just the media without quality renditions.
2003 # Fortunately, a master playlist can easily be distinguished from a media
2004 # playlist based on the availability of particular tags. As per [1, 4.3.3, 4.3.4]
2005 # master playlist tags MUST NOT appear in a media playlist and vice versa.
2006 # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
2007 # media playlist and MUST NOT appear in a master playlist, so we can
2008 # reliably detect a media playlist with this criterion.
2009
2010 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
2011 formats = [{
2012 'format_id': join_nonempty(m3u8_id, idx),
2013 'format_index': idx,
2014 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
2015 'ext': ext,
2016 'protocol': entry_protocol,
2017 'preference': preference,
2018 'quality': quality,
2019 'has_drm': has_drm,
2020 } for idx in _extract_m3u8_playlist_indices(m3u8_doc=m3u8_doc)]
2021
2022 return formats, subtitles
2023
2024 groups = {}
2025 last_stream_inf = {}
2026
2027 def extract_media(x_media_line):
2028 media = parse_m3u8_attributes(x_media_line)
2029 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
2030 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
2031 if not (media_type and group_id and name):
2032 return
2033 groups.setdefault(group_id, []).append(media)
2034 # <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
2035 if media_type == 'SUBTITLES':
2036 # According to RFC 8216 §4.3.4.2.1, URI is REQUIRED in the
2037 # EXT-X-MEDIA tag if the media type is SUBTITLES.
2038 # However, lack of URI has been spotted in the wild.
2039 # e.g. NebulaIE; see https://github.com/yt-dlp/yt-dlp/issues/339
2040 if not media.get('URI'):
2041 return
2042 url = format_url(media['URI'])
2043 sub_info = {
2044 'url': url,
2045 'ext': determine_ext(url),
2046 }
2047 if sub_info['ext'] == 'm3u8':
2048 # Per RFC 8216 §3.1, the only possible subtitle format m3u8
2049 # files may contain is WebVTT:
2050 # <https://tools.ietf.org/html/rfc8216#section-3.1>
2051 sub_info['ext'] = 'vtt'
2052 sub_info['protocol'] = 'm3u8_native'
2053 lang = media.get('LANGUAGE') or 'und'
2054 subtitles.setdefault(lang, []).append(sub_info)
2055 if media_type not in ('VIDEO', 'AUDIO'):
2056 return
2057 media_url = media.get('URI')
2058 if media_url:
2059 manifest_url = format_url(media_url)
2060 formats.extend({
2061 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
2062 'format_note': name,
2063 'format_index': idx,
2064 'url': manifest_url,
2065 'manifest_url': m3u8_url,
2066 'language': media.get('LANGUAGE'),
2067 'ext': ext,
2068 'protocol': entry_protocol,
2069 'preference': preference,
2070 'quality': quality,
2071 'has_drm': has_drm,
2072 'vcodec': 'none' if media_type == 'AUDIO' else None,
2073 } for idx in _extract_m3u8_playlist_indices(manifest_url))
2074
2075 def build_stream_name():
2076 # Despite specification does not mention NAME attribute for
2077 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
2078 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
2079 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
2080 stream_name = last_stream_inf.get('NAME')
2081 if stream_name:
2082 return stream_name
2083 # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
2084 # from the corresponding rendition group
2085 stream_group_id = last_stream_inf.get('VIDEO')
2086 if not stream_group_id:
2087 return
2088 stream_group = groups.get(stream_group_id)
2089 if not stream_group:
2090 return stream_group_id
2091 rendition = stream_group[0]
2092 return rendition.get('NAME') or stream_group_id
2093
2094 # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have a
2095 # chance to detect video-only formats when EXT-X-STREAM-INF tags
2096 # precede EXT-X-MEDIA tags in the HLS manifest, as in [3].
2097 for line in m3u8_doc.splitlines():
2098 if line.startswith('#EXT-X-MEDIA:'):
2099 extract_media(line)
2100
2101 for line in m3u8_doc.splitlines():
2102 if line.startswith('#EXT-X-STREAM-INF:'):
2103 last_stream_inf = parse_m3u8_attributes(line)
2104 elif line.startswith('#') or not line.strip():
2105 continue
2106 else:
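# AVERAGE-BANDWIDTH/BANDWIDTH are given in bits/s; scale to kbit/s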
2107 tbr = float_or_none(
2108 last_stream_inf.get('AVERAGE-BANDWIDTH')
2109 or last_stream_inf.get('BANDWIDTH'), scale=1000)
2110 manifest_url = format_url(line.strip())
2111
2112 for idx in _extract_m3u8_playlist_indices(manifest_url):
2113 format_id = [m3u8_id, None, idx]
2114 # The bandwidth of live streams may differ over time, making
2115 # format_id unpredictable, so it's better to keep the provided
2116 # format_id intact.
2117 if not live:
2118 stream_name = build_stream_name()
2119 format_id[1] = stream_name or '%d' % (tbr or len(formats))
2120 f = {
2121 'format_id': join_nonempty(*format_id),
2122 'format_index': idx,
2123 'url': manifest_url,
2124 'manifest_url': m3u8_url,
2125 'tbr': tbr,
2126 'ext': ext,
2127 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
2128 'protocol': entry_protocol,
2129 'preference': preference,
2130 'quality': quality,
2131 'has_drm': has_drm,
2132 }
2133 resolution = last_stream_inf.get('RESOLUTION')
2134 if resolution:
2135 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
2136 if mobj:
2137 f['width'] = int(mobj.group('width'))
2138 f['height'] = int(mobj.group('height'))
2139 # Unified Streaming Platform
2140 mobj = re.search(
2141 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
2142 if mobj:
2143 abr, vbr = mobj.groups()
2144 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
2145 f.update({
2146 'vbr': vbr,
2147 'abr': abr,
2148 })
2149 codecs = parse_codecs(last_stream_inf.get('CODECS'))
2150 f.update(codecs)
2151 audio_group_id = last_stream_inf.get('AUDIO')
2152 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
2153 # references a rendition group MUST have a CODECS attribute.
2154 # However, this is not always respected. E.g. [2]
2155 # contains EXT-X-STREAM-INF tag which references AUDIO
2156 # rendition group but does not have CODECS and despite
2157 # referencing an audio group it represents a complete
2158 # (with audio and video) format. So, for such cases we will
2159 # ignore references to rendition groups and treat them
2160 # as complete formats.
2161 if audio_group_id and codecs and f.get('vcodec') != 'none':
2162 audio_group = groups.get(audio_group_id)
2163 if audio_group and audio_group[0].get('URI'):
2164 # TODO: update acodec for audio only formats with
2165 # the same GROUP-ID
2166 f['acodec'] = 'none'
2167 if not f.get('ext'):
2168 f['ext'] = 'm4a' if f.get('vcodec') == 'none' else 'mp4'
2169 formats.append(f)
2170
2171 # for DailyMotion
2172 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
2173 if progressive_uri:
2174 http_f = f.copy()
2175 del http_f['manifest_url']
2176 http_f.update({
2177 'format_id': f['format_id'].replace('hls-', 'http-'),
2178 'protocol': 'http',
2179 'url': progressive_uri,
2180 })
2181 formats.append(http_f)
2182
2183 last_stream_inf = {}
2184 return formats, subtitles
2185
2186 def _extract_m3u8_vod_duration(
2187 self, m3u8_vod_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2188
2189 m3u8_vod = self._download_webpage(
2190 m3u8_vod_url, video_id,
2191 note='Downloading m3u8 VOD manifest' if note is None else note,
2192 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2193 fatal=False, data=data, headers=headers, query=query)
2194
2195 return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
2196
2197 def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
2198 if '#EXT-X-ENDLIST' not in m3u8_vod:
2199 return None
2200
2201 return int(sum(
2202 float(line[len('#EXTINF:'):].split(',')[0])
2203 for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
2204
2205 def _extract_mpd_vod_duration(
2206 self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
2207
2208 mpd_doc = self._download_xml(
2209 mpd_url, video_id,
2210 note='Downloading MPD VOD manifest' if note is None else note,
2211 errnote='Failed to download VOD manifest' if errnote is None else errnote,
2212 fatal=False, data=data, headers=headers, query=query) or {}
2213 return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
2214
2215 @staticmethod
2216 def _xpath_ns(path, namespace=None):
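# Qualify each path component with ElementTree's '{namespace}tag' syntax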
2217 if not namespace:
2218 return path
2219 out = []
2220 for c in path.split('/'):
2221 if not c or c == '.':
2222 out.append(c)
2223 else:
2224 out.append('{%s}%s' % (namespace, c))
2225 return '/'.join(out)
2226
2227 def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
2228 if self.get_param('ignore_no_formats_error'):
2229 fatal = False
2230
2231 res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
2232 if res is False:
2233 assert not fatal
2234 return [], {}
2235
2236 smil, urlh = res
2237 smil_url = urlh.geturl()
2238
2239 namespace = self._parse_smil_namespace(smil)
2240
2241 fmts = self._parse_smil_formats(
2242 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2243 subs = self._parse_smil_subtitles(
2244 smil, namespace=namespace)
2245
2246 return fmts, subs
2247
2248 def _extract_smil_formats(self, *args, **kwargs):
2249 fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
2250 if subs:
2251 self._report_ignoring_subs('SMIL')
2252 return fmts
2253
2254 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
2255 res = self._download_smil(smil_url, video_id, fatal=fatal)
2256 if res is False:
2257 return {}
2258
2259 smil, urlh = res
2260 smil_url = urlh.geturl()
2261
2262 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2263
2264 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2265 return self._download_xml_handle(
2266 smil_url, video_id, 'Downloading SMIL file',
2267 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
2268
2269 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
2270 namespace = self._parse_smil_namespace(smil)
2271
2272 formats = self._parse_smil_formats(
2273 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
2274 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
2275
2276 video_id = os.path.splitext(url_basename(smil_url))[0]
2277 title = None
2278 description = None
2279 upload_date = None
2280 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2281 name = meta.attrib.get('name')
2282 content = meta.attrib.get('content')
2283 if not name or not content:
2284 continue
2285 if not title and name == 'title':
2286 title = content
2287 elif not description and name in ('description', 'abstract'):
2288 description = content
2289 elif not upload_date and name == 'date':
2290 upload_date = unified_strdate(content)
2291
2292 thumbnails = [{
2293 'id': image.get('type'),
2294 'url': image.get('src'),
2295 'width': int_or_none(image.get('width')),
2296 'height': int_or_none(image.get('height')),
2297 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
2298
2299 return {
2300 'id': video_id,
2301 'title': title or video_id,
2302 'description': description,
2303 'upload_date': upload_date,
2304 'thumbnails': thumbnails,
2305 'formats': formats,
2306 'subtitles': subtitles,
2307 }
2308
2309 def _parse_smil_namespace(self, smil):
2310 return self._search_regex(
2311 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
2312
2313 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
2314 base = smil_url
2315 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
2316 b = meta.get('base') or meta.get('httpBase')
2317 if b:
2318 base = b
2319 break
2320
2321 formats = []
2322 rtmp_count = 0
2323 http_count = 0
2324 m3u8_count = 0
2325 imgs_count = 0
2326
2327 srcs = set()
2328 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
2329 for medium in media:
2330 src = medium.get('src')
2331 if not src or src in srcs:
2332 continue
2333 srcs.add(src)
2334
2335 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
2336 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
2337 width = int_or_none(medium.get('width'))
2338 height = int_or_none(medium.get('height'))
2339 proto = medium.get('proto')
2340 ext = medium.get('ext')
2341 src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
2342 self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
2343 streamer = medium.get('streamer') or base
2344
2345 if proto == 'rtmp' or streamer.startswith('rtmp'):
2346 rtmp_count += 1
2347 formats.append({
2348 'url': streamer,
2349 'play_path': src,
2350 'ext': 'flv',
2351 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
2352 'tbr': bitrate,
2353 'filesize': filesize,
2354 'width': width,
2355 'height': height,
2356 })
2357 if transform_rtmp_url:
2358 streamer, src = transform_rtmp_url(streamer, src)
2359 formats[-1].update({
2360 'url': streamer,
2361 'play_path': src,
2362 })
2363 continue
2364
2365 src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src)
2366 src_url = src_url.strip()
2367
2368 if proto == 'm3u8' or src_ext == 'm3u8':
2369 m3u8_formats = self._extract_m3u8_formats(
2370 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
2371 if len(m3u8_formats) == 1:
2372 m3u8_count += 1
2373 m3u8_formats[0].update({
2374 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
2375 'tbr': bitrate,
2376 'width': width,
2377 'height': height,
2378 })
2379 formats.extend(m3u8_formats)
2380 elif src_ext == 'f4m':
2381 f4m_url = src_url
2382 if not f4m_params:
2383 f4m_params = {
2384 'hdcore': '3.2.0',
2385 'plugin': 'flowplayer-3.2.0.1',
2386 }
2387 f4m_url += '&' if '?' in f4m_url else '?'
2388 f4m_url += urllib.parse.urlencode(f4m_params)
2389 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
2390 elif src_ext == 'mpd':
2391 formats.extend(self._extract_mpd_formats(
2392 src_url, video_id, mpd_id='dash', fatal=False))
2393 elif re.search(r'\.ism/[Mm]anifest', src_url):
2394 formats.extend(self._extract_ism_formats(
2395 src_url, video_id, ism_id='mss', fatal=False))
2396 elif src_url.startswith('http') and self._is_valid_url(src_url, video_id):
2397 http_count += 1
2398 formats.append({
2399 'url': src_url,
2400 'ext': ext or src_ext or 'flv',
2401 'format_id': 'http-%d' % (bitrate or http_count),
2402 'tbr': bitrate,
2403 'filesize': filesize,
2404 'width': width,
2405 'height': height,
2406 })
2407
2408 for medium in smil.findall(self._xpath_ns('.//imagestream', namespace)):
2409 src = medium.get('src')
2410 if not src or src in srcs:
2411 continue
2412 srcs.add(src)
2413
2414 imgs_count += 1
2415 formats.append({
2416 'format_id': 'imagestream-%d' % (imgs_count),
2417 'url': src,
2418 'ext': mimetype2ext(medium.get('type')),
2419 'acodec': 'none',
2420 'vcodec': 'none',
2421 'width': int_or_none(medium.get('width')),
2422 'height': int_or_none(medium.get('height')),
2423 'format_note': 'SMIL storyboards',
2424 })
2425
2426 return formats
2427
2428 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2429 urls = []
2430 subtitles = {}
2431 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2432 src = textstream.get('src')
2433 if not src or src in urls:
2434 continue
2435 urls.append(src)
2436 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2437 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2438 subtitles.setdefault(lang, []).append({
2439 'url': src,
2440 'ext': ext,
2441 })
2442 return subtitles
2443
2444 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2445 res = self._download_xml_handle(
2446 xspf_url, playlist_id, 'Downloading xspf playlist',
2447 'Unable to download xspf manifest', fatal=fatal)
2448 if res is False:
2449 return []
2450
2451 xspf, urlh = res
2452 xspf_url = urlh.geturl()
2453
2454 return self._parse_xspf(
2455 xspf, playlist_id, xspf_url=xspf_url,
2456 xspf_base_url=base_url(xspf_url))
2457
2458 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2459 NS_MAP = {
2460 'xspf': 'http://xspf.org/ns/0/',
2461 's1': 'http://static.streamone.nl/player/ns/0',
2462 }
2463
2464 entries = []
2465 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2466 title = xpath_text(
2467 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2468 description = xpath_text(
2469 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2470 thumbnail = xpath_text(
2471 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2472 duration = float_or_none(
2473 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2474
2475 formats = []
2476 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2477 format_url = urljoin(xspf_base_url, location.text)
2478 if not format_url:
2479 continue
2480 formats.append({
2481 'url': format_url,
2482 'manifest_url': xspf_url,
2483 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2484 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2485 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2486 })
2487
2488 entries.append({
2489 'id': playlist_id,
2490 'title': title,
2491 'description': description,
2492 'thumbnail': thumbnail,
2493 'duration': duration,
2494 'formats': formats,
2495 })
2496 return entries
2497
2498 def _extract_mpd_formats(self, *args, **kwargs):
2499 fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
2500 if subs:
2501 self._report_ignoring_subs('DASH')
2502 return fmts
2503
2504 def _extract_mpd_formats_and_subtitles(
2505 self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
2506 fatal=True, data=None, headers={}, query={}):
2507
2508 if self.get_param('ignore_no_formats_error'):
2509 fatal = False
2510
2511 res = self._download_xml_handle(
2512 mpd_url, video_id,
2513 note='Downloading MPD manifest' if note is None else note,
2514 errnote='Failed to download MPD manifest' if errnote is None else errnote,
2515 fatal=fatal, data=data, headers=headers, query=query)
2516 if res is False:
2517 return [], {}
2518 mpd_doc, urlh = res
2519 if mpd_doc is None:
2520 return [], {}
2521
2522 # We could have been redirected to a new url when we retrieved our mpd file.
2523 mpd_url = urlh.geturl()
2524 mpd_base_url = base_url(mpd_url)
2525
2526 return self._parse_mpd_formats_and_subtitles(
2527 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2528
2529 def _parse_mpd_formats(self, *args, **kwargs):
2530 fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
2531 if subs:
2532 self._report_ignoring_subs('DASH')
2533 return fmts
2534
2535 def _parse_mpd_formats_and_subtitles(
2536 self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2537 """
2538 Parse formats from MPD manifest.
2539 References:
2540 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2541 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2542 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2543 """
2544 if not self.get_param('dynamic_mpd', True):
2545 if mpd_doc.get('type') == 'dynamic':
2546 return [], {}
2547
2548 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2549
2550 def _add_ns(path):
2551 return self._xpath_ns(path, namespace)
2552
2553 def is_drm_protected(element):
2554 return element.find(_add_ns('ContentProtection')) is not None
2555
2556 def extract_multisegment_info(element, ms_parent_info):
2557 ms_info = ms_parent_info.copy()
2558
2559 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2560 # common attributes and elements; we extract only those that are
2561 # relevant for us.
2562 def extract_common(source):
2563 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2564 if segment_timeline is not None:
2565 s_e = segment_timeline.findall(_add_ns('S'))
2566 if s_e:
2567 ms_info['total_number'] = 0
2568 ms_info['s'] = []
2569 for s in s_e:
2570 r = int(s.get('r', 0))
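# @r is the repeat count, so each S element describes 1 + r segments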
2571 ms_info['total_number'] += 1 + r
2572 ms_info['s'].append({
2573 't': int(s.get('t', 0)),
2574 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2575 'd': int(s.attrib['d']),
2576 'r': r,
2577 })
2578 start_number = source.get('startNumber')
2579 if start_number:
2580 ms_info['start_number'] = int(start_number)
2581 timescale = source.get('timescale')
2582 if timescale:
2583 ms_info['timescale'] = int(timescale)
2584 segment_duration = source.get('duration')
2585 if segment_duration:
2586 ms_info['segment_duration'] = float(segment_duration)
2587
2588 def extract_Initialization(source):
2589 initialization = source.find(_add_ns('Initialization'))
2590 if initialization is not None:
2591 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2592
2593 segment_list = element.find(_add_ns('SegmentList'))
2594 if segment_list is not None:
2595 extract_common(segment_list)
2596 extract_Initialization(segment_list)
2597 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2598 if segment_urls_e:
2599 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2600 else:
2601 segment_template = element.find(_add_ns('SegmentTemplate'))
2602 if segment_template is not None:
2603 extract_common(segment_template)
2604 media = segment_template.get('media')
2605 if media:
2606 ms_info['media'] = media
2607 initialization = segment_template.get('initialization')
2608 if initialization:
2609 ms_info['initialization'] = initialization
2610 else:
2611 extract_Initialization(segment_template)
2612 return ms_info
2613
2614 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2615 formats, subtitles = [], {}
2616 stream_numbers = collections.defaultdict(int)
2617 for period in mpd_doc.findall(_add_ns('Period')):
2618 period_duration = parse_duration(period.get('duration')) or mpd_duration
2619 period_ms_info = extract_multisegment_info(period, {
2620 'start_number': 1,
2621 'timescale': 1,
2622 })
2623 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2624 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2625 for representation in adaptation_set.findall(_add_ns('Representation')):
2626 representation_attrib = adaptation_set.attrib.copy()
2627 representation_attrib.update(representation.attrib)
2628 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2629 mime_type = representation_attrib['mimeType']
2630 content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
2631
2632 codec_str = representation_attrib.get('codecs', '')
2633 # Some kind of binary subtitle found in some youtube livestreams
2634 if mime_type == 'application/x-rawcc':
2635 codecs = {'scodec': codec_str}
2636 else:
2637 codecs = parse_codecs(codec_str)
2638 if content_type not in ('video', 'audio', 'text'):
2639 if mime_type == 'image/jpeg':
2640 content_type = mime_type
2641 elif codecs.get('vcodec', 'none') != 'none':
2642 content_type = 'video'
2643 elif codecs.get('acodec', 'none') != 'none':
2644 content_type = 'audio'
2645 elif codecs.get('scodec', 'none') != 'none':
2646 content_type = 'text'
2647 elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
2648 content_type = 'text'
2649 else:
2650 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2651 continue
2652
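# Resolve BaseURL by walking up the hierarchy (Representation ->
# AdaptationSet -> Period -> MPD), prepending each ancestor's BaseURL
# until an absolute URL is formed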
2653 base_url = ''
2654 for element in (representation, adaptation_set, period, mpd_doc):
2655 base_url_e = element.find(_add_ns('BaseURL'))
2656 if try_call(lambda: base_url_e.text) is not None:
2657 base_url = base_url_e.text + base_url
2658 if re.match(r'^https?://', base_url):
2659 break
2660 if mpd_base_url and base_url.startswith('/'):
2661 base_url = urllib.parse.urljoin(mpd_base_url, base_url)
2662 elif mpd_base_url and not re.match(r'^https?://', base_url):
2663 if not mpd_base_url.endswith('/'):
2664 mpd_base_url += '/'
2665 base_url = mpd_base_url + base_url
2666 representation_id = representation_attrib.get('id')
2667 lang = representation_attrib.get('lang')
2668 url_el = representation.find(_add_ns('BaseURL'))
2669 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2670 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2671 if representation_id is not None:
2672 format_id = representation_id
2673 else:
2674 format_id = content_type
2675 if mpd_id:
2676 format_id = mpd_id + '-' + format_id
2677 if content_type in ('video', 'audio'):
2678 f = {
2679 'format_id': format_id,
2680 'manifest_url': mpd_url,
2681 'ext': mimetype2ext(mime_type),
2682 'width': int_or_none(representation_attrib.get('width')),
2683 'height': int_or_none(representation_attrib.get('height')),
2684 'tbr': float_or_none(bandwidth, 1000),
2685 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2686 'fps': int_or_none(representation_attrib.get('frameRate')),
2687 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2688 'format_note': 'DASH %s' % content_type,
2689 'filesize': filesize,
2690 'container': mimetype2ext(mime_type) + '_dash',
2691 **codecs
2692 }
2693 elif content_type == 'text':
2694 f = {
2695 'ext': mimetype2ext(mime_type),
2696 'manifest_url': mpd_url,
2697 'filesize': filesize,
2698 }
2699 elif content_type == 'image/jpeg':
2700 # See test case in VikiIE
2701 # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
2702 f = {
2703 'format_id': format_id,
2704 'ext': 'mhtml',
2705 'manifest_url': mpd_url,
2706 'format_note': 'DASH storyboards (jpeg)',
2707 'acodec': 'none',
2708 'vcodec': 'none',
2709 }
2710 if is_drm_protected(adaptation_set) or is_drm_protected(representation):
2711 f['has_drm'] = True
2712 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2713
2714 def prepare_template(template_name, identifiers):
2715 tmpl = representation_ms_info[template_name]
2716 if representation_id is not None:
2717 tmpl = tmpl.replace('$RepresentationID$', representation_id)
2718 # First, % characters outside $...$ templates
2719 # must be escaped by doubling for proper processing
2720 # by the %-operator string formatting used below (see
2721 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2722 t = ''
2723 in_template = False
2724 for c in tmpl:
2725 t += c
2726 if c == '$':
2727 in_template = not in_template
2728 elif c == '%' and not in_template:
2729 t += c
2730 # Next, $...$ templates are translated to their
2731 # %(...) counterparts to be used with the % operator
2732 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2733 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2734 t = t.replace('$$', '$')  # '$$' is an escape for a literal '$'
2735 return t
2736
2737 # @initialization is a regular template like @media one
2738 # so it should be handled just the same way (see
2739 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2740 if 'initialization' in representation_ms_info:
2741 initialization_template = prepare_template(
2742 'initialization',
2743 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2744 # $Time$ shall not be included for @initialization thus
2745 # only $Bandwidth$ remains
2746 ('Bandwidth', ))
2747 representation_ms_info['initialization_url'] = initialization_template % {
2748 'Bandwidth': bandwidth,
2749 }
2750
2751 def location_key(location):
2752 return 'url' if re.match(r'^https?://', location) else 'path'
2753
2754 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2755
2756 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2757 media_location_key = location_key(media_template)
2758
2759 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2760 # can't be used at the same time
2761 if '%(Number' in media_template and 's' not in representation_ms_info:
2762 segment_duration = None
2763 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2764 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2765 representation_ms_info['total_number'] = int(math.ceil(
2766 float_or_none(period_duration, segment_duration, default=0)))
2767 representation_ms_info['fragments'] = [{
2768 media_location_key: media_template % {
2769 'Number': segment_number,
2770 'Bandwidth': bandwidth,
2771 },
2772 'duration': segment_duration,
2773 } for segment_number in range(
2774 representation_ms_info['start_number'],
2775 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2776 else:
2777 # $Number*$ or $Time$ in media template with S list available
2778 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2779 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2780 representation_ms_info['fragments'] = []
2781 segment_time = 0
2782 segment_d = None
2783 segment_number = representation_ms_info['start_number']
2784
2785 def add_segment_url():
2786 segment_url = media_template % {
2787 'Time': segment_time,
2788 'Bandwidth': bandwidth,
2789 'Number': segment_number,
2790 }
2791 representation_ms_info['fragments'].append({
2792 media_location_key: segment_url,
2793 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2794 })
2795
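# Expand the SegmentTimeline: each S element yields one segment starting
# at @t (or the running time) plus @r repeats of duration @d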
2796 for num, s in enumerate(representation_ms_info['s']):
2797 segment_time = s.get('t') or segment_time
2798 segment_d = s['d']
2799 add_segment_url()
2800 segment_number += 1
2801 for r in range(s.get('r', 0)):
2802 segment_time += segment_d
2803 add_segment_url()
2804 segment_number += 1
2805 segment_time += segment_d
2806 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2807 # No media template,
2808 # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI
2809 # or any YouTube dashsegments video
2810 fragments = []
2811 segment_index = 0
2812 timescale = representation_ms_info['timescale']
2813 for s in representation_ms_info['s']:
2814 duration = float_or_none(s['d'], timescale)
2815 for r in range(s.get('r', 0) + 1):
2816 segment_uri = representation_ms_info['segment_urls'][segment_index]
2817 fragments.append({
2818 location_key(segment_uri): segment_uri,
2819 'duration': duration,
2820 })
2821 segment_index += 1
2822 representation_ms_info['fragments'] = fragments
2823 elif 'segment_urls' in representation_ms_info:
2824 # Segment URLs with no SegmentTimeline
2825 # E.g. https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2826 # https://github.com/ytdl-org/youtube-dl/pull/14844
2827 fragments = []
2828 segment_duration = float_or_none(
2829 representation_ms_info['segment_duration'],
2830 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2831 for segment_url in representation_ms_info['segment_urls']:
2832 fragment = {
2833 location_key(segment_url): segment_url,
2834 }
2835 if segment_duration:
2836 fragment['duration'] = segment_duration
2837 fragments.append(fragment)
2838 representation_ms_info['fragments'] = fragments
2839 # If there is a fragments key available then we correctly recognized fragmented media.
2840 # Otherwise we will assume unfragmented media with direct access. Technically, such an
2841 # assumption is not necessarily correct since we may simply have no support for
2842 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2843 if 'fragments' in representation_ms_info:
2844 f.update({
2845 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2846 'url': mpd_url or base_url,
2847 'fragment_base_url': base_url,
2848 'fragments': [],
2849 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml',
2850 })
2851 if 'initialization_url' in representation_ms_info:
2852 initialization_url = representation_ms_info['initialization_url']
2853 if not f.get('url'):
2854 f['url'] = initialization_url
2855 f['fragments'].append({location_key(initialization_url): initialization_url})
2856 f['fragments'].extend(representation_ms_info['fragments'])
2857 if not period_duration:
2858 period_duration = try_get(
2859 representation_ms_info,
2860 lambda r: sum(frag['duration'] for frag in r['fragments']), float)
2861 else:
2862 # Assuming direct URL to unfragmented media.
2863 f['url'] = base_url
2864 if content_type in ('video', 'audio', 'image/jpeg'):
2865 f['manifest_stream_number'] = stream_numbers[f['url']]
2866 stream_numbers[f['url']] += 1
2867 formats.append(f)
2868 elif content_type == 'text':
2869 subtitles.setdefault(lang or 'und', []).append(f)
2870
2871 return formats, subtitles
2872
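# --- Editor's example (illustration, not part of the original file) ---
# A minimal, self-contained sketch of how a DASH SegmentTemplate with a
# SegmentTimeline (the `s` list handled above) expands into fragments.
# The template and S entries below are hypothetical; the t/d/r semantics
# mirror the loop above: `t` resets the running time, `d` is the duration
# and `r` adds that many extra segments.
def expand_segment_timeline(media_template, s_list, start_number=1, timescale=1):
    fragments = []
    segment_time, segment_number = 0, start_number
    for s in s_list:
        segment_time = s.get('t', segment_time)
        for _ in range(s.get('r', 0) + 1):
            fragments.append({
                'url': media_template % {'Time': segment_time, 'Number': segment_number},
                'duration': s['d'] / timescale,
            })
            segment_time += s['d']
            segment_number += 1
    return fragments

# With a $Time$-style template (already converted to %-substitution form):
# expand_segment_timeline('seg-%(Time)d.m4s', [{'t': 0, 'd': 4000, 'r': 2}], timescale=1000)
# -> seg-0.m4s, seg-4000.m4s, seg-8000.m4s, each 4.0 seconds long
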
2873 def _extract_ism_formats(self, *args, **kwargs):
2874 fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
2875 if subs:
2876 self._report_ignoring_subs('ISM')
2877 return fmts
2878
2879 def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2880 if self.get_param('ignore_no_formats_error'):
2881 fatal = False
2882
2883 res = self._download_xml_handle(
2884 ism_url, video_id,
2885 note='Downloading ISM manifest' if note is None else note,
2886 errnote='Failed to download ISM manifest' if errnote is None else errnote,
2887 fatal=fatal, data=data, headers=headers, query=query)
2888 if res is False:
2889 return [], {}
2890 ism_doc, urlh = res
2891 if ism_doc is None:
2892 return [], {}
2893
2894 return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
2895
2896 def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
2897 """
2898 Parse formats from ISM manifest.
2899 References:
2900 1. [MS-SSTR]: Smooth Streaming Protocol,
2901 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2902 """
2903 if ism_doc.get('IsLive') == 'TRUE':
2904 return [], {}
2905
2906 duration = int(ism_doc.attrib['Duration'])
2907 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2908
2909 formats = []
2910 subtitles = {}
2911 for stream in ism_doc.findall('StreamIndex'):
2912 stream_type = stream.get('Type')
2913 if stream_type not in ('video', 'audio', 'text'):
2914 continue
2915 url_pattern = stream.attrib['Url']
2916 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2917 stream_name = stream.get('Name')
2918 stream_language = stream.get('Language', 'und')
2919 for track in stream.findall('QualityLevel'):
2920 KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'}
2921 fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag'))
2922 # TODO: add support for WVC1 and WMAP
2923 if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'):
2924 self.report_warning('%s is not a supported codec' % fourcc)
2925 continue
2926 tbr = int(track.attrib['Bitrate']) // 1000
2927 # [1] does not mention Width and Height attributes. However,
2928 # they're often present while MaxWidth and MaxHeight are
2929 # missing, so should be used as fallbacks
2930 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2931 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2932 sampling_rate = int_or_none(track.get('SamplingRate'))
2933
2934 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2935 track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern)
2936
2937 fragments = []
2938 fragment_ctx = {
2939 'time': 0,
2940 }
2941 stream_fragments = stream.findall('c')
2942 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2943 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2944 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2945 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2946 if not fragment_ctx['duration']:
2947 try:
2948 next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
2949 except IndexError:
2950 next_fragment_time = duration
2951 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2952 for _ in range(fragment_repeat):
2953 fragments.append({
2954 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
2955 'duration': fragment_ctx['duration'] / stream_timescale,
2956 })
2957 fragment_ctx['time'] += fragment_ctx['duration']
2958
2959 if stream_type == 'text':
2960 subtitles.setdefault(stream_language, []).append({
2961 'ext': 'ismt',
2962 'protocol': 'ism',
2963 'url': ism_url,
2964 'manifest_url': ism_url,
2965 'fragments': fragments,
2966 '_download_params': {
2967 'stream_type': stream_type,
2968 'duration': duration,
2969 'timescale': stream_timescale,
2970 'fourcc': fourcc,
2971 'language': stream_language,
2972 'codec_private_data': track.get('CodecPrivateData'),
2973 }
2974 })
2975 elif stream_type in ('video', 'audio'):
2976 formats.append({
2977 'format_id': join_nonempty(ism_id, stream_name, tbr),
2978 'url': ism_url,
2979 'manifest_url': ism_url,
2980 'ext': 'ismv' if stream_type == 'video' else 'isma',
2981 'width': width,
2982 'height': height,
2983 'tbr': tbr,
2984 'asr': sampling_rate,
2985 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2986 'acodec': 'none' if stream_type == 'video' else fourcc,
2987 'protocol': 'ism',
2988 'fragments': fragments,
2989 'has_drm': ism_doc.find('Protection') is not None,
2990 'language': stream_language,
2991 'audio_channels': int_or_none(track.get('Channels')),
2992 '_download_params': {
2993 'stream_type': stream_type,
2994 'duration': duration,
2995 'timescale': stream_timescale,
2996 'width': width or 0,
2997 'height': height or 0,
2998 'fourcc': fourcc,
2999 'language': stream_language,
3000 'codec_private_data': track.get('CodecPrivateData'),
3001 'sampling_rate': sampling_rate,
3002 'channels': int_or_none(track.get('Channels', 2)),
3003 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
3004 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
3005 },
3006 })
3007 return formats, subtitles
3008
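# --- Editor's example (illustration, not part of the original file) ---
# A minimal sketch of the Smooth Streaming timeline expansion performed in
# _parse_ism_formats_and_subtitles. The manifest snippet is hypothetical;
# note that unlike DASH, the `r` attribute here is a total count (default
# 1), and `{start time}` in the Url pattern is substituted per fragment.
import re
import xml.etree.ElementTree as ET

ISM_SNIPPET = '''
<StreamIndex Type="video" TimeScale="10000000"
             Url="QualityLevels({bitrate})/Fragments(video={start time})">
  <c t="0" d="20000000" r="2"/>
  <c d="15000000"/>
</StreamIndex>
'''

def expand_ism_timeline(stream):
    timescale = int(stream.get('TimeScale'))
    url_pattern = stream.get('Url')
    time, fragments = 0, []
    for c in stream.findall('c'):
        time = int(c.get('t', time))
        duration = int(c.get('d'))
        for _ in range(int(c.get('r', 1))):
            fragments.append({
                'url': re.sub(r'{start[ _]time}', str(time), url_pattern),
                'duration': duration / timescale,
            })
            time += duration
    return fragments

# expand_ism_timeline(ET.fromstring(ISM_SNIPPET)) yields three fragments:
# two 2.0s fragments at t=0 and t=20000000, then one 1.5s fragment at
# t=40000000 (the running time carries over when `t` is omitted).
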
3009 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None):
3010 def absolute_url(item_url):
3011 return urljoin(base_url, item_url)
3012
3013 def parse_content_type(content_type):
3014 if not content_type:
3015 return {}
3016 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
3017 if ctr:
3018 mimetype, codecs = ctr.groups()
3019 f = parse_codecs(codecs)
3020 f['ext'] = mimetype2ext(mimetype)
3021 return f
3022 return {}
3023
3024 def _media_formats(src, cur_media_type, type_info=None):
3025 type_info = type_info or {}
3026 full_url = absolute_url(src)
3027 ext = type_info.get('ext') or determine_ext(full_url)
3028 if ext == 'm3u8':
3029 is_plain_url = False
3030 formats = self._extract_m3u8_formats(
3031 full_url, video_id, ext='mp4',
3032 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
3033 preference=preference, quality=quality, fatal=False)
3034 elif ext == 'mpd':
3035 is_plain_url = False
3036 formats = self._extract_mpd_formats(
3037 full_url, video_id, mpd_id=mpd_id, fatal=False)
3038 else:
3039 is_plain_url = True
3040 formats = [{
3041 'url': full_url,
3042 'vcodec': 'none' if cur_media_type == 'audio' else None,
3043 'ext': ext,
3044 }]
3045 return is_plain_url, formats
3046
3047 entries = []
3048 # amp-video and amp-audio are very similar to their HTML5 counterparts
3049 # so we handle them here as well (see
3050 # https://www.ampproject.org/docs/reference/components/amp-video)
3051 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
3052 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
3053 media_tags = [(media_tag, media_tag_name, media_type, '')
3054 for media_tag, media_tag_name, media_type
3055 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
3056 media_tags.extend(re.findall(
3057 # We only allow video|audio followed by whitespace or '>'.
3058 # Allowing more characters can cause a significant slowdown (see
3059 # https://github.com/ytdl-org/youtube-dl/issues/11979,
3060 # e.g. http://www.porntrex.com/maps/videositemap.xml).
3061 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
3062 for media_tag, _, media_type, media_content in media_tags:
3063 media_info = {
3064 'formats': [],
3065 'subtitles': {},
3066 }
3067 media_attributes = extract_attributes(media_tag)
3068 src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source')))
3069 if src:
3070 f = parse_content_type(media_attributes.get('type'))
3071 _, formats = _media_formats(src, media_type, f)
3072 media_info['formats'].extend(formats)
3073 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
3074 if media_content:
3075 for source_tag in re.findall(r'<source[^>]+>', media_content):
3076 s_attr = extract_attributes(source_tag)
3077 # data-video-src and data-src are non-standard but are seen
3078 # several times in the wild
3079 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source')))
3080 if not src:
3081 continue
3082 f = parse_content_type(s_attr.get('type'))
3083 is_plain_url, formats = _media_formats(src, media_type, f)
3084 if is_plain_url:
3085 # width, height, res, label and title attributes are
3086 # all non-standard but are seen several times in the wild
3087 labels = [
3088 s_attr.get(lbl)
3089 for lbl in ('label', 'title')
3090 if str_or_none(s_attr.get(lbl))
3091 ]
3092 width = int_or_none(s_attr.get('width'))
3093 height = (int_or_none(s_attr.get('height'))
3094 or int_or_none(s_attr.get('res')))
3095 if not width or not height:
3096 for lbl in labels:
3097 resolution = parse_resolution(lbl)
3098 if not resolution:
3099 continue
3100 width = width or resolution.get('width')
3101 height = height or resolution.get('height')
3102 for lbl in labels:
3103 tbr = parse_bitrate(lbl)
3104 if tbr:
3105 break
3106 else:
3107 tbr = None
3108 f.update({
3109 'width': width,
3110 'height': height,
3111 'tbr': tbr,
3112 'format_id': s_attr.get('label') or s_attr.get('title'),
3113 })
3114 f.update(formats[0])
3115 media_info['formats'].append(f)
3116 else:
3117 media_info['formats'].extend(formats)
3118 for track_tag in re.findall(r'<track[^>]+>', media_content):
3119 track_attributes = extract_attributes(track_tag)
3120 kind = track_attributes.get('kind')
3121 if not kind or kind in ('subtitles', 'captions'):
3122 src = strip_or_none(track_attributes.get('src'))
3123 if not src:
3124 continue
3125 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
3126 media_info['subtitles'].setdefault(lang, []).append({
3127 'url': absolute_url(src),
3128 })
3129 for f in media_info['formats']:
3130 f.setdefault('http_headers', {})['Referer'] = base_url
3131 if media_info['formats'] or media_info['subtitles']:
3132 entries.append(media_info)
3133 return entries
3134
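# --- Editor's example (illustration, not part of the original file) ---
# A standalone sketch of the <source> scanning done above: find the tags
# with a regex, then pull their attributes. The HTML is hypothetical and
# the attribute regex is a simplification of extract_attributes.
import re

HTML = '''
<video poster="/thumb.jpg">
  <source src="/media/clip-720.mp4" type="video/mp4" label="720p">
  <source src="/media/clip.m3u8" type="application/x-mpegURL">
</video>
'''

sources = []
for source_tag in re.findall(r'<source[^>]+>', HTML):
    attrs = dict(re.findall(r'(\w+)="([^"]*)"', source_tag))
    if attrs.get('src'):
        sources.append(attrs)
# sources[0] -> {'src': '/media/clip-720.mp4', 'type': 'video/mp4', 'label': '720p'}
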
3135 def _extract_akamai_formats(self, *args, **kwargs):
3136 fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
3137 if subs:
3138 self._report_ignoring_subs('akamai')
3139 return fmts
3140
3141 def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
3142 signed = 'hdnea=' in manifest_url
3143 if not signed:
3144 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
3145 manifest_url = re.sub(
3146 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
3147 '', manifest_url).strip('?')
3148
3149 formats = []
3150 subtitles = {}
3151
3152 hdcore_sign = 'hdcore=3.7.0'
3153 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
3154 hds_host = hosts.get('hds')
3155 if hds_host:
3156 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
3157 if 'hdcore=' not in f4m_url:
3158 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
3159 f4m_formats = self._extract_f4m_formats(
3160 f4m_url, video_id, f4m_id='hds', fatal=False)
3161 for entry in f4m_formats:
3162 entry.update({'extra_param_to_segment_url': hdcore_sign})
3163 formats.extend(f4m_formats)
3164
3165 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
3166 hls_host = hosts.get('hls')
3167 if hls_host:
3168 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
3169 m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
3170 m3u8_url, video_id, 'mp4', 'm3u8_native',
3171 m3u8_id='hls', fatal=False)
3172 formats.extend(m3u8_formats)
3173 subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
3174
3175 http_host = hosts.get('http')
3176 if http_host and m3u8_formats and not signed:
3177 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
3178 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
3179 qualities_length = len(qualities)
3180 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
3181 i = 0
3182 for f in m3u8_formats:
3183 if f['vcodec'] != 'none':
3184 for protocol in ('http', 'https'):
3185 http_f = f.copy()
3186 del http_f['manifest_url']
3187 http_url = re.sub(
3188 REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url'])
3189 http_f.update({
3190 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
3191 'url': http_url,
3192 'protocol': protocol,
3193 })
3194 formats.append(http_f)
3195 i += 1
3196
3197 return formats, subtitles
3198
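# --- Editor's example (illustration, not part of the original file) ---
# The URL symmetry assumed above: Akamai serves the same rendition group
# under /z/...manifest.f4m (HDS) and /i/...master.m3u8 (HLS), so one URL
# can be derived from the other. Host and path here are hypothetical.
import re

m3u8_url = 'https://example-vh.akamaihd.net/i/videos/clip_,360,720,1080,.mp4.csmil/master.m3u8'
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', m3u8_url).replace('/master.m3u8', '/manifest.f4m')
# -> https://example-vh.akamaihd.net/z/videos/clip_,360,720,1080,.mp4.csmil/manifest.f4m

# The ",360,720,1080," list is what the http-format branch above splits on
# to derive one progressive URL per HLS variant (e.g. .../clip_360.mp4):
qualities = re.match(r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+', m3u8_url).group(2).split(',')
# -> ['360', '720', '1080']
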
3199 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
3200 query = urllib.parse.urlparse(url).query
3201 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
3202 mobj = re.search(
3203 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
3204 url_base = mobj.group('url')
3205 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
3206 formats = []
3207
3208 def manifest_url(manifest):
3209 m_url = f'{http_base_url}/{manifest}'
3210 if query:
3211 m_url += '?%s' % query
3212 return m_url
3213
3214 if 'm3u8' not in skip_protocols:
3215 formats.extend(self._extract_m3u8_formats(
3216 manifest_url('playlist.m3u8'), video_id, 'mp4',
3217 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
3218 if 'f4m' not in skip_protocols:
3219 formats.extend(self._extract_f4m_formats(
3220 manifest_url('manifest.f4m'),
3221 video_id, f4m_id='hds', fatal=False))
3222 if 'dash' not in skip_protocols:
3223 formats.extend(self._extract_mpd_formats(
3224 manifest_url('manifest.mpd'),
3225 video_id, mpd_id='dash', fatal=False))
3226 if re.search(r'(?:/smil:|\.smil)', url_base):
3227 if 'smil' not in skip_protocols:
3228 rtmp_formats = self._extract_smil_formats(
3229 manifest_url('jwplayer.smil'),
3230 video_id, fatal=False)
3231 for rtmp_format in rtmp_formats:
3232 rtsp_format = rtmp_format.copy()
3233 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
3234 del rtsp_format['play_path']
3235 del rtsp_format['ext']
3236 rtsp_format.update({
3237 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
3238 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
3239 'protocol': 'rtsp',
3240 })
3241 formats.extend([rtmp_format, rtsp_format])
3242 else:
3243 for protocol in ('rtmp', 'rtsp'):
3244 if protocol not in skip_protocols:
3245 formats.append({
3246 'url': f'{protocol}:{url_base}',
3247 'format_id': protocol,
3248 'protocol': protocol,
3249 })
3250 return formats
3251
3252 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
3253 mobj = re.search(
3254 r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
3255 webpage)
3256 if mobj:
3257 try:
3258 jwplayer_data = self._parse_json(mobj.group('options'),
3259 video_id=video_id,
3260 transform_source=transform_source)
3261 except ExtractorError:
3262 pass
3263 else:
3264 if isinstance(jwplayer_data, dict):
3265 return jwplayer_data
3266
3267 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3268 jwplayer_data = self._find_jwplayer_data(
3269 webpage, video_id, transform_source=js_to_json)
3270 return self._parse_jwplayer_data(
3271 jwplayer_data, video_id, *args, **kwargs)
3272
3273 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
3274 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3275 entries = []
3276 if not isinstance(jwplayer_data, dict):
3277 return entries
3278
3279 playlist_items = jwplayer_data.get('playlist')
3280 # JWPlayer backward compatibility: single playlist item/flattened playlists
3281 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
3282 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
3283 if not isinstance(playlist_items, list):
3284 playlist_items = (playlist_items or jwplayer_data, )
3285
3286 for video_data in playlist_items:
3287 if not isinstance(video_data, dict):
3288 continue
3289 # JWPlayer backward compatibility: flattened sources
3290 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
3291 if 'sources' not in video_data:
3292 video_data['sources'] = [video_data]
3293
3294 this_video_id = video_id or video_data['mediaid']
3295
3296 formats = self._parse_jwplayer_formats(
3297 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
3298 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
3299
3300 subtitles = {}
3301 tracks = video_data.get('tracks')
3302 if tracks and isinstance(tracks, list):
3303 for track in tracks:
3304 if not isinstance(track, dict):
3305 continue
3306 track_kind = track.get('kind')
3307 if not track_kind or not isinstance(track_kind, str):
3308 continue
3309 if track_kind.lower() not in ('captions', 'subtitles'):
3310 continue
3311 track_url = urljoin(base_url, track.get('file'))
3312 if not track_url:
3313 continue
3314 subtitles.setdefault(track.get('label') or 'en', []).append({
3315 'url': self._proto_relative_url(track_url)
3316 })
3317
3318 entry = {
3319 'id': this_video_id,
3320 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
3321 'description': clean_html(video_data.get('description')),
3322 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
3323 'timestamp': int_or_none(video_data.get('pubdate')),
3324 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
3325 'subtitles': subtitles,
3326 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
3327 'genre': clean_html(video_data.get('genre')),
3328 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
3329 'season_number': int_or_none(video_data.get('season')),
3330 'episode_number': int_or_none(video_data.get('episode')),
3331 'release_year': int_or_none(video_data.get('releasedate')),
3332 'age_limit': int_or_none(video_data.get('age_restriction')),
3333 }
3334 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
3335 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
3336 entry.update({
3337 '_type': 'url_transparent',
3338 'url': formats[0]['url'],
3339 })
3340 else:
3341 entry['formats'] = formats
3342 entries.append(entry)
3343 if len(entries) == 1:
3344 return entries[0]
3345 else:
3346 return self.playlist_result(entries)
3347
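# --- Editor's example (illustration, not part of the original file) ---
# The two backward-compatibility normalizations applied above, shown on a
# hypothetical flattened setup() config: no 'playlist' key (single item)
# and no 'sources' list (the item doubles as its own source).
flattened = {'mediaid': 'abc123', 'file': '//cdn.example.com/v.mp4', 'title': 'Clip'}

playlist_items = flattened.get('playlist')
if not isinstance(playlist_items, list):  # single item / flattened playlist
    playlist_items = (playlist_items or flattened,)
for item in playlist_items:
    if 'sources' not in item:  # flattened sources
        item['sources'] = [item]
# playlist_items[0]['sources'][0]['file'] -> '//cdn.example.com/v.mp4'
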
3348 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
3349 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
3350 urls = set()
3351 formats = []
3352 for source in jwplayer_sources_data:
3353 if not isinstance(source, dict):
3354 continue
3355 source_url = urljoin(
3356 base_url, self._proto_relative_url(source.get('file')))
3357 if not source_url or source_url in urls:
3358 continue
3359 urls.add(source_url)
3360 source_type = source.get('type') or ''
3361 ext = mimetype2ext(source_type) or determine_ext(source_url)
3362 if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
3363 formats.extend(self._extract_m3u8_formats(
3364 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
3365 m3u8_id=m3u8_id, fatal=False))
3366 elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
3367 formats.extend(self._extract_mpd_formats(
3368 source_url, video_id, mpd_id=mpd_id, fatal=False))
3369 elif ext == 'smil':
3370 formats.extend(self._extract_smil_formats(
3371 source_url, video_id, fatal=False))
3372 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
3373 elif source_type.startswith('audio') or ext in (
3374 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
3375 formats.append({
3376 'url': source_url,
3377 'vcodec': 'none',
3378 'ext': ext,
3379 })
3380 else:
3381 format_id = str_or_none(source.get('label'))
3382 height = int_or_none(source.get('height'))
3383 if height is None and format_id:
3384 # Often no height is provided but there is a label in
3385 # a format like "1080p", "720p SD", or 1080.
3386 height = parse_resolution(format_id).get('height')
3387 a_format = {
3388 'url': source_url,
3389 'width': int_or_none(source.get('width')),
3390 'height': height,
3391 'tbr': int_or_none(source.get('bitrate'), scale=1000),
3392 'filesize': int_or_none(source.get('filesize')),
3393 'ext': ext,
3394 'format_id': format_id
3395 }
3396 if source_url.startswith('rtmp'):
3397 a_format['ext'] = 'flv'
3398 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
3399 # of jwplayer.flash.swf
3400 rtmp_url_parts = re.split(
3401 r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
3402 if len(rtmp_url_parts) == 3:
3403 rtmp_url, prefix, play_path = rtmp_url_parts
3404 a_format.update({
3405 'url': rtmp_url,
3406 'play_path': prefix + play_path,
3407 })
3408 if rtmp_params:
3409 a_format.update(rtmp_params)
3410 formats.append(a_format)
3411 return formats
3412
3413 def _live_title(self, name):
3414 self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
3415 return name
3416
3417 def _int(self, v, name, fatal=False, **kwargs):
3418 res = int_or_none(v, **kwargs)
3419 if res is None:
3420 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3421 if fatal:
3422 raise ExtractorError(msg)
3423 else:
3424 self.report_warning(msg)
3425 return res
3426
3427 def _float(self, v, name, fatal=False, **kwargs):
3428 res = float_or_none(v, **kwargs)
3429 if res is None:
3430 msg = f'Failed to extract {name}: Could not parse value {v!r}'
3431 if fatal:
3432 raise ExtractorError(msg)
3433 else:
3434 self.report_warning(msg)
3435 return res
3436
3437 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3438 path='/', secure=False, discard=False, rest={}, **kwargs):
3439 cookie = http.cookiejar.Cookie(
3440 0, name, value, port, port is not None, domain, True,
3441 domain.startswith('.'), path, True, secure, expire_time,
3442 discard, None, None, rest)
3443 self.cookiejar.set_cookie(cookie)
3444
3445 def _get_cookies(self, url):
3446 """ Return a http.cookies.SimpleCookie with the cookies for the url """
3447 return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
3448
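# --- Editor's example (illustration, not part of the original file) ---
# http.cookiejar.Cookie takes a long run of positional arguments; this is
# the mapping _set_cookie relies on, spelled out with hypothetical values:
import http.cookiejar

cookie = http.cookiejar.Cookie(
    0, 'session', 'deadbeef',    # version, name, value
    None, False,                 # port, port_specified
    '.example.com', True, True,  # domain, domain_specified, domain_initial_dot
    '/', True,                   # path, path_specified
    False, None, False,          # secure, expires, discard
    None, None, {})              # comment, comment_url, rest
jar = http.cookiejar.CookieJar()
jar.set_cookie(cookie)
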
3449 def _apply_first_set_cookie_header(self, url_handle, cookie):
3450 """
3451 Apply first Set-Cookie header instead of the last. Experimental.
3452
3453 Some sites (e.g. [1-3]) may serve two cookies under the same name
3454 in the Set-Cookie header and expect the first (old) one to be set
3455 rather than the second (new) one. However, per RFC 6265 the newer
3456 cookie should end up in the cookie store, which is what actually
3457 happens. We work around this issue by manually resetting the cookie
3458 to the first one.
3459 1. https://new.vk.com/
3460 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
3461 3. https://learning.oreilly.com/
3462 """
3463 for header, cookies in url_handle.headers.items():
3464 if header.lower() != 'set-cookie':
3465 continue
3466 cookies = cookies.encode('iso-8859-1').decode('utf-8')
3467 cookie_value = re.search(
3468 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
3469 if cookie_value:
3470 value, domain = cookie_value.groups()
3471 self._set_cookie(domain, cookie, value)
3472 break
3473
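# --- Editor's example (illustration, not part of the original file) ---
# How the regex above picks the *first* occurrence of a duplicated cookie
# out of a folded Set-Cookie header (hypothetical header value):
import re

set_cookie = 'sid=old; Domain=.example.com; Path=/, sid=new; Domain=.example.com; Path=/'
m = re.search(r'sid=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)', set_cookie)
# m.group(1) -> 'old', m.group(2) -> '.example.com'
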
3474 @classmethod
3475 def get_testcases(cls, include_onlymatching=False):
3476 # Do not look in super classes
3477 t = vars(cls).get('_TEST')
3478 if t:
3479 assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS'
3480 tests = [t]
3481 else:
3482 tests = vars(cls).get('_TESTS', [])
3483 for t in tests:
3484 if not include_onlymatching and t.get('only_matching', False):
3485 continue
3486 t['name'] = cls.ie_key()
3487 yield t
3488 if getattr(cls, '__wrapped__', None):
3489 yield from cls.__wrapped__.get_testcases(include_onlymatching)
3490
3491 @classmethod
3492 def get_webpage_testcases(cls):
3493 tests = vars(cls).get('_WEBPAGE_TESTS', [])
3494 for t in tests:
3495 t['name'] = cls.ie_key()
3496 yield t
3497 if getattr(cls, '__wrapped__', None):
3498 yield from cls.__wrapped__.get_webpage_testcases()
3499
3500 @classproperty(cache=True)
3501 def age_limit(cls):
3502 """Get age limit from the testcases"""
3503 return max(traverse_obj(
3504 (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()),
3505 (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0])
3506
3507 @classproperty(cache=True)
3508 def _RETURN_TYPE(cls):
3509 """What the extractor returns: "video", "playlist", "any", or None (Unknown)"""
3510 tests = tuple(cls.get_testcases(include_onlymatching=False))
3511 if not tests:
3512 return None
3513 elif not any(k.startswith('playlist') for test in tests for k in test):
3514 return 'video'
3515 elif all(any(k.startswith('playlist') for k in test) for test in tests):
3516 return 'playlist'
3517 return 'any'
3518
3519 @classmethod
3520 def is_single_video(cls, url):
3521 """Returns whether the URL is of a single video, None if unknown"""
3522 if cls.suitable(url):
3523 return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
3524
3525 @classmethod
3526 def is_suitable(cls, age_limit):
3527 """Test whether the extractor is generally suitable for the given age limit"""
3528 return not age_restricted(cls.age_limit, age_limit)
3529
3530 @classmethod
3531 def description(cls, *, markdown=True, search_examples=None):
3532 """Description of the extractor"""
3533 desc = ''
3534 if cls._NETRC_MACHINE:
3535 if markdown:
3536 desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
3537 else:
3538 desc += f' [{cls._NETRC_MACHINE}]'
3539 if cls.IE_DESC is False:
3540 desc += ' [HIDDEN]'
3541 elif cls.IE_DESC:
3542 desc += f' {cls.IE_DESC}'
3543 if cls.SEARCH_KEY:
3544 desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
3545 if search_examples:
3546 _COUNTS = ('', '5', '10', 'all')
3547 desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
3548 if not cls.working():
3549 desc += ' (**Currently broken**)' if markdown else ' (Currently broken)'
3550
3551 # Escape emojis. Ref: https://github.com/github/markup/issues/1153
3552 name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME
3553 return f'{name}:{desc}' if desc else name
3554
3555 def extract_subtitles(self, *args, **kwargs):
3556 if (self.get_param('writesubtitles', False)
3557 or self.get_param('listsubtitles')):
3558 return self._get_subtitles(*args, **kwargs)
3559 return {}
3560
3561 def _get_subtitles(self, *args, **kwargs):
3562 raise NotImplementedError('This method must be implemented by subclasses')
3563
3564 class CommentsDisabled(Exception):
3565 """Raise in _get_comments if comments are disabled for the video"""
3566
3567 def extract_comments(self, *args, **kwargs):
3568 if not self.get_param('getcomments'):
3569 return None
3570 generator = self._get_comments(*args, **kwargs)
3571
3572 def extractor():
3573 comments = []
3574 interrupted = True
3575 try:
3576 while True:
3577 comments.append(next(generator))
3578 except StopIteration:
3579 interrupted = False
3580 except KeyboardInterrupt:
3581 self.to_screen('Interrupted by user')
3582 except self.CommentsDisabled:
3583 return {'comments': None, 'comment_count': None}
3584 except Exception as e:
3585 if self.get_param('ignoreerrors') is not True:
3586 raise
3587 self._downloader.report_error(e)
3588 comment_count = len(comments)
3589 self.to_screen(f'Extracted {comment_count} comments')
3590 return {
3591 'comments': comments,
3592 'comment_count': None if interrupted else comment_count
3593 }
3594 return extractor
3595
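# --- Editor's example (illustration, not part of the original file) ---
# The draining pattern used by extractor() above: pull items from a
# generator until exhaustion, but keep whatever was collected if the user
# interrupts. fake_comments() stands in for a _get_comments() generator.
def fake_comments():
    for i in range(3):
        yield {'id': i, 'text': f'comment {i}'}

generator = fake_comments()
comments, interrupted = [], True
try:
    while True:
        comments.append(next(generator))
except StopIteration:
    interrupted = False
except KeyboardInterrupt:
    pass  # the partial result is still usable
# comments -> 3 items, interrupted -> False
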
3596 def _get_comments(self, *args, **kwargs):
3597 raise NotImplementedError('This method must be implemented by subclasses')
3598
3599 @staticmethod
3600 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3601 """ Merge subtitle items for one language. Items with duplicated URLs/data
3602 will be dropped. """
3603 list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1}
3604 ret = list(subtitle_list1)
3605 ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data)
3606 return ret
3607
3608 @classmethod
3609 def _merge_subtitles(cls, *dicts, target=None):
3610 """ Merge subtitle dictionaries, language by language. """
3611 if target is None:
3612 target = {}
3613 for d in dicts:
3614 for lang, subs in d.items():
3615 target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
3616 return target
3617
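# --- Editor's example (illustration, not part of the original file) ---
# What the language-by-language merge above does, re-implemented inline on
# hypothetical subtitle dictionaries (duplicate URL/data pairs are dropped):
d1 = {'en': [{'url': 'https://example.com/en.vtt'}]}
d2 = {'en': [{'url': 'https://example.com/en.vtt'},   # duplicate: dropped
             {'url': 'https://example.com/en.srt'}],
      'de': [{'url': 'https://example.com/de.vtt'}]}

target = {}
for d in (d1, d2):
    for lang, subs in d.items():
        seen = {(s.get('url'), s.get('data')) for s in target.get(lang, [])}
        target[lang] = target.get(lang, []) + [
            s for s in subs if (s.get('url'), s.get('data')) not in seen]
# target['en'] -> two entries (en.vtt once, en.srt); target['de'] -> one entry
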
3618 def extract_automatic_captions(self, *args, **kwargs):
3619 if (self.get_param('writeautomaticsub', False)
3620 or self.get_param('listsubtitles')):
3621 return self._get_automatic_captions(*args, **kwargs)
3622 return {}
3623
3624 def _get_automatic_captions(self, *args, **kwargs):
3625 raise NotImplementedError('This method must be implemented by subclasses')
3626
3627 @functools.cached_property
3628 def _cookies_passed(self):
3629 """Whether cookies have been passed to YoutubeDL"""
3630 return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None
3631
3632 def mark_watched(self, *args, **kwargs):
3633 if not self.get_param('mark_watched', False):
3634 return
3635 if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed:
3636 self._mark_watched(*args, **kwargs)
3637
3638 def _mark_watched(self, *args, **kwargs):
3639 raise NotImplementedError('This method must be implemented by subclasses')
3640
3641 def geo_verification_headers(self):
3642 headers = {}
3643 geo_verification_proxy = self.get_param('geo_verification_proxy')
3644 if geo_verification_proxy:
3645 headers['Ytdl-request-proxy'] = geo_verification_proxy
3646 return headers
3647
3648 @staticmethod
3649 def _generic_id(url):
3650 return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3651
3652 def _generic_title(self, url='', webpage='', *, default=None):
3653 return (self._og_search_title(webpage, default=None)
3654 or self._html_extract_title(webpage, default=None)
3655 or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
3656 or default)
3657
3658 def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
3659 if not duration:
3660 return
3661 chapter_list = [{
3662 'start_time': start_function(chapter),
3663 'title': title_function(chapter),
3664 } for chapter in chapter_list or []]
3665 if strict:
3666 warn = self.report_warning
3667 else:
3668 warn = self.write_debug
3669 chapter_list.sort(key=lambda c: c['start_time'] or 0)
3670
3671 chapters = [{'start_time': 0}]
3672 for idx, chapter in enumerate(chapter_list):
3673 if chapter['start_time'] is None:
3674 warn(f'Incomplete chapter {idx}')
3675 elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
3676 chapters.append(chapter)
3677 elif chapter not in chapters:
3678 issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
3679 else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
3680 warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
3681 return chapters[1:]
3682
3683 def _extract_chapters_from_description(self, description, duration):
3684 duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
3685 sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
3686 return self._extract_chapters_helper(
3687 re.findall(sep_re % (duration_re, r'.+?'), description or ''),
3688 start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
3689 duration=duration, strict=False) or self._extract_chapters_helper(
3690 re.findall(sep_re % (r'.+?', duration_re), description or ''),
3691 start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
3692 duration=duration, strict=False)
3693
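# --- Editor's example (illustration, not part of the original file) ---
# The timestamp pattern matched above, applied to a hypothetical video
# description with "<time> <title>" lines as commonly written by uploaders:
import re

description = '0:00 Intro\n1:23 Main topic\n12:45 Outro'
duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
matches = re.findall(r'(?m)^\s*(%s)\b\W*\s(.+?)\s*$' % duration_re, description)
# -> [('0:00', 'Intro'), ('1:23', 'Main topic'), ('12:45', 'Outro')]
# _extract_chapters_helper then converts the times with parse_duration and
# validates that they are increasing and within the video duration.
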
3694 @staticmethod
3695 def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
3696 all_known = all(map(
3697 lambda x: x is not None,
3698 (is_private, needs_premium, needs_subscription, needs_auth, is_unlisted)))
3699 return (
3700 'private' if is_private
3701 else 'premium_only' if needs_premium
3702 else 'subscriber_only' if needs_subscription
3703 else 'needs_auth' if needs_auth
3704 else 'unlisted' if is_unlisted
3705 else 'public' if all_known
3706 else None)
3707
3708 def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
3709 """
3710 @returns A list of values for the extractor argument given by "key",
3711 or "default" if no such key is present
3712 @param default The default value to return when the key is not present (default: [])
3713 @param casesense When false, the values are converted to lower case
3714 """
3715 ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key()
3716 val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key))
3717 if val is None:
3718 return [] if default is NO_DEFAULT else default
3719 return list(val) if casesense else [x.lower() for x in val]
3720
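# --- Editor's example (illustration, not part of the original file) ---
# How extractor arguments reach _configuration_arg: the command line
# --extractor-args "youtube:player_client=android,web" is parsed into the
# params structure below, and the lookup is essentially a nested dict get.
params = {'extractor_args': {'youtube': {'player_client': ['android', 'web']}}}

def configuration_arg(params, ie_key, key, default=None, casesense=False):
    val = params.get('extractor_args', {}).get(ie_key.lower(), {}).get(key)
    if val is None:
        return [] if default is None else default
    return list(val) if casesense else [x.lower() for x in val]

# configuration_arg(params, 'Youtube', 'player_client') -> ['android', 'web']
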
3721 def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'):
3722 if not playlist_id or not video_id:
3723 return not video_id
3724
3725 no_playlist = (smuggled_data or {}).get('force_noplaylist')
3726 if no_playlist is not None:
3727 return not no_playlist
3728
3729 video_id = '' if video_id is True else f' {video_id}'
3730 playlist_id = '' if playlist_id is True else f' {playlist_id}'
3731 if self.get_param('noplaylist'):
3732 self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist')
3733 return False
3734 self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
3735 return True
3736
3737 def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True):
3738 RetryManager.report_retry(
3739 err, _count or int(fatal), _retries,
3740 info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning,
3741 sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor'))
3742
3743 def RetryManager(self, **kwargs):
3744 return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs)
3745
3746 def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs):
3747 display_id = traverse_obj(info_dict, 'display_id', 'id')
3748 self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}')
3749 return self._downloader.get_info_extractor('Generic')._extract_embeds(
3750 smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs)
3751
3752 @classmethod
3753 def extract_from_webpage(cls, ydl, url, webpage):
3754 ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
3755 else ydl.get_info_extractor(cls.ie_key()))
3756 for info in ie._extract_from_webpage(url, webpage) or []:
3757 # url = None since we do not want to set (webpage/original)_url
3758 ydl.add_default_extra_info(info, ie, None)
3759 yield info
3760
3761 @classmethod
3762 def _extract_from_webpage(cls, url, webpage):
3763 for embed_url in orderedSet(
3764 cls._extract_embed_urls(url, webpage) or [], lazy=True):
3765 yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls)
3766
3767 @classmethod
3768 def _extract_embed_urls(cls, url, webpage):
3769 """@returns all the embed urls on the webpage"""
3770 if '_EMBED_URL_RE' not in cls.__dict__:
3771 assert isinstance(cls._EMBED_REGEX, (list, tuple))
3772 for idx, regex in enumerate(cls._EMBED_REGEX):
3773 assert regex.count('(?P<url>') == 1, \
3774 f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
3775 cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
3776
3777 for regex in cls._EMBED_URL_RE:
3778 for mobj in regex.finditer(webpage):
3779 embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
3780 if cls._VALID_URL is False or cls.suitable(embed_url):
3781 yield embed_url
3782
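# --- Editor's example (illustration, not part of the original file) ---
# A standalone sketch of the _EMBED_REGEX contract enforced above: each
# pattern must contain exactly one (?P<url>...) group; matches are HTML-
# unescaped and resolved against the page URL. Regex and HTML are hypothetical.
import re
import urllib.parse
from html import unescape

EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>/embed/\d+)["\']']
webpage = '<iframe src="/embed/42"></iframe>'
page_url = 'https://example.com/watch'

for pattern in map(re.compile, EMBED_REGEX):
    for mobj in pattern.finditer(webpage):
        embed_url = urllib.parse.urljoin(page_url, unescape(mobj.group('url')))
        # -> 'https://example.com/embed/42'
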
3783 class StopExtraction(Exception):
3784 pass
3785
3786 @classmethod
3787 def _extract_url(cls, webpage): # TODO: Remove
3788 """Only for compatibility with some older extractors"""
3789 return next(iter(cls._extract_embed_urls(None, webpage) or []), None)
3790
3791 @classmethod
3792 def __init_subclass__(cls, *, plugin_name=None, **kwargs):
3793 if plugin_name:
3794 mro = inspect.getmro(cls)
3795 super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
3796 cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
3797 cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
3798 while getattr(super_class, '__wrapped__', None):
3799 super_class = super_class.__wrapped__
3800 setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
3801 _PLUGIN_OVERRIDES[super_class].append(cls)
3802
3803 return super().__init_subclass__(**kwargs)
3804
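# --- Editor's example (illustration, not part of the original file) ---
# The plugin mechanism above in miniature: a subclass created with
# plugin_name= records its parent in __wrapped__, extends IE_NAME and
# replaces the parent in its module namespace. Plain classes, hypothetical
# names; the real implementation also walks nested __wrapped__ chains.
import sys

class Base:
    IE_NAME = 'base'

    @classmethod
    def __init_subclass__(cls, *, plugin_name=None, **kwargs):
        if plugin_name:
            cls.__wrapped__ = cls.__mro__[1]
            cls.IE_NAME = f'{cls.__wrapped__.IE_NAME}+{plugin_name}'
            setattr(sys.modules[cls.__wrapped__.__module__],
                    cls.__wrapped__.__name__, cls)
        super().__init_subclass__(**kwargs)

class ExampleIE(Base):
    IE_NAME = 'example'

class ExamplePluginIE(ExampleIE, plugin_name='myplugin'):
    pass

# ExamplePluginIE.IE_NAME -> 'example+myplugin'; the module-level name
# "ExampleIE" now resolves to ExamplePluginIE.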
3805
3806 class SearchInfoExtractor(InfoExtractor):
3807 """
3808 Base class for paged search query extractors.
3809 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3810 Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
3811 """
3812
3813 _MAX_RESULTS = float('inf')
3814 _RETURN_TYPE = 'playlist'
3815
3816 @classproperty
3817 def _VALID_URL(cls):
3818 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3819
3820 def _real_extract(self, query):
3821 prefix, query = self._match_valid_url(query).group('prefix', 'query')
3822 if prefix == '':
3823 return self._get_n_results(query, 1)
3824 elif prefix == 'all':
3825 return self._get_n_results(query, self._MAX_RESULTS)
3826 else:
3827 n = int(prefix)
3828 if n <= 0:
3829 raise ExtractorError(f'invalid download number {n} for query "{query}"')
3830 elif n > self._MAX_RESULTS:
3831 self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3832 n = self._MAX_RESULTS
3833 return self._get_n_results(query, n)
3834
3835 def _get_n_results(self, query, n):
3836 """Get a specified number of results for a query.
3837 Either this function or _search_results must be overridden by subclasses """
3838 return self.playlist_result(
3839 itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
3840 query, query)
3841
3842 def _search_results(self, query):
3843 """Returns an iterator of search results"""
3844 raise NotImplementedError('This method must be implemented by subclasses')
3845
3846 @classproperty
3847 def SEARCH_KEY(cls):
3848 return cls._SEARCH_KEY
3849
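# --- Editor's example (illustration, not part of the original file) ---
# How the _VALID_URL above splits a search "URL". The key is hypothetical;
# prefixes '', 'all' and a positive integer select 1, _MAX_RESULTS or n
# results respectively, as implemented in _real_extract.
import re

SEARCH_KEY = 'examplesearch'
valid_url = rf'{SEARCH_KEY}(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'

mobj = re.match(valid_url, 'examplesearch5:funny cats')
prefix, query = mobj.group('prefix', 'query')
# prefix -> '5', query -> 'funny cats'
n = 1 if prefix == '' else float('inf') if prefix == 'all' else int(prefix)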
3850
3851 class UnsupportedURLIE(InfoExtractor):
3852 _VALID_URL = '.*'
3853 _ENABLED = False
3854 IE_DESC = False
3855
3856 def _real_extract(self, url):
3857 raise UnsupportedError(url)
3858
3859
3860 _PLUGIN_OVERRIDES = collections.defaultdict(list)