X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/84ffeb7d5e72e3829319ba7720a8480fc4c7503b..61edf57f8f13f6dfd81154174e647eb5fdd26089:/yt_dlp/extractor/common.py diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 78288f809..f63bd7825 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1,5 +1,6 @@ import base64 import collections +import functools import getpass import hashlib import http.client @@ -13,6 +14,7 @@ import os import random import re +import subprocess import sys import time import types @@ -20,10 +22,22 @@ import urllib.request import xml.etree.ElementTree -from ..compat import functools # isort: split -from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..compat import ( + compat_etree_fromstring, + compat_expanduser, + compat_os_name, + urllib_req_to_req, +) from ..cookies import LenientSimpleCookie from ..downloader.f4m import get_base_url, remove_encrypted_media +from ..downloader.hls import HlsFD +from ..networking import HEADRequest, Request +from ..networking.exceptions import ( + HTTPError, + IncompleteRead, + network_exceptions, +) +from ..networking.impersonate import ImpersonateTarget from ..utils import ( IDENTITY, JSON_LD_RE, @@ -32,8 +46,8 @@ FormatSorter, GeoRestrictedError, GeoUtils, - HEADRequest, LenientJSONDecoder, + Popen, RegexNotFoundError, RetryManager, UnsupportedError, @@ -46,7 +60,6 @@ determine_ext, dict_get, encode_data_uri, - error_to_compat_str, extract_attributes, filter_dict, fix_xml_ampersands, @@ -56,7 +69,7 @@ join_nonempty, js_to_json, mimetype2ext, - network_exceptions, + netrc_from_content, orderedSet, parse_bitrate, parse_codecs, @@ -66,7 +79,6 @@ parse_resolution, sanitize_filename, sanitize_url, - sanitized_Request, smuggle_url, str_or_none, str_to_int, @@ -78,8 +90,6 @@ unescapeHTML, unified_strdate, unified_timestamp, - update_Request, - update_url_query, url_basename, url_or_none, urlhandle_detect_ext, @@ -160,12 +170,12 @@ class InfoExtractor: Automatically calculated from width and height * dynamic_range The dynamic range of the video. One of: "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV" - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s + * tbr Average bitrate of audio and video in kbps (1000 bits/sec) + * abr Average audio bitrate in kbps (1000 bits/sec) * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * audio_channels Number of audio channels - * vbr Average video bitrate in KBit/s + * vbr Average video bitrate in kbps (1000 bits/sec) * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format @@ -220,10 +230,18 @@ class InfoExtractor: width : height ratio as float. * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. - * has_drm The format has DRM and cannot be downloaded. Boolean + * has_drm True if the format has DRM and cannot be downloaded. + 'maybe' if the format may have DRM and has to be tested before download. * extra_param_to_segment_url A query string to append to each fragment's URL, or to update each existing query string - with. Only applied by the native HLS/DASH downloaders. + with. If it is an HLS stream with an AES-128 decryption key, + the query paramaters will be passed to the key URI as well, + unless there is an `extra_param_to_key_url` given, + or unless an external key URI is provided via `hls_aes`. + Only applied by the native HLS/DASH downloaders. + * extra_param_to_key_url A query string to append to the URL + of the format's HLS AES-128 decryption key. + Only applied by the native HLS downloader. * hls_aes A dictionary of HLS AES-128 decryption information used by the native HLS downloader to override the values in the media playlist when an '#EXT-X-KEY' tag @@ -235,7 +253,10 @@ class InfoExtractor: * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads - * ffmpeg_args Extra arguments for ffmpeg downloader + * ffmpeg_args Extra arguments for ffmpeg downloader (input) + * ffmpeg_args_out Extra arguments for ffmpeg downloader (output) + * is_dash_periods Whether the format is a result of merging + multiple DASH periods. RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -249,7 +270,7 @@ class InfoExtractor: direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. - display_id An alternative identifier for the video, not necessarily + display_id: An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" @@ -267,7 +288,7 @@ class InfoExtractor: description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. - creator: The creator of the video. + creators: List of creators of the video. timestamp: UNIX timestamp of the moment the video was uploaded upload_date: Video upload date in UTC (YYYYMMDD). If not explicitly set, calculated from timestamp @@ -275,6 +296,9 @@ class InfoExtractor: If it is not clear whether to use timestamp or this, use the former release_date: The date (YYYYMMDD) when the video was released in UTC. If not explicitly set, calculated from release_timestamp + release_year: Year (YYYY) as integer when the video or album was released. + To be used if no exact release date is known. + If not explicitly set, calculated from release_date. modified_timestamp: UNIX timestamp of the moment the video was last modified. modified_date: The date (YYYYMMDD) when the video was last modified in UTC. If not explicitly set, calculated from modified_timestamp @@ -286,6 +310,7 @@ class InfoExtractor: channel_id: Id of the channel. channel_url: Full URL to a channel webpage. channel_follower_count: Number of followers of the channel. + channel_is_verified: Whether the channel is verified on the platform. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -314,6 +339,11 @@ class InfoExtractor: * "author" - human-readable name of the comment author * "author_id" - user ID of the comment author * "author_thumbnail" - The thumbnail of the comment author + * "author_url" - The url to the comment author's page + * "author_is_verified" - Whether the author is verified + on the platform + * "author_is_uploader" - Whether the comment is made by + the video uploader * "id" - Comment ID * "html" - Comment as HTML * "text" - Plain text of the comment @@ -325,8 +355,8 @@ class InfoExtractor: * "dislike_count" - Number of negative ratings of the comment * "is_favorited" - Whether the comment is marked as favorite by the video uploader - * "author_is_uploader" - Whether the comment is made by - the video uploader + * "is_pinned" - Whether the comment is pinned to + the top of the comments age_limit: Age restriction for the video, as an integer (years) webpage_url: The URL to the video webpage, if given to yt-dlp it should allow to get the same result again. (It will be set @@ -350,6 +380,10 @@ class InfoExtractor: * "start_time" - The start time of the chapter in seconds * "end_time" - The end time of the chapter in seconds * "title" (optional, string) + heatmap: A list of dictionaries, with the following entries: + * "start_time" - The start time of the data point in seconds + * "end_time" - The end time of the data point in seconds + * "value" - The normalized value of the data point (float between 0 and 1) playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string @@ -358,6 +392,7 @@ class InfoExtractor: 'private', 'premium_only', 'subscriber_only', 'needs_auth', 'unlisted' or 'public'. Use 'InfoExtractor._availability' to set it + media_type: The type of media as classified by the site, e.g. "episode", "clip", "trailer" _old_archive_ids: A list of old archive ids needed for backward compatibility _format_sort_fields: A list of fields to use for sorting formats __post_extractor: A function to be called just before the metadata is @@ -397,17 +432,16 @@ class InfoExtractor: track_number: Number of the track within an album or a disc, as an integer. track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), as a unicode string. - artist: Artist(s) of the track. - genre: Genre(s) of the track. + artists: List of artists of the track. + composers: List of composers of the piece. + genres: List of genres of the track. album: Title of the album the track belongs to. album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). - album_artist: List of all artists appeared on the album (e.g. - "Ash Borer / Fell Voices" or "Various Artists", useful for splits - and compilations). + album_artists: List of all artists appeared on the album. + E.g. ["Ash Borer", "Fell Voices"] or ["Various Artists"]. + Useful for splits and compilations. disc_number: Number of the disc or other physical medium the track belongs to, as an integer. - release_year: Year (YYYY) when the album was released. - composer: Composer of the piece The following fields should only be set for clips that should be cut from the original video: @@ -418,6 +452,18 @@ class InfoExtractor: rows: Number of rows in each storyboard fragment, as an integer columns: Number of columns in each storyboard fragment, as an integer + The following fields are deprecated and should not be set by new code: + composer: Use "composers" instead. + Composer(s) of the piece, comma-separated. + artist: Use "artists" instead. + Artist(s) of the track, comma-separated. + genre: Use "genres" instead. + Genre(s) of the track, comma-separated. + album_artist: Use "album_artists" instead. + All artists appeared on the album, comma-separated. + creator: Use "creators" instead. + The creator of the video. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -461,8 +507,8 @@ class InfoExtractor: Subclasses of this should also be added to the list of extractors and - should define a _VALID_URL regexp and, re-define the _real_extract() and - (optionally) _real_initialize() methods. + should define _VALID_URL as a regexp or a Sequence of regexps, and + re-define the _real_extract() and (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs @@ -525,7 +571,7 @@ class InfoExtractor: _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): - password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' return { None: '', 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', @@ -552,8 +598,8 @@ def _match_valid_url(cls, url): # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None) @classmethod def suitable(cls, url): @@ -708,11 +754,11 @@ def extract(self, url): except UnsupportedError: raise except ExtractorError as e: - e.video_id = e.video_id or self.get_temp_id(url), - e.ie = e.ie or self.IE_NAME, + e.video_id = e.video_id or self.get_temp_id(url) + e.ie = e.ie or self.IE_NAME e.traceback = e.traceback or sys.exc_info()[2] raise - except http.client.IncompleteRead as e: + except IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -727,8 +773,8 @@ def __maybe_fake_ip_and_retry(self, countries): self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._x_forwarded_for_ip: self.report_warning( - 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' - % (self._x_forwarded_for_ip, country_code.upper())) + 'Video is geo restricted. Retrying extraction with fake IP ' + f'{self._x_forwarded_for_ip} ({country_code.upper()}) as X-Forwarded-For.') return True return False @@ -771,22 +817,28 @@ def IE_NAME(cls): @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, urllib.error.HTTPError) + assert isinstance(err, HTTPError) if expected_status is None: return False elif callable(expected_status): - return expected_status(err.code) is True + return expected_status(err.status) is True else: - return err.code in variadic(expected_status) + return err.status in variadic(expected_status) - def _create_request(self, url_or_request, data=None, headers=None, query=None): + def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None): if isinstance(url_or_request, urllib.request.Request): - return update_Request(url_or_request, data=data, headers=headers, query=query) - if query: - url_or_request = update_url_query(url_or_request, query) - return sanitized_Request(url_or_request, data, headers or {}) + self._downloader.deprecation_warning( + 'Passing a urllib.request.Request to _create_request() is deprecated. ' + 'Use yt_dlp.networking.common.Request instead.') + url_or_request = urllib_req_to_req(url_or_request) + elif not isinstance(url_or_request, Request): + url_or_request = Request(url_or_request) + + url_or_request.update(data=data, headers=headers, query=query, extensions=extensions) + return url_or_request - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, + headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False): """ Return the response handle. @@ -795,7 +847,7 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa if not self._downloader._first_webpage_request: sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: - self.to_screen('Sleeping %s seconds ...' % sleep_interval) + self.to_screen(f'Sleeping {sleep_interval} seconds ...') time.sleep(sleep_interval) else: self._downloader._first_webpage_request = False @@ -817,24 +869,42 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa headers = (headers or {}).copy() headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip) + extensions = {} + + if impersonate in (True, ''): + impersonate = ImpersonateTarget() + requested_targets = [ + t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t) + for t in variadic(impersonate) + ] if impersonate else [] + + available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None) + if available_target: + extensions['impersonate'] = available_target + elif requested_targets: + message = 'The extractor is attempting impersonation, but ' + message += ( + 'no impersonate target is available' if not str(impersonate) + else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"') + info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation ' + 'for information on installing the required dependencies') + if require_impersonation: + raise ExtractorError(f'{message}; {info_msg}', expected=True) + self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True) + try: - return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) + return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions)) except network_exceptions as err: - if isinstance(err, urllib.error.HTTPError): + if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): - # Retain reference to error to prevent file object from - # being closed before it can be read. Works around the - # effects of - # introduced in Python 3.4.1. - err.fp._error = err - return err.fp + return err.response if errnote is False: return False if errnote is None: errnote = 'Unable to download webpage' - errmsg = f'{errnote}: {error_to_compat_str(err)}' + errmsg = f'{errnote}: {err}' if fatal: raise ExtractorError(errmsg, cause=err) else: @@ -842,13 +912,14 @@ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fa return False def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, - encoding=None, data=None, headers={}, query={}, expected_status=None): + encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): """ Return a tuple (page content as string, URL handle). Arguments: url_or_request -- plain text URL as a string or - a urllib.request.Request object + a yt_dlp.networking.Request object video_id -- Video/playlist/item identifier (string) Keyword arguments: @@ -873,17 +944,27 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote= returning True if it should be accepted Note that this argument does not affect success status codes (2xx) which are always accepted. + impersonate -- the impersonate target. Can be any of the following entities: + - an instance of yt_dlp.networking.impersonate.ImpersonateTarget + - a string in the format of CLIENT[:OS] + - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances + - a boolean value; True means any impersonate target is sufficient + require_impersonation -- flag to toggle whether the request should raise an error + if impersonation is not possible (bool, default: False) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, str): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, + headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data) return (content, urlh) @staticmethod @@ -912,7 +993,7 @@ def __check_blocked(self, content): r'