write_string,
)
+# These bloat the lazy_extractors, so allow them to pass through silently
+ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
+
class LazyLoadMetaClass(type):
def __getattr__(cls, name):
- # "_TESTS" bloat the lazy_extractors
- if '_real_class' not in cls.__dict__ and name != 'get_testcases':
+ if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
write_string(
'WARNING: Falling back to normal extractor since lazy extractor '
f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
from inspect import getsource
NO_ATTR = object()
-STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit']
+STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
CLASS_METHODS = [
'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
]
}.get(base.__name__, base.__name__) for base in ie.__bases__)
s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases)
- valid_url = getattr(ie, '_VALID_URL', None)
- if not valid_url and hasattr(ie, '_make_valid_url'):
- valid_url = ie._make_valid_url()
- if valid_url:
- s += f' _VALID_URL = {valid_url!r}\n'
return s + '\n'.join(extra_ie_code(ie, attr_base))
result_type = ie_result.get('_type', 'video')
if result_type in ('url', 'url_transparent'):
- ie_result['url'] = sanitize_url(ie_result['url'])
+ ie_result['url'] = sanitize_url(
+ ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')
if ie_result.get('original_url'):
extra_info.setdefault('original_url', ie_result['original_url'])
@staticmethod
def _extract_url(ie, webpage):
- urls = BrightcoveNewIE._extract_urls(ie, webpage)
+ urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
return urls[0] if urls else None
@staticmethod
- def _extract_urls(ie, webpage):
+ def _extract_brightcove_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
# 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
import re
import sys
import time
+import types
import urllib.parse
import urllib.request
import xml.etree.ElementTree
from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
+ IDENTITY,
JSON_LD_RE,
NO_DEFAULT,
ExtractorError,
parse_m3u8_attributes,
parse_resolution,
sanitize_filename,
+ sanitize_url,
sanitized_Request,
str_or_none,
str_to_int,
title, description etc.
- Subclasses of this should define a _VALID_URL regexp and, re-define the
- _real_extract() and (optionally) _real_initialize() methods.
- Probably, they should also be added to the list of extractors.
+    Subclasses of this should also be added to the list of extractors and
+    should define a _VALID_URL regexp and re-define the _real_extract() and
+    (optionally) _real_initialize() methods.
Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs
(except other extractors), so that lazy_extractors works correctly.
+ Subclasses can define a list of _EMBED_REGEX, which will be searched for in
+ the HTML of Generic webpages. It may also override _extract_embed_urls
+ or _extract_from_webpage as necessary. While these are normally classmethods,
+ _extract_from_webpage is allowed to be an instance method.
+
+ _extract_from_webpage may raise self.StopExtraction() to stop further
+ processing of the webpage and obtain exclusive rights to it. This is useful
+ when the extractor cannot reliably be matched using just the URL.
+ Eg: invidious/peertube instances
+
+ Embed-only extractors can be defined by setting _VALID_URL = False.
+
To support username + password (or netrc) login, the extractor must define a
_NETRC_MACHINE and re-define _perform_login(username, password) and
(optionally) _initialize_pre_login() methods. The _perform_login method will
_NETRC_MACHINE = None
IE_DESC = None
SEARCH_KEY = None
+ _VALID_URL = None
+ _EMBED_REGEX = []
def _login_hint(self, method=NO_DEFAULT, netrc=None):
password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
@classmethod
def _match_valid_url(cls, url):
+ if cls._VALID_URL is False:
+ return None
# This does not use has/getattr intentionally - we want to know whether
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
- if '_VALID_URL' not in cls.__dict__:
- cls._VALID_URL = cls._make_valid_url()
cls._VALID_URL_RE = re.compile(cls._VALID_URL)
return cls._VALID_URL_RE.match(url)
'url': url,
}
- def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs):
- urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {}))
- for m in orderedSet(map(getter, matches) if getter else matches))
- return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
+ @classmethod
+ def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None,
+ getter=IDENTITY, ie=None, video_kwargs=None, **kwargs):
+ return cls.playlist_result(
+ (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)),
+ playlist_id, playlist_title, **kwargs)
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
- def _rta_search(self, html):
+ @staticmethod
+ def _rta_search(html):
# See http://www.rtalabel.org/index.php?content=howtofaq#single
if re.search(r'(?ix)<meta\s+name="rating"\s+'
r' content="RTA-5042-1996-1400-1577-RTA"',
html):
return 18
+
+ # And then there are the jokers who advertise that they use RTA, but actually don't.
+ AGE_LIMIT_MARKERS = [
+ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+ ]
+ if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
+ return 18
return 0
def _media_rating_search(self, html):
else 'https:')
def _proto_relative_url(self, url, scheme=None):
- if url is None:
- return url
- if url.startswith('//'):
- if scheme is None:
- scheme = self.http_scheme()
- return scheme + url
- else:
- return url
+ scheme = scheme or self.http_scheme()
+ assert scheme.endswith(':')
+ return sanitize_url(url, scheme=scheme[:-1])
def _sleep(self, timeout, video_id, msg_template=None):
if msg_template is None:
headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers
- def _generic_id(self, url):
+ @staticmethod
+ def _generic_id(url):
return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
- def _generic_title(self, url):
+ @staticmethod
+ def _generic_title(url):
return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
@staticmethod
self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')
return True
+ @classmethod
+ def extract_from_webpage(cls, ydl, url, webpage):
+ ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType)
+ else ydl.get_info_extractor(cls.ie_key()))
+ yield from ie._extract_from_webpage(url, webpage) or []
+
+ @classmethod
+ def _extract_from_webpage(cls, url, webpage):
+ for embed_url in orderedSet(
+ cls._extract_embed_urls(url, webpage) or [], lazy=True):
+ yield cls.url_result(embed_url, cls)
+
+ @classmethod
+ def _extract_embed_urls(cls, url, webpage):
+ """@returns all the embed urls on the webpage"""
+ if '_EMBED_URL_RE' not in cls.__dict__:
+ assert isinstance(cls._EMBED_REGEX, (list, tuple))
+ for idx, regex in enumerate(cls._EMBED_REGEX):
+ assert regex.count('(?P<url>') == 1, \
+ f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}'
+ cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX))
+
+ for regex in cls._EMBED_URL_RE:
+ for mobj in regex.finditer(webpage):
+ embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url')))
+ if cls._VALID_URL is False or cls.suitable(embed_url):
+ yield embed_url
+
+ class StopExtraction(Exception):
+ pass
+
class SearchInfoExtractor(InfoExtractor):
"""
_MAX_RESULTS = float('inf')
- @classmethod
- def _make_valid_url(cls):
+ @classproperty
+ def _VALID_URL(cls):
return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def _real_extract(self, query):
import urllib.parse
import xml.etree.ElementTree
+from . import gen_extractor_classes
+from .common import InfoExtractor # isort: split
from .ant1newsgr import Ant1NewsGrEmbedIE
from .anvato import AnvatoIE
from .apa import APAIE
from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE
from .channel9 import Channel9IE
from .cloudflarestream import CloudflareStreamIE
-from .common import InfoExtractor
from .commonprotocols import RtmpIE
from .condenast import CondeNastIE
from .dailymail import DailyMailIE
determine_ext,
dict_get,
float_or_none,
+ format_field,
int_or_none,
is_html,
js_to_json,
"""Report information extraction."""
self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
- def report_detected(self, name):
- self._downloader.write_debug(f'Identified a {name}')
+ def report_detected(self, name, num=1, note=None):
+ if num > 1:
+ name += 's'
+ elif not num:
+ return
+ else:
+ num = 'a'
+
+ self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
def _extract_rss(self, url, video_id, doc):
NS_MAP = {
if not self.get_param('test', False) and not is_intentional:
force = self.get_param('force_generic_extractor', False)
- self.report_warning(
- '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
+ self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))
first_bytes = full_response.read(512)
self.report_detected('Camtasia video')
return camtasia_res
+ info_dict.update({
+ # it's tempting to parse this further, but you would
+ # have to take into account all the variations like
+ # Video Title - Site Name
+ # Site Name | Video Title
+ # Video Title - Tagline | Site Name
+ # and so on and so forth; it's just not practical
+ 'title': (self._og_search_title(webpage, default=None)
+ or self._html_extract_title(webpage, 'video title', default='video')),
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'age_limit': self._rta_search(webpage),
+ })
+
+ domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way
r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
lambda x: unescapeHTML(x.group(0)), webpage)
- # it's tempting to parse this further, but you would
- # have to take into account all the variations like
- # Video Title - Site Name
- # Site Name | Video Title
- # Video Title - Tagline | Site Name
- # and so on and so forth; it's just not practical
- video_title = (self._og_search_title(webpage, default=None)
- or self._html_extract_title(webpage, 'video title', default='video'))
-
- # Try to detect age limit automatically
- age_limit = self._rta_search(webpage)
- # And then there are the jokers who advertise that they use RTA,
- # but actually don't.
- AGE_LIMIT_MARKERS = [
- r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
- ]
- if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
- age_limit = 18
-
- # video uploader is domain name
- video_uploader = self._search_regex(
- r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
-
- video_description = self._og_search_description(webpage, default=None)
- video_thumbnail = self._og_search_thumbnail(webpage, default=None)
-
- info_dict.update({
- 'title': video_title,
- 'description': video_description,
- 'thumbnail': video_thumbnail,
- 'age_limit': age_limit,
- })
+ # TODO: Remove
+ video_title, video_description, video_thumbnail, age_limit, video_uploader = \
+ info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name
- self._downloader.write_debug('Looking for video embeds')
+ # TODO: Move Embeds
+ self._downloader.write_debug('Looking for single embeds')
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
}
# Look for Brightcove New Studio embeds
- bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+ bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
if bc_urls:
return self.playlist_from_matches(
bc_urls, video_id, video_title,
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
# Look for embedded Spotify player
- spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage)
+ spotify_urls = SpotifyBaseIE._extract_urls(webpage)
if spotify_urls:
return self.playlist_from_matches(spotify_urls, video_id, video_title)
tiktok_urls = TikTokIE._extract_urls(webpage)
if tiktok_urls:
return self.playlist_from_matches(tiktok_urls, video_id, video_title)
+ # TODO: END: Move Embeds
+
+ self._downloader.write_debug('Looking for embeds')
+ embeds = []
+ for ie in gen_extractor_classes():
+ gen = ie.extract_from_webpage(self._downloader, url, webpage)
+ current_embeds = []
+ try:
+ while True:
+ current_embeds.append(next(gen))
+ except self.StopExtraction:
+ self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
+ embeds and 'discarding other embeds')
+ embeds = current_embeds
+ break
+ except StopIteration:
+ self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
+ embeds.extend(current_embeds)
+
+ del current_embeds
+ if len(embeds) == 1:
+ return {**info_dict, **embeds[0]}
+ elif embeds:
+ return self.playlist_result(embeds, **info_dict)
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
entries.append(self.url_result(video_url, 'Youtube'))
continue
- # here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
headers = {
'referer': full_response.geturl()
}
@classmethod
- def _extract_embed_urls(cls, webpage):
+ def _extract_urls(cls, webpage):
return re.findall(
r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',
webpage)
return os.path.join(*sanitized_path)
-def sanitize_url(url):
+def sanitize_url(url, *, scheme='http'):
# Prepend protocol-less URLs with `http:` scheme in order to mitigate
# the number of unwanted failures due to missing protocol
if url is None:
return
elif url.startswith('//'):
- return 'http:%s' % url
+ return f'{scheme}:{url}'
# Fix some common typos seen so far
COMMON_TYPOS = (
# https://github.com/ytdl-org/youtube-dl/issues/15649