import os
import re
+import types
import urllib.parse
import xml.etree.ElementTree
unified_timestamp,
unsmuggle_url,
url_or_none,
+ variadic,
xpath_attr,
xpath_text,
xpath_with_ns,
'skip_download': True,
}
},
- {
- # JWPlatform iframe
- 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved',
- 'info_dict': {
- 'id': 'AG26UQXM',
- 'ext': 'mp4',
- 'upload_date': '20160719',
- 'timestamp': 1468923808,
- 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4',
- },
- 'add_ie': ['JWPlatform'],
- },
{
# Video.js embed, multiple formats
'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
},
'playlist_count': 6,
},
- {
- # Squarespace video embed, 2019-08-28
- 'url': 'http://ootboxford.com',
- 'info_dict': {
- 'id': 'Tc7b_JGdZfw',
- 'title': 'Out of the Blue, at Childish Things 10',
- 'ext': 'mp4',
- 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f',
- 'uploader_id': 'helendouglashouse',
- 'uploader': 'Helen & Douglas House',
- 'upload_date': '20140328',
- },
- 'params': {
- 'skip_download': True,
- },
- },
# {
# # Zype embed
# 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
'upload_date': '20210111',
}
},
- {
- 'note': 'Rumble embed',
- 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html',
- 'md5': '53af34098a7f92c4e51cf0bd1c33f009',
- 'info_dict': {
- 'id': 'vb0ofn',
- 'ext': 'mp4',
- 'timestamp': 1612662578,
- 'uploader': 'LovingMontana',
- 'channel': 'LovingMontana',
- 'upload_date': '20210207',
- 'title': 'Winter-loving dog helps girls dig a snow fort ',
- 'channel_url': 'https://rumble.com/c/c-546523',
- 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg',
- 'duration': 103,
- }
- },
- {
- 'note': 'Rumble JS embed',
- 'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it',
- 'md5': '4701209ac99095592e73dbba21889690',
- 'info_dict': {
- 'id': 'v15eqxl',
- 'ext': 'mp4',
- 'channel': 'Mr Producer Media',
- 'duration': 92,
- 'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh',
- 'channel_url': 'https://rumble.com/c/RichSementa',
- 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg',
- 'timestamp': 1654892716,
- 'uploader': 'Mr Producer Media',
- 'upload_date': '20220610',
- }
- },
{
'note': 'JSON LD with multiple @type',
'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
'duration': 111.0,
}
},
+ {
+ 'note': 'JSON LD with unexpected data type',
+ 'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/',
+ 'info_dict': {
+ 'id': 'porsche-911-gt3-rs-rij-impressie-2',
+ 'ext': 'mp4',
+ 'title': 'Test: Porsche 911 GT3 RS',
+ 'description': 'Je ziet het niet, maar het is er wel. Downforce, hebben we het dan over. En in de nieuwe Porsche 911 GT3 RS is er zelfs heel veel downforce.',
+ 'timestamp': 1664920902,
+ 'upload_date': '20221004',
+ 'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$',
+ 'age_limit': 0,
+ 'direct': True,
+ }
+ }
]
def report_following_redirect(self, new_url):
default_search += ':'
return self.url_result(default_search + url)
+ original_url = url
url, smuggled_data = unsmuggle_url(url, {})
force_videoid = None
is_intentional = smuggled_data.get('to_generic')
**smuggled_data.get('http_headers', {})
})
new_url = full_response.geturl()
- if url != new_url:
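+ # A plain http -> https upgrade is not treated as a redirect; just adopt the new URL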
+ if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl():
+ url = new_url
+ elif url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
'vcodec': 'none' if m.group('type') == 'audio' else None
}]
info_dict['direct'] = True
- self._sort_formats(formats)
info_dict.update({
'formats': formats,
'subtitles': subtitles,
if first_bytes.startswith(b'#EXTM3U'):
self.report_detected('M3U playlist')
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
- self._sort_formats(info_dict['formats'])
return info_dict
# Maybe it's a direct link to a video?
elif doc.tag == 'SmoothStreamingMedia':
info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
self.report_detected('ISM manifest')
- self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
smil = self._parse_smil(doc, url, video_id)
self.report_detected('SMIL file')
- self._sort_formats(smil['formats'])
return smil
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
self.report_detected('XSPF playlist')
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)
self.report_detected('DASH manifest')
- self._sort_formats(info_dict['formats'])
return info_dict
elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
self.report_detected('F4M manifest')
- self._sort_formats(info_dict['formats'])
return info_dict
except xml.etree.ElementTree.ParseError:
pass
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- 'title': (self._og_search_title(webpage, default=None)
- or self._html_extract_title(webpage, 'video title', default='video')),
+ 'title': self._generic_title('', webpage, default='video'),
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'age_limit': self._rta_search(webpage),
})
- domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)
+ self._downloader.write_debug('Looking for embeds')
+ embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
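+ # A single embed is merged into the page-level info_dict; several embeds become a playlist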
+ if len(embeds) == 1:
+ return {**info_dict, **embeds[0]}
+ elif embeds:
+ return self.playlist_result(embeds, **info_dict)
+ raise UnsupportedError(url)
+
+ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
+ """Returns an iterator of video entries"""
+ info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation
+ video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
+ url, smuggled_data = unsmuggle_url(url, {})
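+ # Prefer the final (post-redirect) URL when a response handle was passed in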
+ actual_url = urlh.geturl() if urlh else url
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
# There probably should be a second run of generic extractor on unescaped webpage.
# webpage = urllib.parse.unquote(webpage)
- # Unescape squarespace embeds to be detected by generic extractor,
- # see https://github.com/ytdl-org/youtube-dl/issues/21294
- webpage = re.sub(
- r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
- lambda x: unescapeHTML(x.group(0)), webpage)
-
# TODO: Move to respective extractors
- self._downloader.write_debug('Looking for Brightcove embeds')
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
- entries = [{
- '_type': 'url',
- 'url': smuggle_url(bc_url, {'Referer': url}),
- 'ie_key': 'BrightcoveLegacy'
- } for bc_url in bc_urls]
-
- return {
- '_type': 'playlist',
- 'title': info_dict['title'],
- 'id': video_id,
- 'entries': entries,
- }
+ return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveLegacyIE)
+ for bc_url in bc_urls]
bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)
if bc_urls:
- return self.playlist_from_matches(
- bc_urls, video_id, info_dict['title'],
- getter=lambda x: smuggle_url(x, {'referrer': url}),
- ie='BrightcoveNew')
+ return [self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveNewIE)
+ for bc_url in bc_urls]
- self._downloader.write_debug('Looking for embeds')
embeds = []
for ie in self._downloader._ies.values():
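+ # Skip extractors that the caller blocked via smuggled data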
+ if ie.ie_key() in smuggled_data.get('block_ies', []):
+ continue
gen = ie.extract_from_webpage(self._downloader, url, webpage)
current_embeds = []
try:
except self.StopExtraction:
self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
embeds and 'discarding other embeds')
- embeds = current_embeds
- break
+ return current_embeds
except StopIteration:
self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
embeds.extend(current_embeds)
- del current_embeds
- if len(embeds) == 1:
- return {**info_dict, **embeds[0]}
- elif embeds:
- return self.playlist_result(embeds, **info_dict)
+ if embeds:
+ return embeds
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
if isinstance(jwplayer_data.get('playlist'), str):
self.report_detected('JW Player playlist')
- return {
- **info_dict,
- '_type': 'url',
- 'ie_key': 'JWPlatform',
- 'url': jwplayer_data['playlist'],
- }
+ return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')]
try:
info = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False, base_url=url)
if traverse_obj(info, 'formats', ('entries', ..., 'formats')):
self.report_detected('JW Player data')
- return merge_dicts(info, info_dict)
+ return [info]
except ExtractorError:
# See https://github.com/ytdl-org/youtube-dl/pull/16735
pass
webpage)
if mobj is not None:
varname = mobj.group(1)
- sources = self._parse_json(
- mobj.group(2), video_id, transform_source=js_to_json,
- fatal=False) or []
- if not isinstance(sources, list):
- sources = [sources]
+ sources = variadic(self._parse_json(
+ mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
formats = []
subtitles = {}
for source in sources:
src_type = src_type.lower()
ext = determine_ext(src).lower()
if src_type == 'video/youtube':
- return self.url_result(src, YoutubeIE.ie_key())
+ return [self.url_result(src, YoutubeIE.ie_key())]
if src_type == 'application/dash+xml' or ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(
src, video_id, mpd_id='dash', fatal=False)
'ext': (mimetype2ext(src_type)
or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
'http_headers': {
- 'Referer': full_response.geturl(),
+ 'Referer': actual_url,
},
})
# https://docs.videojs.com/player#addRemoteTextTrack
'url': urllib.parse.urljoin(url, src),
'name': sub.get('label'),
'http_headers': {
- 'Referer': full_response.geturl(),
+ 'Referer': actual_url,
},
})
if formats or subtitles:
self.report_detected('video.js embed')
- self._sort_formats(formats)
- info_dict['formats'] = formats
- info_dict['subtitles'] = subtitles
- return info_dict
+ return [{'formats': formats, 'subtitles': subtitles}]
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(webpage, video_id, default={})
if json_ld.get('url') not in (url, None):
self.report_detected('JSON LD')
- return merge_dicts({
+ return [merge_dicts({
'_type': 'video' if json_ld.get('ext') else 'url_transparent',
'url': smuggle_url(json_ld['url'], {
'force_videoid': video_id,
'to_generic': True,
'http_headers': {'Referer': url},
}),
- }, json_ld, info_dict)
+ }, json_ld)]
def check_video(vurl):
if YoutubeIE.suitable(vurl):
if not formats[-1].get('height'):
formats[-1]['quality'] = 1
- self._sort_formats(formats)
-
- return {
+ return [{
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
- }
+ }]
if not found:
# Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
webpage)
if not found:
# Look also in Refresh HTTP header
- refresh_header = full_response.headers.get('Refresh')
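+ # urlh is optional for _extract_embeds, so guard the header lookup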
+ refresh_header = urlh and urlh.headers.get('Refresh')
if refresh_header:
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
if new_url != url:
self.report_following_redirect(new_url)
- return {
- '_type': 'url',
- 'url': new_url,
- }
+ return [self.url_result(new_url)]
else:
found = None
embed_url = self._html_search_meta('twitter:player', webpage, default=None)
if embed_url and embed_url != url:
self.report_detected('twitter:player iframe')
- return self.url_result(embed_url)
+ return [self.url_result(embed_url)]
if not found:
- raise UnsupportedError(url)
+ return []
+
+ domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)
entries = []
for video_url in orderedSet(found):
video_id = os.path.splitext(video_id)[0]
headers = {
- 'referer': full_response.geturl()
+ 'referer': actual_url
}
entry_info_dict = {
if ext == 'smil':
entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
elif ext == 'xspf':
- return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+ return [self._extract_xspf_playlist(video_url, video_id)]
elif ext == 'm3u8':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
elif ext == 'mpd':
else:
entry_info_dict['url'] = video_url
- if entry_info_dict.get('formats'):
- self._sort_formats(entry_info_dict['formats'])
-
entries.append(entry_info_dict)
- if len(entries) == 1:
- return merge_dicts(entries[0], info_dict)
- else:
+ if len(entries) > 1:
for num, e in enumerate(entries, start=1):
# 'url' results don't have a title
if e.get('title') is not None:
e['title'] = '%s (%d)' % (e['title'], num)
- return {
- '_type': 'playlist',
- 'entries': entries,
- }
+ return entries