ExtractorError,
UnsupportedError,
determine_ext,
+ determine_protocol,
dict_get,
extract_basic_auth,
+ filter_dict,
format_field,
int_or_none,
is_html,
unsmuggle_url,
update_url_query,
url_or_none,
+ urlhandle_detect_ext,
urljoin,
variadic,
xpath_attr,
'ext': 'mp4',
'title': 'trailer',
'upload_date': '20100513',
+ 'direct': True,
+ 'timestamp': 1273772943.0,
}
},
# Direct link to media delivered compressed (until Accept-Encoding is *)
'ext': 'webm',
'title': '5_Lennart_Poettering_-_Systemd',
'upload_date': '20141120',
+ 'direct': True,
+ 'timestamp': 1416498816.0,
},
'expected_warnings': [
'URL could be a direct video link, returning it as such.'
'upload_date': '20201204',
},
}],
+ 'skip': 'Dead link',
},
# RSS feed with item with description and thumbnails
{
'playlist': [{
'info_dict': {
'ext': 'm4a',
- 'id': 'c1c879525ce2cb640b344507e682c36d',
+ 'id': '818a5d38-01cd-152f-2231-ee479677fa82',
'title': 're:Hydrogen!',
'description': 're:.*In this episode we are going.*',
'timestamp': 1567977776,
'upload_date': '20190908',
- 'duration': 459,
+ 'duration': 423,
'thumbnail': r're:^https?://.*\.jpg$',
'episode_number': 1,
'season_number': 1,
'params': {
'skip_download': True,
},
+ 'skip': '404 Not Found',
},
# MPD from http://dash-mse-test.appspot.com/media.html
{
'title': 'car-20120827-manifest',
'formats': 'mincount:9',
'upload_date': '20130904',
+ 'timestamp': 1378272859.0,
},
},
# m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
'id': 'cmQHVoWB5FY',
'ext': 'mp4',
'upload_date': '20130224',
- 'uploader_id': 'TheVerge',
+ 'uploader_id': '@TheVerge',
'description': r're:^Chris Ziegler takes a look at the\.*',
'uploader': 'The Verge',
'title': 'First Firefox OS phones side-by-side',
},
'skip': 'There is a limit of 200 free downloads / month for the test song',
},
- # ooyala video
- {
- 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
- 'info_dict': {
- 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
- 'ext': 'mp4',
- 'title': '2cc213299525360.mov', # that's what we get
- 'duration': 238.231,
- },
- 'add_ie': ['Ooyala'],
- },
- {
- # ooyala video embedded with http://player.ooyala.com/iframe.js
- 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
- 'info_dict': {
- 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
- 'ext': 'mp4',
- 'title': '"Steve Jobs: Man in the Machine" trailer',
- 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
- 'duration': 135.427,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'movie expired',
- },
- # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
- {
- 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
- 'info_dict': {
- 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
- 'ext': 'mp4',
- 'title': 'Steampunk Fest Comes to Honesdale',
- 'duration': 43.276,
- },
- 'params': {
- 'skip_download': True,
- }
- },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
'title': 'Ужастики, русский трейлер (2015)',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 153,
- }
+ },
+ 'skip': 'Site dead',
},
# XHamster embed
{
'playlist_mincount': 1,
'add_ie': ['Youtube'],
},
- # Cinchcast embed
+ # Libsyn embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
'info_dict': {
- 'id': '7141703',
+ 'id': '3793998',
'ext': 'mp3',
'upload_date': '20141126',
- 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+ 'title': 'Underground Wellness Radio - Jack Tips: 5 Steps to Permanent Gut Healing',
+ 'thumbnail': 'https://assets.libsyn.com/secure/item/3793998/?height=90&width=90',
+ 'duration': 3989.0,
}
},
# Cinerama player
},
},
{
- # Video.js embed, multiple formats
+ # Youtube embed, formerly: Video.js embed, multiple formats
'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
'info_dict': {
'id': 'yygqldloqIk',
'params': {
'skip_download': True,
},
+ 'skip': '404 Not Found',
},
# rtl.nl embed
{
'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
},
},
- {
- # vzaar embed
- 'url': 'http://help.vzaar.com/article/165-embedding-video',
- 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
- 'info_dict': {
- 'id': '8707641',
- 'ext': 'mp4',
- 'title': 'Building A Business Online: Principal Chairs Q & A',
- },
- },
{
# multiple HTML5 videos on one page
'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
'age_limit': 18,
},
},
+ {
+ 'note': 'Live HLS direct link',
+ 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8',
+ 'info_dict': {
+ 'id': 'index',
+ 'title': r're:index',
+ 'ext': 'mp4',
+ 'live_status': 'is_live',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
+ {
+ 'note': 'Video.js VOD HLS',
+ 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
+ 'info_dict': {
+ 'id': 'videojs_hls_test',
+ 'title': 'video',
+ 'ext': 'mp4',
+ 'age_limit': 0,
+ 'duration': 1800,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
]
def report_following_redirect(self, new_url):
def _extra_manifest_info(self, info, manifest_url):
fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
if fragment_query is not None:
- fragment_query = self._configuration_arg('fragment_query', casesense=True)[0]
info['extra_param_to_segment_url'] = (
urllib.parse.urlparse(fragment_query).query or fragment_query
or urllib.parse.urlparse(manifest_url).query or None)
for fmt in self._downloader._get_formats(info):
fmt['url'] = update_url_query(fmt['url'], query)
+ # Attempt to detect live HLS or set VOD duration
+ m3u8_format = next((f for f in self._downloader._get_formats(info)
+ if determine_protocol(f) == 'm3u8_native'), None)
+ if m3u8_format:
+ is_live = self._configuration_arg('is_live', [None])[0]
+ if is_live is not None:
+ info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
+ return
+ headers = m3u8_format.get('http_headers') or info.get('http_headers')
+ duration = self._extract_m3u8_vod_duration(
+ m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
+ errnote='Failed to download m3u8 media playlist', headers=headers)
+ if not duration:
+ info['live_status'] = 'is_live'
+ info['duration'] = info.get('duration') or duration
+
def _extract_rss(self, url, video_id, doc):
NS_MAP = {
'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
'id': flashvars['video_id'],
'display_id': display_id,
'title': title,
- 'thumbnail': thumbnail,
+ 'thumbnail': urljoin(url, thumbnail),
'formats': formats,
}
# to accept raw bytes and being able to download only a chunk.
# It would probably be better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
- full_response = self._request_webpage(url, video_id, headers={
+ full_response = self._request_webpage(url, video_id, headers=filter_dict({
'Accept-Encoding': 'identity',
- **smuggled_data.get('http_headers', {})
- })
- new_url = full_response.geturl()
+ 'Referer': smuggled_data.get('referer'),
+ }))
+ new_url = full_response.url
url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
if new_url != extract_basic_auth(url)[0]:
self.report_following_redirect(new_url)
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
self.report_detected('direct video link')
- headers = smuggled_data.get('http_headers', {})
+ headers = filter_dict({'Referer': smuggled_data.get('referer')})
format_id = str(m.group('format_id'))
- ext = determine_ext(url)
+ ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
subtitles = {}
if format_id.endswith('mpegurl') or ext == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
formats = [{
'format_id': format_id,
'url': url,
+ 'ext': ext,
'vcodec': 'none' if m.group('type') == 'audio' else None
}]
info_dict['direct'] = True
return self.playlist_result(
self._parse_xspf(
doc, video_id, xspf_url=url,
- xspf_base_url=full_response.geturl()),
+ xspf_base_url=full_response.url),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
- mpd_base_url=full_response.geturl().rpartition('/')[0],
+ mpd_base_url=full_response.url.rpartition('/')[0],
mpd_url=url)
self._extra_manifest_info(info_dict, url)
self.report_detected('DASH manifest')
self._downloader.write_debug('Looking for embeds')
embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
if len(embeds) == 1:
- return {**info_dict, **embeds[0]}
+ return merge_dicts(embeds[0], info_dict)
elif embeds:
return self.playlist_result(embeds, **info_dict)
raise UnsupportedError(url)
info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation
video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
url, smuggled_data = unsmuggle_url(url, {})
- actual_url = urlh.geturl() if urlh else url
+ actual_url = urlh.url if urlh else url
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
varname = mobj.group(1)
sources = variadic(self._parse_json(
mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
- formats = []
- subtitles = {}
+ formats, subtitles, src = [], {}, None
for source in sources:
src = source.get('src')
if not src or not isinstance(src, str):
m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
- for fmt in formats:
- self._extra_manifest_info(fmt, src)
if not formats:
formats.append({
for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
sub = self._parse_json(
sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
- src = str_or_none(sub.get('src'))
- if not src:
+ sub_src = str_or_none(sub.get('src'))
+ if not sub_src:
continue
subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
- 'url': urllib.parse.urljoin(url, src),
+ 'url': urllib.parse.urljoin(url, sub_src),
'name': sub.get('label'),
'http_headers': {
'Referer': actual_url,
})
if formats or subtitles:
self.report_detected('video.js embed')
- return [{'formats': formats, 'subtitles': subtitles}]
+ info_dict = {'formats': formats, 'subtitles': subtitles}
+ if formats:
+ self._extra_manifest_info(info_dict, src)
+ return [info_dict]
# Look for generic KVS player (before JSON-LD, because some URLs break otherwise)
found = self._search_regex((
'url': smuggle_url(json_ld['url'], {
'force_videoid': video_id,
'to_generic': True,
- 'http_headers': {'Referer': url},
+ 'referer': url,
}),
}, json_ld)]