import base64
import collections
-import datetime
import hashlib
import itertools
import json
determine_ext,
determine_protocol,
dict_get,
+ encode_data_uri,
error_to_compat_str,
extract_attributes,
ExtractorError,
* filesize_approx An estimate for the number of bytes
* player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
- download, lower-case.
- "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe",
- "m3u8", "m3u8_native" or "http_dash_segments".
+ download, lower-case. One of "http", "https" or
+ one of the protocols defined in downloader.PROTOCOL_MAP
* fragment_base_url
Base URL for fragments. Each fragment's path
value (if present) will be relative to
fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
+ * is_from_start Is a live format that can be downloaded
+ from the start. Boolean
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
- release_timestamp: UNIX timestamp of the moment the video was released.
- release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date (YYYYMMDD).
- If not explicitly set, calculated from timestamp.
+ If not explicitly set, calculated from timestamp
+ release_timestamp: UNIX timestamp of the moment the video was released.
+ If it is not clear whether to use timestamp or this, use the former
+ release_date: The date (YYYYMMDD) when the video was released.
+ If not explicitly set, calculated from release_timestamp
+ modified_timestamp: UNIX timestamp of the moment the video was last modified.
+ modified_date: The date (YYYYMMDD) when the video was last modified.
+ If not explicitly set, calculated from modified_timestamp
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
channel: Full name of the channel the video is uploaded on.
disc_number: Number of the disc or other physical medium the track belongs to,
as an integer.
release_year: Year (YYYY) when the album was released.
+ composer: Composer of the piece
Unless mentioned otherwise, the fields should be Unicode strings.
Additionally, playlists can have "id", "title", and any other relevent
attributes with the same semantics as videos (see above).
+ It can also have the following optional fields:
+
+ playlist_count: The total number of videos in a playlist. If not given,
+ YoutubeDL tries to calculate it from "entries"
+
_type "multi_video" indicates that there are multiple videos that
form a single show, for examples multiple acts of an opera or TV episode.
kwargs = {
'video_id': e.video_id or self.get_temp_id(url),
'ie': self.IE_NAME,
- 'tb': e.traceback,
+ 'tb': e.traceback or sys.exc_info()[2],
'expected': e.expected,
'cause': e.cause
}
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None, video_title=None, **kwargs):
+ def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs):
"""Returns a URL that points to a page that should be processed"""
- # TODO: ie should be the class used for getting the info
- video_info = {'_type': 'url',
- 'url': url,
- 'ie_key': ie}
- video_info.update(kwargs)
+ if ie is not None:
+ kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key()
if video_id is not None:
- video_info['id'] = video_id
+ kwargs['id'] = video_id
if video_title is not None:
- video_info['title'] = video_title
- return video_info
+ kwargs['title'] = video_title
+ return {
+ **kwargs,
+ '_type': 'url_transparent' if url_transparent else 'url',
+ 'url': url,
+ }
- def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
- urls = orderedSet(
- self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
- for m in matches)
- return self.playlist_result(
- urls, playlist_id=playlist_id, playlist_title=playlist_title)
+ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, **kwargs):
+ urls = (self.url_result(self._proto_relative_url(m), ie)
+ for m in orderedSet(map(getter, matches) if getter else matches))
+ return self.playlist_result(urls, playlist_id, playlist_title, **kwargs)
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs):
"""Returns a playlist"""
- video_info = {'_type': 'playlist',
- 'entries': entries}
- video_info.update(kwargs)
if playlist_id:
- video_info['id'] = playlist_id
+ kwargs['id'] = playlist_id
if playlist_title:
- video_info['title'] = playlist_title
+ kwargs['title'] = playlist_title
if playlist_description is not None:
- video_info['description'] = playlist_description
- return video_info
+ kwargs['description'] = playlist_description
+ return {
+ **kwargs,
+ '_type': 'multi_video' if multi_video else 'playlist',
+ 'entries': entries,
+ }
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
continue
info[count_key] = interaction_count
+ def extract_chapter_information(e):
+ chapters = [{
+ 'title': part.get('name'),
+ 'start_time': part.get('startOffset'),
+ 'end_time': part.get('endOffset'),
+ } for part in e.get('hasPart', []) if part.get('@type') == 'Clip']
+ for idx, (last_c, current_c, next_c) in enumerate(zip(
+ [{'end_time': 0}] + chapters, chapters, chapters[1:])):
+ current_c['end_time'] = current_c['end_time'] or next_c['start_time']
+ current_c['start_time'] = current_c['start_time'] or last_c['end_time']
+ if None in current_c.values():
+ self.report_warning(f'Chapter {idx} contains broken data. Not extracting chapters')
+ return
+ if chapters:
+ chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration']
+ info['chapters'] = chapters
+
def extract_video_object(e):
assert e['@type'] == 'VideoObject'
author = e.get('author')
'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
- 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
+ 'thumbnails': [{'url': url_or_none(url)}
+ for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types.
'view_count': int_or_none(e.get('interactionCount')),
})
extract_interaction_statistic(e)
+ extract_chapter_information(e)
- for e in json_ld:
- if '@context' in e:
+ def traverse_json_ld(json_ld, at_top_level=True):
+ for e in json_ld:
+ if at_top_level and '@context' not in e:
+ continue
+ if at_top_level and set(e.keys()) == {'@context', '@graph'}:
+ traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False)
+ break
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
continue
info.update({
'timestamp': parse_iso8601(e.get('datePublished')),
'title': unescapeHTML(e.get('headline')),
- 'description': unescapeHTML(e.get('articleBody')),
+ 'description': unescapeHTML(e.get('articleBody') or e.get('description')),
})
elif item_type == 'VideoObject':
extract_video_object(e)
continue
else:
break
+ traverse_json_ld(json_ld)
+
return dict((k, v) for k, v in info.items() if v is not None)
- def _search_nextjs_data(self, webpage, video_id, **kw):
+ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw):
return self._parse_json(
self._search_regex(
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
- webpage, 'next.js data', **kw),
- video_id, **kw)
+ webpage, 'next.js data', fatal=fatal, **kw),
+ video_id, transform_source=transform_source, fatal=fatal)
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'):
''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. '''
'vcodec': {'type': 'ordered', 'regex': True,
'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
'acodec': {'type': 'ordered', 'regex': True,
- 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
+ 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
headers=headers, query=query, video_id=video_id)
def _parse_m3u8_formats_and_subtitles(
- self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native',
+ self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native',
preference=None, quality=None, m3u8_id=None, live=False, note=None,
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
formats = [{
'format_id': join_nonempty(m3u8_id, idx),
'format_index': idx,
- 'url': m3u8_url,
+ 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'),
'ext': ext,
'protocol': entry_protocol,
'preference': preference,
if smil is False:
assert not fatal
- return []
+ return [], {}
namespace = self._parse_smil_namespace(smil)
mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
- codecs = representation_attrib.get('codecs', '')
+ codecs = parse_codecs(representation_attrib.get('codecs', ''))
if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg':
content_type = mime_type
- elif codecs.split('.')[0] == 'stpp':
+ elif codecs['vcodec'] != 'none':
+ content_type = 'video'
+ elif codecs['acodec'] != 'none':
+ content_type = 'audio'
+ elif codecs.get('tcodec', 'none') != 'none':
content_type = 'text'
elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
content_type = 'text'
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
+ **codecs
}
- f.update(parse_codecs(codecs))
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
return formats
def _live_title(self, name):
- """ Generate the title for a live video """
- now = datetime.datetime.now()
- now_str = now.strftime('%Y-%m-%d %H:%M')
- return name + ' ' + now_str
+ self._downloader.deprecation_warning('yt_dlp.InfoExtractor._live_title is deprecated and does not work as expected')
+ return name
def _int(self, v, name, fatal=False, **kwargs):
res = int_or_none(v, **kwargs)
else 'public' if all_known
else None)
- def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False):
+ def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False):
'''
@returns A list of values for the extractor argument given by "key"
or "default" if no such key is present
@param casesense When false, the values are converted to lower case
'''
val = traverse_obj(
- self._downloader.params, ('extractor_args', self.ie_key().lower(), key))
+ self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key))
if val is None:
return [] if default is NO_DEFAULT else default
return list(val) if casesense else [x.lower() for x in val]