return self._og_search_property('url', html, **kargs)
def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs):
- return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
+ return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
name = variadic(name)
'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
'description': unescapeHTML(e.get('description')),
- 'thumbnails': [{'url': url_or_none(url)}
- for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))],
+ 'thumbnails': [{'url': url}
+ for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))
+ if url_or_none(url)],
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
# author can be an instance of 'Organization' or 'Person' types.
mime_type = representation_attrib['mimeType']
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
- codecs = parse_codecs(representation_attrib.get('codecs', ''))
+ codec_str = representation_attrib.get('codecs', '')
+ # Some kind of binary subtitle found in some youtube livestreams
+ if mime_type == 'application/x-rawcc':
+ codecs = {'scodec': codec_str}
+ else:
+ codecs = parse_codecs(codec_str)
if content_type not in ('video', 'audio', 'text'):
if mime_type == 'image/jpeg':
content_type = mime_type
- elif codecs['vcodec'] != 'none':
+ elif codecs.get('vcodec', 'none') != 'none':
content_type = 'video'
- elif codecs['acodec'] != 'none':
+ elif codecs.get('acodec', 'none') != 'none':
content_type = 'audio'
elif codecs.get('scodec', 'none') != 'none':
content_type = 'text'