-import re
-import time
-import hmac
import binascii
import hashlib
+import hmac
+import re
+import time
-
-from .once import OnceIE
from .adobepass import AdobePassIE
+from .once import OnceIE
+from ..networking import HEADRequest, Request
from ..utils import (
- determine_ext,
ExtractorError,
+ determine_ext,
+ find_xpath_attr,
float_or_none,
int_or_none,
+ mimetype2ext,
parse_qs,
- sanitized_Request,
+ traverse_obj,
unsmuggle_url,
+ update_url,
update_url_query,
+ urlhandle_detect_ext,
xpath_with_ns,
- mimetype2ext,
- find_xpath_attr,
)
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
raise ExtractorError(
error_element.attrib['abstract'], expected=True)
- smil_formats = self._parse_smil_formats(
+ smil_formats, subtitles = self._parse_smil_formats_and_subtitles(
meta, smil_url, video_id, namespace=default_ns,
            # these parameters are from syfy.com; other sites may use different
            # ones, but they also work for nbc.com
formats.append(_format)
- subtitles = self._parse_smil_subtitles(meta, default_ns)
-
return formats, subtitles
def _download_theplatform_metadata(self, path, video_id):
_add_chapter(chapter.get('startTime'), chapter.get('endTime'))
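+        # the final chapter may lack an endTime; fall back to the total duration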
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+ def extract_site_specific_field(field):
+ # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
+ return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
+
return {
'title': info['title'],
'subtitles': subtitles,
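+            # 'pubDate' is in milliseconds; int_or_none(..., 1000) scales it down to seconds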
'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'),
'chapters': chapters,
+ 'creator': traverse_obj(info, ('author', {str})) or None,
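+            # keep only category entries labelled 'category' (or carrying no label), taking their names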
+ 'categories': traverse_obj(info, (
+ 'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
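+            # 'keywords' is a single string of comma- or semicolon-separated tags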
+ 'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
+ 'location': extract_site_specific_field('region'),
+ 'series': extract_site_specific_field('show'),
+ 'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
+ 'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
}
def _extract_theplatform_metadata(self, path, video_id):
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
(?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
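+    # matches og:video/twitter:player <meta> tags as well as <iframe>/<script> embeds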
+ _EMBED_REGEX = [
+ r'''(?x)
+ <meta\s+
+ property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+ content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
+        r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1',
+ ]
_TESTS = [{
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
'params': {
# rtmp download
'skip_download': True,
- }
+ },
+ 'skip': 'CNet no longer uses ThePlatform',
}, {
'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
'info_dict': {
'description': 'md5:644ad9188d655b742f942bf2e06b002d',
'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
'uploader': 'EGSM',
- }
+ },
+ 'skip': 'Dead link',
}, {
'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
'only_matching': True,
'upload_date': '20150701',
'uploader': 'NBCU-NEWS',
},
+ 'skip': 'Error: Player PID "nbcNewsOffsite" is disabled',
}, {
# From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
# geo-restricted (US), HLS encrypted with AES-128
}]
@classmethod
- def _extract_urls(cls, webpage):
- m = re.search(
- r'''(?x)
- <meta\s+
- property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
- content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
- ''', webpage)
- if m:
- return [m.group('url')]
-
+ def _extract_embed_urls(cls, url, webpage):
        # Whitespace sometimes appears inside embed URLs and must be stripped out
        # https://github.com/ytdl-org/youtube-dl/issues/12044
- matches = re.findall(
- r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
- if matches:
- return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
+ for embed_url in super()._extract_embed_urls(url, webpage):
+ yield re.sub(r'\s', '', embed_url)
@staticmethod
def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
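+        # appends a short-lived 'sig' query parameter (an HMAC-SHA1 digest over the
+        # link path) so the signed URL remains valid for `life` seconds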
source_url = smuggled_data.get('source_url')
if source_url:
headers['Referer'] = source_url
- request = sanitized_Request(url, headers=headers)
+ request = Request(url, headers=headers)
webpage = self._download_webpage(request, video_id)
smil_url = self._search_regex(
r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
- self._sort_formats(formats)
+
+        # With some sites, the manifest URL must be forced to extract HLS formats
+ if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')):
+ m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None)
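+            # probe with a HEAD request first so non-HLS responses are never downloaded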
+ urlh = self._request_webpage(
+ HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False)
+ if urlh and urlhandle_detect_ext(urlh) == 'm3u8':
+ m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+ m3u8_url, video_id, m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_fmts)
+ self._merge_subtitles(m3u8_subs, target=subtitles)
ret = self._extract_theplatform_metadata(path, video_id)
combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
formats.extend(cur_formats)
subtitles = self._merge_subtitles(subtitles, cur_subtitles)
- self._sort_formats(formats)
-
thumbnails = [{
'url': thumbnail['plfile$url'],
'width': int_or_none(thumbnail.get('plfile$width')),