jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/theplatform.py
[ie/generic] Add `key_query` extractor-arg
[yt-dlp.git] / yt_dlp / extractor / theplatform.py
index c56b708b8208b5a08c0c87755c9a8ad3741c1f36..7c1769c2dfd0def159c69ce79e4da8e3f1bc6a7f 100644 (file)
@@ -1,30 +1,25 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
+import hashlib
+import hmac
 import re
 import time
-import hmac
-import binascii
-import hashlib
-
 
-from .once import OnceIE
 from .adobepass import AdobePassIE
-from ..compat import (
-    compat_parse_qs,
-    compat_urllib_parse_urlparse,
-)
+from .once import OnceIE
+from ..networking import HEADRequest, Request
 from ..utils import (
-    determine_ext,
     ExtractorError,
+    determine_ext,
+    find_xpath_attr,
     float_or_none,
     int_or_none,
-    sanitized_Request,
+    mimetype2ext,
+    parse_qs,
+    traverse_obj,
     unsmuggle_url,
+    update_url,
     update_url_query,
+    urlhandle_detect_ext,
     xpath_with_ns,
-    mimetype2ext,
-    find_xpath_attr,
 )
 
 default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -46,12 +41,11 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d
                 if exception.get('value') == 'GeoLocationBlocked':
                     self.raise_geo_restricted(error_element.attrib['abstract'])
                 elif error_element.attrib['src'].startswith(
-                        'http://link.theplatform.%s/s/errorFiles/Unavailable.'
-                        % self._TP_TLD):
+                        f'http://link.theplatform.{self._TP_TLD}/s/errorFiles/Unavailable.'):
                     raise ExtractorError(
                         error_element.attrib['abstract'], expected=True)
 
-        smil_formats = self._parse_smil_formats(
+        smil_formats, subtitles = self._parse_smil_formats_and_subtitles(
             meta, smil_url, video_id, namespace=default_ns,
             # the parameters are from syfy.com, other sites may use others,
             # they also work for nbc.com
@@ -71,12 +65,10 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d
 
                 formats.append(_format)
 
-        subtitles = self._parse_smil_subtitles(meta, default_ns)
-
         return formats, subtitles
 
     def _download_theplatform_metadata(self, path, video_id):
-        info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path)
+        info_url = f'http://link.theplatform.{self._TP_TLD}/s/{path}?format=preview'
         return self._download_json(info_url, video_id)
 
     def _parse_theplatform_metadata(self, info):
@@ -108,6 +100,10 @@ def _add_chapter(start_time, end_time):
                 _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
             _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
 
+        def extract_site_specific_field(field):
+            # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
+            return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)
+
         return {
             'title': info['title'],
             'subtitles': subtitles,
@@ -117,6 +113,14 @@ def _add_chapter(start_time, end_time):
             'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
             'uploader': info.get('billingCode'),
             'chapters': chapters,
+            'creator': traverse_obj(info, ('author', {str})) or None,
+            'categories': traverse_obj(info, (
+                'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
+            'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
+            'location': extract_site_specific_field('region'),
+            'series': extract_site_specific_field('show'),
+            'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
+            'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
         }
 
     def _extract_theplatform_metadata(self, path, video_id):
@@ -129,6 +133,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
            (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
          |theplatform:)(?P<id>[^/\?&]+)'''
+    _EMBED_REGEX = [
+        r'''(?x)
+            <meta\s+
+                property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
+        r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1',
+    ]
 
     _TESTS = [{
         # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
@@ -163,7 +174,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
         'params': {
             # rtmp download
             'skip_download': True,
-        }
+        },
+        'skip': 'CNet no longer uses ThePlatform',
     }, {
         'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
         'info_dict': {
@@ -172,7 +184,8 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
             'description': 'md5:644ad9188d655b742f942bf2e06b002d',
             'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
             'uploader': 'EGSM',
-        }
+        },
+        'skip': 'Dead link',
     }, {
         'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
         'only_matching': True,
@@ -190,6 +203,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
             'upload_date': '20150701',
             'uploader': 'NBCU-NEWS',
         },
+        'skip': 'Error: Player PID "nbcNewsOffsite" is disabled',
     }, {
         # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
         # geo-restricted (US), HLS encrypted with AES-128
@@ -198,39 +212,25 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
     }]
 
     @classmethod
-    def _extract_urls(cls, webpage):
-        m = re.search(
-            r'''(?x)
-                    <meta\s+
-                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
-                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
-            ''', webpage)
-        if m:
-            return [m.group('url')]
-
+    def _extract_embed_urls(cls, url, webpage):
         # Are whitespaces ignored in URLs?
         # https://github.com/ytdl-org/youtube-dl/issues/12044
-        matches = re.findall(
-            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
-        if matches:
-            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
+        for embed_url in super()._extract_embed_urls(url, webpage):
+            yield re.sub(r'\s', '', embed_url)
 
     @staticmethod
     def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
         flags = '10' if include_qs else '00'
         expiration_date = '%x' % (int(time.time()) + life)
 
-        def str_to_hex(str):
-            return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
-
-        def hex_to_bytes(hex):
-            return binascii.a2b_hex(hex.encode('ascii'))
+        def str_to_hex(str_data):
+            return str_data.encode('ascii').hex()
 
         relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1)
-        clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
+        clear_text = bytes.fromhex(flags + expiration_date + str_to_hex(relative_path))
         checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
         sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
-        return '%s&sig=%s' % (url, sig)
+        return f'{url}&sig={sig}'
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -250,7 +250,7 @@ def _real_extract(self, url):
             path += mobj.group('media')
         path += video_id
 
-        qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        qs_dict = parse_qs(url)
         if 'guid' in qs_dict:
             webpage = self._download_webpage(url, video_id)
             scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
@@ -269,7 +269,7 @@ def _real_extract(self, url):
                     break
             if feed_id is None:
                 raise ExtractorError('Unable to find feed id')
-            return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % (
+            return self.url_result('http://feed.theplatform.com/f/{}/{}?byGuid={}'.format(
                 provider_id, feed_id, qs_dict['guid'][0]))
 
         if smuggled_data.get('force_smil_url', False):
@@ -280,7 +280,7 @@ def _real_extract(self, url):
             source_url = smuggled_data.get('source_url')
             if source_url:
                 headers['Referer'] = source_url
-            request = sanitized_Request(url, headers=headers)
+            request = Request(url, headers=headers)
             webpage = self._download_webpage(request, video_id)
             smil_url = self._search_regex(
                 r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
@@ -293,20 +293,27 @@ def _real_extract(self, url):
             config_url = config_url.replace('swf/', 'config/')
             config_url = config_url.replace('onsite/', 'onsite/config/')
             config = self._download_json(config_url, video_id, 'Downloading config')
-            if 'releaseUrl' in config:
-                release_url = config['releaseUrl']
-            else:
-                release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+            release_url = config.get('releaseUrl') or f'http://link.theplatform.com/s/{path}?mbr=true'
             smil_url = release_url + '&formats=MPEG4&manifest=f4m'
         else:
-            smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+            smil_url = f'http://link.theplatform.com/s/{path}?mbr=true'
 
         sig = smuggled_data.get('sig')
         if sig:
             smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
 
         formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
-        self._sort_formats(formats)
+
+        # With some sites, manifest URL must be forced to extract HLS formats
+        if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')):
+            m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None)
+            urlh = self._request_webpage(
+                HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False)
+            if urlh and urlhandle_detect_ext(urlh) == 'm3u8':
+                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
+                    m3u8_url, video_id, m3u8_id='hls', fatal=False)
+                formats.extend(m3u8_fmts)
+                self._merge_subtitles(m3u8_subs, target=subtitles)
 
         ret = self._extract_theplatform_metadata(path, video_id)
         combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
@@ -359,7 +366,7 @@ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custo
             if first_video_id is None:
                 first_video_id = cur_video_id
                 duration = float_or_none(item.get('plfile$duration'))
-            file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes']
+            file_asset_types = item.get('plfile$assetTypes') or parse_qs(smil_url)['assetTypes']
             for asset_type in file_asset_types:
                 if asset_type in asset_types:
                     continue
@@ -372,12 +379,10 @@ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custo
                 if asset_type in asset_types_query:
                     query.update(asset_types_query[asset_type])
                 cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
-                    main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+                    main_smil_url or smil_url, query), video_id, f'Downloading SMIL data for {asset_type}')
                 formats.extend(cur_formats)
                 subtitles = self._merge_subtitles(subtitles, cur_subtitles)
 
-        self._sort_formats(formats)
-
         thumbnails = [{
             'url': thumbnail['plfile$url'],
             'width': int_or_none(thumbnail.get('plfile$width')),
@@ -387,7 +392,7 @@ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custo
         timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
         categories = [item['media$name'] for item in entry.get('media$categories', [])]
 
-        ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+        ret = self._extract_theplatform_metadata(f'{provider_id}/{first_video_id}', video_id)
         subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
         ret.update({
             'id': video_id,