]> jfr.im git - yt-dlp.git/blobdiff - yt_dlp/extractor/pbs.py
[ie/mlbtv] Fix extraction (#10296)
[yt-dlp.git] / yt_dlp / extractor / pbs.py
index 5bdf561db9ef833e5651f459b63e29f2bee968ec..686796491d439398127ce0ba5cd0c38a06c408d6 100644 (file)
@@ -1,19 +1,19 @@
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
+    US_RATINGS,
     ExtractorError,
     determine_ext,
-    int_or_none,
     float_or_none,
+    int_or_none,
     js_to_json,
     orderedSet,
     strip_jsonp,
     strip_or_none,
+    traverse_obj,
     unified_strdate,
     url_or_none,
-    US_RATINGS,
 )
 
 
@@ -181,18 +181,18 @@ class PBSIE(InfoExtractor):
     )
 
     IE_NAME = 'pbs'
-    IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1])
+    IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS))[1]))
 
     _VALID_URL = r'''(?x)https?://
         (?:
            # Direct video URL
-           (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
+           (?:{})/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
            # Article with embedded player (or direct video)
-           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+           (?:www\.)?pbs\.org/(?:[^/]+/){{1,5}}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
            # Player
            (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)
         )
-    ''' % '|'.join(list(zip(*_STATIONS))[0])
+    '''.format('|'.join(next(zip(*_STATIONS))))
 
     _GEO_COUNTRIES = ['US']
 
@@ -414,7 +414,7 @@ class PBSIE(InfoExtractor):
         {
             'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
             'only_matching': True,
-        }
+        },
     ]
     _ERRORS = {
         101: 'We\'re sorry, but this video is not yet available.',
@@ -517,7 +517,7 @@ def _extract_webpage(self, url):
             if not video_id:
                 video_info = self._extract_video_data(
                     player_page, 'video data', display_id)
-                video_id = compat_str(
+                video_id = str(
                     video_info.get('id') or video_info['contentID'])
         else:
             video_id = mobj.group('id')
@@ -538,7 +538,7 @@ def _real_extract(self, url):
 
         if isinstance(video_id, list):
             entries = [self.url_result(
-                'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
+                f'http://video.pbs.org/video/{vid_id}', 'PBS', vid_id)
                 for vid_id in video_id]
             return self.playlist_result(entries, display_id)
 
@@ -567,11 +567,11 @@ def extract_redirect_urls(info):
         # Player pages may also serve different qualities
         for page in ('widget/partnerplayer', 'portalplayer'):
             player = self._download_webpage(
-                'http://player.pbs.org/%s/%s' % (page, video_id),
-                display_id, 'Downloading %s page' % page, fatal=False)
+                f'http://player.pbs.org/{page}/{video_id}',
+                display_id, f'Downloading {page} page', fatal=False)
             if player:
                 video_info = self._extract_video_data(
-                    player, '%s video data' % page, display_id, fatal=False)
+                    player, f'{page} video data', display_id, fatal=False)
                 if video_info:
                     extract_redirect_urls(video_info)
                     if not info:
@@ -602,7 +602,7 @@ def extract_redirect_urls(info):
             redirect_id = redirect.get('eeid')
 
             redirect_info = self._download_json(
-                '%s?format=json' % redirect['url'], display_id,
+                '{}?format=json'.format(redirect['url']), display_id,
                 'Downloading %s video url info' % (redirect_id or num),
                 headers=self.geo_verification_headers())
 
@@ -613,7 +613,7 @@ def extract_redirect_urls(info):
                     self.raise_geo_restricted(
                         msg=message, countries=self._GEO_COUNTRIES)
                 raise ExtractorError(
-                    '%s said: %s' % (self.IE_NAME, message), expected=True)
+                    f'{self.IE_NAME} said: {message}', expected=True)
 
             format_url = redirect_info.get('url')
             if not format_url:
@@ -648,7 +648,7 @@ def extract_redirect_urls(info):
                 f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url)
                 # This may produce invalid links sometimes (e.g.
                 # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
-                if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate):
+                if not self._is_valid_url(f_url, display_id, f'http-{bitrate}k video'):
                     continue
                 f = m3u8_format.copy()
                 f.update({
@@ -670,7 +670,7 @@ def extract_redirect_urls(info):
         captions = info.get('cc') or {}
         for caption_url in captions.values():
             subtitles.setdefault('en', []).append({
-                'url': caption_url
+                'url': caption_url,
             })
         subtitles = self._merge_subtitles(subtitles, hls_subs)
 
@@ -696,3 +696,61 @@ def extract_redirect_urls(info):
             'subtitles': subtitles,
             'chapters': chapters,
         }
+
+
+class PBSKidsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'https://pbskids.org/video/molly-of-denali/3030407927',
+            'md5': '1ded20a017cc6b53446238f1804ce4c7',
+            'info_dict': {
+                'id': '3030407927',
+                'title': 'Bird in the Hand/Bye-Bye Birdie',
+                'channel': 'molly-of-denali',
+                'duration': 1540,
+                'ext': 'mp4',
+                'series': 'Molly of Denali',
+                'description': 'md5:d006b2211633685d8ebc8d03b6d5611e',
+                'categories': ['Episode'],
+                'upload_date': '20190718',
+            },
+        },
+        {
+            'url': 'https://pbskids.org/video/plum-landing/2365205059',
+            'md5': '92e5d189851a64ae1d0237a965be71f5',
+            'info_dict': {
+                'id': '2365205059',
+                'title': 'Cooper\'s Favorite Place in Nature',
+                'channel': 'plum-landing',
+                'duration': 67,
+                'ext': 'mp4',
+                'series': 'Plum Landing',
+                'description': 'md5:657e5fc4356a84ead1c061eb280ff05d',
+                'categories': ['Episode'],
+                'upload_date': '20140302',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        meta = self._search_json(r'window\._PBS_KIDS_DEEPLINK\s*=', webpage, 'video info', video_id)
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            traverse_obj(meta, ('video_obj', 'URI', {url_or_none})), video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            **traverse_obj(meta, {
+                'categories': ('video_obj', 'video_type', {str}, {lambda x: [x] if x else None}),
+                'channel': ('show_slug', {str}),
+                'description': ('video_obj', 'description', {str}),
+                'duration': ('video_obj', 'duration', {int_or_none}),
+                'series': ('video_obj', 'program_title', {str}),
+                'title': ('video_obj', 'title', {str}),
+                'upload_date': ('video_obj', 'air_date', {unified_strdate}),
+            }),
+        }