[ie/mlbtv] Fix extraction (#10296)

[yt-dlp.git] / yt_dlp / extractor / pbs.py
diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py

index 5bdf561db9ef833e5651f459b63e29f2bee968ec..686796491d439398127ce0ba5cd0c38a06c408d6 100644 (file)
--- a/yt_dlp/extractor/pbs.py
+++ b/yt_dlp/extractor/pbs.py
@@ -1,19 +1,19 @@
  import re
  
  from .common import InfoExtractor
-from ..compat import compat_str
  from ..utils import (
+    US_RATINGS,
      ExtractorError,
      determine_ext,
-    int_or_none,
      float_or_none,
+    int_or_none,
      js_to_json,
      orderedSet,
      strip_jsonp,
      strip_or_none,
+    traverse_obj,
      unified_strdate,
      url_or_none,
-    US_RATINGS,
  )
  
  
@@ -181,18 +181,18 @@ class PBSIE(InfoExtractor):
      )
  
      IE_NAME = 'pbs'
-    IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1])
+    IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS))[1]))
  
      _VALID_URL = r'''(?x)https?://
          (?:
             # Direct video URL
-           (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
+           (?:{})/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
             # Article with embedded player (or direct video)
-           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+           (?:www\.)?pbs\.org/(?:[^/]+/){{1,5}}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
             # Player
             (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)
          )
-    ''' % '|'.join(list(zip(*_STATIONS))[0])
+    '''.format('|'.join(next(zip(*_STATIONS))))
  
      _GEO_COUNTRIES = ['US']
  
@@ -414,7 +414,7 @@ class PBSIE(InfoExtractor):
          {
              'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
              'only_matching': True,
-        }
+        },
      ]
      _ERRORS = {
          101: 'We\'re sorry, but this video is not yet available.',
@@ -517,7 +517,7 @@ def _extract_webpage(self, url):
              if not video_id:
                  video_info = self._extract_video_data(
                      player_page, 'video data', display_id)
-                video_id = compat_str(
+                video_id = str(
                      video_info.get('id') or video_info['contentID'])
          else:
              video_id = mobj.group('id')
@@ -538,7 +538,7 @@ def _real_extract(self, url):
  
          if isinstance(video_id, list):
              entries = [self.url_result(
-                'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
+                f'http://video.pbs.org/video/{vid_id}', 'PBS', vid_id)
                  for vid_id in video_id]
              return self.playlist_result(entries, display_id)
  
@@ -567,11 +567,11 @@ def extract_redirect_urls(info):
          # Player pages may also serve different qualities
          for page in ('widget/partnerplayer', 'portalplayer'):
              player = self._download_webpage(
-                'http://player.pbs.org/%s/%s' % (page, video_id),
-                display_id, 'Downloading %s page' % page, fatal=False)
+                f'http://player.pbs.org/{page}/{video_id}',
+                display_id, f'Downloading {page} page', fatal=False)
              if player:
                  video_info = self._extract_video_data(
-                    player, '%s video data' % page, display_id, fatal=False)
+                    player, f'{page} video data', display_id, fatal=False)
                  if video_info:
                      extract_redirect_urls(video_info)
                      if not info:
@@ -602,7 +602,7 @@ def extract_redirect_urls(info):
              redirect_id = redirect.get('eeid')
  
              redirect_info = self._download_json(
-                '%s?format=json' % redirect['url'], display_id,
+                '{}?format=json'.format(redirect['url']), display_id,
                  'Downloading %s video url info' % (redirect_id or num),
                  headers=self.geo_verification_headers())
  
@@ -613,7 +613,7 @@ def extract_redirect_urls(info):
                      self.raise_geo_restricted(
                          msg=message, countries=self._GEO_COUNTRIES)
                  raise ExtractorError(
-                    '%s said: %s' % (self.IE_NAME, message), expected=True)
+                    f'{self.IE_NAME} said: {message}', expected=True)
  
              format_url = redirect_info.get('url')
              if not format_url:
@@ -648,7 +648,7 @@ def extract_redirect_urls(info):
                  f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url)
                  # This may produce invalid links sometimes (e.g.
                  # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
-                if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate):
+                if not self._is_valid_url(f_url, display_id, f'http-{bitrate}k video'):
                      continue
                  f = m3u8_format.copy()
                  f.update({
@@ -670,7 +670,7 @@ def extract_redirect_urls(info):
          captions = info.get('cc') or {}
          for caption_url in captions.values():
              subtitles.setdefault('en', []).append({
-                'url': caption_url
+                'url': caption_url,
              })
          subtitles = self._merge_subtitles(subtitles, hls_subs)
  
@@ -696,3 +696,61 @@ def extract_redirect_urls(info):
              'subtitles': subtitles,
              'chapters': chapters,
          }
+
+
+class PBSKidsIE(InfoExtractor):
+    """Extractor for single video pages under pbskids.org/video/<show>/<id>."""
+
+    _VALID_URL = r'https?://(?:www\.)?pbskids\.org/video/[\w-]+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://pbskids.org/video/molly-of-denali/3030407927',
+        'md5': '1ded20a017cc6b53446238f1804ce4c7',
+        'info_dict': {
+            'id': '3030407927',
+            'title': 'Bird in the Hand/Bye-Bye Birdie',
+            'channel': 'molly-of-denali',
+            'duration': 1540,
+            'ext': 'mp4',
+            'series': 'Molly of Denali',
+            'description': 'md5:d006b2211633685d8ebc8d03b6d5611e',
+            'categories': ['Episode'],
+            'upload_date': '20190718',
+        },
+    }, {
+        'url': 'https://pbskids.org/video/plum-landing/2365205059',
+        'md5': '92e5d189851a64ae1d0237a965be71f5',
+        'info_dict': {
+            'id': '2365205059',
+            'title': 'Cooper\'s Favorite Place in Nature',
+            'channel': 'plum-landing',
+            'duration': 67,
+            'ext': 'mp4',
+            'series': 'Plum Landing',
+            'description': 'md5:657e5fc4356a84ead1c061eb280ff05d',
+            'categories': ['Episode'],
+            'upload_date': '20140302',
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Build the info dict from the `window._PBS_KIDS_DEEPLINK` JSON embedded in the page."""
+        video_id = self._match_id(url)
+        page = self._download_webpage(url, video_id)
+        deeplink = self._search_json(
+            r'window\._PBS_KIDS_DEEPLINK\s*=', page, 'video info', video_id)
+        # The m3u8 manifest URL is read from video_obj.URI and validated with url_or_none
+        m3u8_url = traverse_obj(deeplink, ('video_obj', 'URI', {url_or_none}))
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, ext='mp4')
+        metadata = traverse_obj(deeplink, {
+            # video_type is a single string; wrap it in a one-element list (None when empty)
+            'categories': ('video_obj', 'video_type', {str}, {lambda kind: [kind] if kind else None}),
+            'channel': ('show_slug', {str}),
+            'description': ('video_obj', 'description', {str}),
+            'duration': ('video_obj', 'duration', {int_or_none}),
+            'series': ('video_obj', 'program_title', {str}),
+            'title': ('video_obj', 'title', {str}),
+            'upload_date': ('video_obj', 'air_date', {unified_strdate}),
+        })
+        return {'id': video_id, 'formats': formats, 'subtitles': subtitles, **metadata}