yt_dlp/extractor/stitcher.py

   1 from .common import InfoExtractor
   2 from ..compat import compat_str
   3 from ..utils import (
   4     ExtractorError,
   5     clean_html,
   6     clean_podcast_url,
   7     int_or_none,
   8     str_or_none,
   9     try_get,
  10     url_or_none,
  11 )
  12
  13
  14 class StitcherBaseIE(InfoExtractor):
  15     _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'
  16
  17     def _call_api(self, path, video_id, query):
  18         resp = self._download_json(
  19             'https://api.prod.stitcher.com/' + path,
  20             video_id, query=query)
  21         error_massage = try_get(resp, lambda x: x['errors'][0]['message'])
  22         if error_massage:
  23             raise ExtractorError(error_massage, expected=True)
  24         return resp['data']
  25
  26     def _extract_description(self, data):
  27         return clean_html(data.get('html_description') or data.get('description'))
  28
  29     def _extract_audio_url(self, episode):
  30         return url_or_none(episode.get('audio_url') or episode.get('guid'))
  31
  32     def _extract_show_info(self, show):
  33         return {
  34             'thumbnail': show.get('image_base_url'),
  35             'series': show.get('title'),
  36         }
  37
  38     def _extract_episode(self, episode, audio_url, show_info):
  39         info = {
  40             'id': compat_str(episode['id']),
  41             'display_id': episode.get('slug'),
  42             'title': episode['title'].strip(),
  43             'description': self._extract_description(episode),
  44             'duration': int_or_none(episode.get('duration')),
  45             'url': clean_podcast_url(audio_url),
  46             'vcodec': 'none',
  47             'timestamp': int_or_none(episode.get('date_published')),
  48             'season_number': int_or_none(episode.get('season')),
  49             'season_id': str_or_none(episode.get('season_id')),
  50         }
  51         info.update(show_info)
  52         return info
  53
  54
  55 class StitcherIE(StitcherBaseIE):
  56     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
  57     _TESTS = [{
  58         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
  59         'md5': 'e9635098e0da10b21a0e2b85585530f6',
  60         'info_dict': {
  61             'id': '40789481',
  62             'ext': 'mp3',
  63             'title': 'Machine Learning Mastery and Cancer Clusters',
  64             'description': 'md5:547adb4081864be114ae3831b4c2b42f',
  65             'duration': 1604,
  66             'thumbnail': r're:^https?://.*\.jpg',
  67             'upload_date': '20151008',
  68             'timestamp': 1444285800,
  69             'series': 'Talking Machines',
  70         },
  71     }, {
  72         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
  73         'info_dict': {
  74             'id': '40846275',
  75             'display_id': 'the-rare-hourlong-comedy-plus',
  76             'ext': 'mp3',
  77             'title': "The CW's 'Crazy Ex-Girlfriend'",
  78             'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
  79             'duration': 2235,
  80             'thumbnail': r're:^https?://.*\.jpg',
  81         },
  82         'params': {
  83             'skip_download': True,
  84         },
  85         'skip': 'Page Not Found',
  86     }, {
  87         # escaped title
  88         'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
  89         'only_matching': True,
  90     }, {
  91         'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
  92         'only_matching': True,
  93     }, {
  94         'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
  95         'only_matching': True,
  96     }]
  97
  98     def _real_extract(self, url):
  99         audio_id = self._match_id(url)
 100         data = self._call_api(
 101             'shows/episodes', audio_id, {'episode_ids': audio_id})
 102         episode = data['episodes'][0]
 103         audio_url = self._extract_audio_url(episode)
 104         if not audio_url:
 105             self.raise_login_required()
 106         show = try_get(data, lambda x: x['shows'][0], dict) or {}
 107         return self._extract_episode(
 108             episode, audio_url, self._extract_show_info(show))
 109
 110
 111 class StitcherShowIE(StitcherBaseIE):
 112     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)'
 113     _TESTS = [{
 114         'url': 'http://www.stitcher.com/podcast/the-talking-machines',
 115         'info_dict': {
 116             'id': 'the-talking-machines',
 117             'title': 'Talking Machines',
 118             'description': 'md5:831f0995e40f26c10231af39cf1ebf0b',
 119         },
 120         'playlist_mincount': 106,
 121     }, {
 122         'url': 'https://www.stitcher.com/show/the-talking-machines',
 123         'only_matching': True,
 124     }]
 125
 126     def _real_extract(self, url):
 127         show_slug = self._match_id(url)
 128         data = self._call_api(
 129             'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000})
 130         show = try_get(data, lambda x: x['shows'][0], dict) or {}
 131         show_info = self._extract_show_info(show)
 132
 133         entries = []
 134         for episode in (data.get('episodes') or []):
 135             audio_url = self._extract_audio_url(episode)
 136             if not audio_url:
 137                 continue
 138             entries.append(self._extract_episode(episode, audio_url, show_info))
 139
 140         return self.playlist_result(
 141             entries, show_slug, show.get('title'),
 142             self._extract_description(show))