yt_dlp/extractor/stitcher.py

   1 from __future__ import unicode_literals
   2
   3 from .common import InfoExtractor
   4 from ..compat import compat_str
   5 from ..utils import (
   6     clean_html,
   7     clean_podcast_url,
   8     ExtractorError,
   9     int_or_none,
  10     str_or_none,
  11     try_get,
  12     url_or_none,
  13 )
  14
  15
  16 class StitcherBaseIE(InfoExtractor):
  17     _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'
  18
  19     def _call_api(self, path, video_id, query):
  20         resp = self._download_json(
  21             'https://api.prod.stitcher.com/' + path,
  22             video_id, query=query)
  23         error_massage = try_get(resp, lambda x: x['errors'][0]['message'])
  24         if error_massage:
  25             raise ExtractorError(error_massage, expected=True)
  26         return resp['data']
  27
  28     def _extract_description(self, data):
  29         return clean_html(data.get('html_description') or data.get('description'))
  30
  31     def _extract_audio_url(self, episode):
  32         return url_or_none(episode.get('audio_url') or episode.get('guid'))
  33
  34     def _extract_show_info(self, show):
  35         return {
  36             'thumbnail': show.get('image_base_url'),
  37             'series': show.get('title'),
  38         }
  39
  40     def _extract_episode(self, episode, audio_url, show_info):
  41         info = {
  42             'id': compat_str(episode['id']),
  43             'display_id': episode.get('slug'),
  44             'title': episode['title'].strip(),
  45             'description': self._extract_description(episode),
  46             'duration': int_or_none(episode.get('duration')),
  47             'url': clean_podcast_url(audio_url),
  48             'vcodec': 'none',
  49             'timestamp': int_or_none(episode.get('date_published')),
  50             'season_number': int_or_none(episode.get('season')),
  51             'season_id': str_or_none(episode.get('season_id')),
  52         }
  53         info.update(show_info)
  54         return info
  55
  56
  57 class StitcherIE(StitcherBaseIE):
  58     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
  59     _TESTS = [{
  60         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
  61         'md5': 'e9635098e0da10b21a0e2b85585530f6',
  62         'info_dict': {
  63             'id': '40789481',
  64             'ext': 'mp3',
  65             'title': 'Machine Learning Mastery and Cancer Clusters',
  66             'description': 'md5:547adb4081864be114ae3831b4c2b42f',
  67             'duration': 1604,
  68             'thumbnail': r're:^https?://.*\.jpg',
  69             'upload_date': '20151008',
  70             'timestamp': 1444285800,
  71             'series': 'Talking Machines',
  72         },
  73     }, {
  74         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
  75         'info_dict': {
  76             'id': '40846275',
  77             'display_id': 'the-rare-hourlong-comedy-plus',
  78             'ext': 'mp3',
  79             'title': "The CW's 'Crazy Ex-Girlfriend'",
  80             'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
  81             'duration': 2235,
  82             'thumbnail': r're:^https?://.*\.jpg',
  83         },
  84         'params': {
  85             'skip_download': True,
  86         },
  87         'skip': 'Page Not Found',
  88     }, {
  89         # escaped title
  90         'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
  91         'only_matching': True,
  92     }, {
  93         'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
  94         'only_matching': True,
  95     }, {
  96         'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
  97         'only_matching': True,
  98     }]
  99
 100     def _real_extract(self, url):
 101         audio_id = self._match_id(url)
 102         data = self._call_api(
 103             'shows/episodes', audio_id, {'episode_ids': audio_id})
 104         episode = data['episodes'][0]
 105         audio_url = self._extract_audio_url(episode)
 106         if not audio_url:
 107             self.raise_login_required()
 108         show = try_get(data, lambda x: x['shows'][0], dict) or {}
 109         return self._extract_episode(
 110             episode, audio_url, self._extract_show_info(show))
 111
 112
 113 class StitcherShowIE(StitcherBaseIE):
 114     _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)'
 115     _TESTS = [{
 116         'url': 'http://www.stitcher.com/podcast/the-talking-machines',
 117         'info_dict': {
 118             'id': 'the-talking-machines',
 119             'title': 'Talking Machines',
 120             'description': 'md5:831f0995e40f26c10231af39cf1ebf0b',
 121         },
 122         'playlist_mincount': 106,
 123     }, {
 124         'url': 'https://www.stitcher.com/show/the-talking-machines',
 125         'only_matching': True,
 126     }]
 127
 128     def _real_extract(self, url):
 129         show_slug = self._match_id(url)
 130         data = self._call_api(
 131             'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000})
 132         show = try_get(data, lambda x: x['shows'][0], dict) or {}
 133         show_info = self._extract_show_info(show)
 134
 135         entries = []
 136         for episode in (data.get('episodes') or []):
 137             audio_url = self._extract_audio_url(episode)
 138             if not audio_url:
 139                 continue
 140             entries.append(self._extract_episode(episode, audio_url, show_info))
 141
 142         return self.playlist_result(
 143             entries, show_slug, show.get('title'),
 144             self._extract_description(show))