yt_dlp/extractor/spreaker.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..compat import compat_str
   5 from ..utils import (
   6     float_or_none,
   7     int_or_none,
   8     str_or_none,
   9     try_get,
  10     unified_timestamp,
  11     url_or_none,
  12 )
  13
  14
  15 def _extract_episode(data, episode_id=None):
  16     title = data['title']
  17     download_url = data['download_url']
  18
  19     series = try_get(data, lambda x: x['show']['title'], compat_str)
  20     uploader = try_get(data, lambda x: x['author']['fullname'], compat_str)
  21
  22     thumbnails = []
  23     for image in ('image_original', 'image_medium', 'image'):
  24         image_url = url_or_none(data.get('%s_url' % image))
  25         if image_url:
  26             thumbnails.append({'url': image_url})
  27
  28     def stats(key):
  29         return int_or_none(try_get(
  30             data,
  31             (lambda x: x['%ss_count' % key],
  32              lambda x: x['stats']['%ss' % key])))
  33
  34     def duration(key):
  35         return float_or_none(data.get(key), scale=1000)
  36
  37     return {
  38         'id': compat_str(episode_id or data['episode_id']),
  39         'url': download_url,
  40         'display_id': data.get('permalink'),
  41         'title': title,
  42         'description': data.get('description'),
  43         'timestamp': unified_timestamp(data.get('published_at')),
  44         'uploader': uploader,
  45         'uploader_id': str_or_none(data.get('author_id')),
  46         'creator': uploader,
  47         'duration': duration('duration') or duration('length'),
  48         'view_count': stats('play'),
  49         'like_count': stats('like'),
  50         'comment_count': stats('message'),
  51         'format': 'MPEG Layer 3',
  52         'format_id': 'mp3',
  53         'container': 'mp3',
  54         'ext': 'mp3',
  55         'thumbnails': thumbnails,
  56         'series': series,
  57         'extractor_key': SpreakerIE.ie_key(),
  58     }
  59
  60
  61 class SpreakerIE(InfoExtractor):
  62     _VALID_URL = r'''(?x)
  63                     https?://
  64                         api\.spreaker\.com/
  65                         (?:
  66                             (?:download/)?episode|
  67                             v2/episodes
  68                         )/
  69                         (?P<id>\d+)
  70                     '''
  71     _TESTS = [{
  72         'url': 'https://api.spreaker.com/episode/12534508',
  73         'info_dict': {
  74             'id': '12534508',
  75             'display_id': 'swm-ep15-how-to-market-your-music-part-2',
  76             'ext': 'mp3',
  77             'title': 'EP:15 | Music Marketing (Likes) - Part 2',
  78             'description': 'md5:0588c43e27be46423e183076fa071177',
  79             'timestamp': 1502250336,
  80             'upload_date': '20170809',
  81             'uploader': 'SWM',
  82             'uploader_id': '9780658',
  83             'duration': 1063.42,
  84             'view_count': int,
  85             'like_count': int,
  86             'comment_count': int,
  87             'series': 'Success With Music (SWM)',
  88         },
  89     }, {
  90         'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
  91         'only_matching': True,
  92     }, {
  93         'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
  94         'only_matching': True,
  95     }]
  96
  97     def _real_extract(self, url):
  98         episode_id = self._match_id(url)
  99         data = self._download_json(
 100             'https://api.spreaker.com/v2/episodes/%s' % episode_id,
 101             episode_id)['response']['episode']
 102         return _extract_episode(data, episode_id)
 103
 104
 105 class SpreakerPageIE(InfoExtractor):
 106     _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)'
 107     _TESTS = [{
 108         'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2',
 109         'only_matching': True,
 110     }]
 111
 112     def _real_extract(self, url):
 113         display_id = self._match_id(url)
 114         webpage = self._download_webpage(url, display_id)
 115         episode_id = self._search_regex(
 116             (r'data-episode_id=["\'](?P<id>\d+)',
 117              r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id')
 118         return self.url_result(
 119             'https://api.spreaker.com/episode/%s' % episode_id,
 120             ie=SpreakerIE.ie_key(), video_id=episode_id)
 121
 122
 123 class SpreakerShowIE(InfoExtractor):
 124     _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)'
 125     _TESTS = [{
 126         'url': 'https://api.spreaker.com/show/4652058',
 127         'info_dict': {
 128             'id': '4652058',
 129         },
 130         'playlist_mincount': 118,
 131     }]
 132
 133     def _entries(self, show_id):
 134         for page_num in itertools.count(1):
 135             episodes = self._download_json(
 136                 'https://api.spreaker.com/show/%s/episodes' % show_id,
 137                 show_id, note='Downloading JSON page %d' % page_num, query={
 138                     'page': page_num,
 139                     'max_per_page': 100,
 140                 })
 141             pager = try_get(episodes, lambda x: x['response']['pager'], dict)
 142             if not pager:
 143                 break
 144             results = pager.get('results')
 145             if not results or not isinstance(results, list):
 146                 break
 147             for result in results:
 148                 if not isinstance(result, dict):
 149                     continue
 150                 yield _extract_episode(result)
 151             if page_num == pager.get('last_page'):
 152                 break
 153
 154     def _real_extract(self, url):
 155         show_id = self._match_id(url)
 156         return self.playlist_result(self._entries(show_id), playlist_id=show_id)
 157
 158
 159 class SpreakerShowPageIE(InfoExtractor):
 160     _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)'
 161     _TESTS = [{
 162         'url': 'https://www.spreaker.com/show/success-with-music',
 163         'only_matching': True,
 164     }]
 165
 166     def _real_extract(self, url):
 167         display_id = self._match_id(url)
 168         webpage = self._download_webpage(url, display_id)
 169         show_id = self._search_regex(
 170             r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id')
 171         return self.url_result(
 172             'https://api.spreaker.com/show/%s' % show_id,
 173             ie=SpreakerShowIE.ie_key(), video_id=show_id)