yt_dlp/extractor/prx.py

   1 import itertools
   2
   3 from .common import InfoExtractor, SearchInfoExtractor
   4 from ..utils import (
   5     clean_html,
   6     int_or_none,
   7     mimetype2ext,
   8     str_or_none,
   9     traverse_obj,
  10     unified_timestamp,
  11     url_or_none,
  12     urljoin,
  13 )
  14
  15
  16 class PRXBaseIE(InfoExtractor):
  17     PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx.org/%s'
  18
  19     def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
  20         return self._download_json(
  21             urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
  22
  23     @staticmethod
  24     def _get_prx_embed_response(response, section):
  25         return traverse_obj(response, ('_embedded', f'prx:{section}'))
  26
  27     @staticmethod
  28     def _extract_file_link(response):
  29         return url_or_none(traverse_obj(
  30             response, ('_links', 'enclosure', 'href'), expected_type=str))
  31
  32     @classmethod
  33     def _extract_image(cls, image_response):
  34         if not isinstance(image_response, dict):
  35             return
  36         return {
  37             'id': str_or_none(image_response.get('id')),
  38             'filesize': image_response.get('size'),
  39             'width': image_response.get('width'),
  40             'height': image_response.get('height'),
  41             'url': cls._extract_file_link(image_response)
  42         }
  43
  44     @classmethod
  45     def _extract_base_info(cls, response):
  46         if not isinstance(response, dict):
  47             return
  48         item_id = str_or_none(response.get('id'))
  49         if not item_id:
  50             return
  51         thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
  52         description = (
  53             clean_html(response.get('description'))
  54             or response.get('shortDescription'))
  55         return {
  56             'id': item_id,
  57             'title': response.get('title') or item_id,
  58             'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
  59             'description': description,
  60             'release_timestamp': unified_timestamp(response.get('releasedAt')),
  61             'timestamp': unified_timestamp(response.get('createdAt')),
  62             'modified_timestamp': unified_timestamp(response.get('updatedAt')),
  63             'duration': int_or_none(response.get('duration')),
  64             'tags': response.get('tags'),
  65             'episode_number': int_or_none(response.get('episodeIdentifier')),
  66             'season_number': int_or_none(response.get('seasonIdentifier'))
  67         }
  68
  69     @classmethod
  70     def _extract_series_info(cls, series_response):
  71         base_info = cls._extract_base_info(series_response)
  72         if not base_info:
  73             return
  74         account_info = cls._extract_account_info(
  75             cls._get_prx_embed_response(series_response, 'account')) or {}
  76         return {
  77             **base_info,
  78             'channel_id': account_info.get('channel_id'),
  79             'channel_url': account_info.get('channel_url'),
  80             'channel': account_info.get('channel'),
  81             'series': base_info.get('title'),
  82             'series_id': base_info.get('id'),
  83         }
  84
  85     @classmethod
  86     def _extract_account_info(cls, account_response):
  87         base_info = cls._extract_base_info(account_response)
  88         if not base_info:
  89             return
  90         name = account_response.get('name')
  91         return {
  92             **base_info,
  93             'title': name,
  94             'channel_id': base_info.get('id'),
  95             'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
  96             'channel': name,
  97         }
  98
  99     @classmethod
 100     def _extract_story_info(cls, story_response):
 101         base_info = cls._extract_base_info(story_response)
 102         if not base_info:
 103             return
 104         series = cls._extract_series_info(
 105             cls._get_prx_embed_response(story_response, 'series')) or {}
 106         account = cls._extract_account_info(
 107             cls._get_prx_embed_response(story_response, 'account')) or {}
 108         return {
 109             **base_info,
 110             'series': series.get('series'),
 111             'series_id': series.get('series_id'),
 112             'channel_id': account.get('channel_id'),
 113             'channel_url': account.get('channel_url'),
 114             'channel': account.get('channel')
 115         }
 116
 117     def _entries(self, item_id, endpoint, entry_func, query=None):
 118         """
 119         Extract entries from paginated list API
 120         @param entry_func: Function to generate entry from response item
 121         """
 122         total = 0
 123         for page in itertools.count(1):
 124             response = self._call_api(f'{item_id}: page {page}', endpoint, query={
 125                 **(query or {}),
 126                 'page': page,
 127                 'per': 100
 128             })
 129             items = self._get_prx_embed_response(response, 'items')
 130             if not response or not items:
 131                 break
 132
 133             yield from filter(None, map(entry_func, items))
 134
 135             total += response['count']
 136             if total >= response['total']:
 137                 break
 138
 139     def _story_playlist_entry(self, response):
 140         story = self._extract_story_info(response)
 141         if not story:
 142             return
 143         story.update({
 144             '_type': 'url',
 145             'url': 'https://beta.prx.org/stories/%s' % story['id'],
 146             'ie_key': PRXStoryIE.ie_key()
 147         })
 148         return story
 149
 150     def _series_playlist_entry(self, response):
 151         series = self._extract_series_info(response)
 152         if not series:
 153             return
 154         series.update({
 155             '_type': 'url',
 156             'url': 'https://beta.prx.org/series/%s' % series['id'],
 157             'ie_key': PRXSeriesIE.ie_key()
 158         })
 159         return series
 160
 161
 162 class PRXStoryIE(PRXBaseIE):
 163     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
 164
 165     _TESTS = [
 166         {
 167             # Story with season and episode details
 168             'url': 'https://beta.prx.org/stories/399200',
 169             'info_dict': {
 170                 'id': '399200',
 171                 'title': 'Fly Me To The Moon',
 172                 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 173                 'release_timestamp': 1640250000,
 174                 'timestamp': 1640208972,
 175                 'modified_timestamp': 1641318202,
 176                 'duration': 1004,
 177                 'tags': 'count:7',
 178                 'episode_number': 8,
 179                 'season_number': 5,
 180                 'series': 'AirSpace',
 181                 'series_id': '38057',
 182                 'channel_id': '220986',
 183                 'channel_url': 'https://beta.prx.org/accounts/220986',
 184                 'channel': 'Air and Space Museum',
 185             },
 186             'playlist': [{
 187                 'info_dict': {
 188                     'id': '399200_part1',
 189                     'title': 'Fly Me To The Moon',
 190                     'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 191                     'release_timestamp': 1640250000,
 192                     'timestamp': 1640208972,
 193                     'modified_timestamp': 1641318202,
 194                     'duration': 530,
 195                     'tags': 'count:7',
 196                     'episode_number': 8,
 197                     'season_number': 5,
 198                     'series': 'AirSpace',
 199                     'series_id': '38057',
 200                     'channel_id': '220986',
 201                     'channel_url': 'https://beta.prx.org/accounts/220986',
 202                     'channel': 'Air and Space Museum',
 203                     'ext': 'mp3',
 204                     'upload_date': '20211222',
 205                     'episode': 'Episode 8',
 206                     'release_date': '20211223',
 207                     'season': 'Season 5',
 208                     'modified_date': '20220104'
 209                 }
 210             }, {
 211                 'info_dict': {
 212                     'id': '399200_part2',
 213                     'title': 'Fly Me To The Moon',
 214                     'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 215                     'release_timestamp': 1640250000,
 216                     'timestamp': 1640208972,
 217                     'modified_timestamp': 1641318202,
 218                     'duration': 474,
 219                     'tags': 'count:7',
 220                     'episode_number': 8,
 221                     'season_number': 5,
 222                     'series': 'AirSpace',
 223                     'series_id': '38057',
 224                     'channel_id': '220986',
 225                     'channel_url': 'https://beta.prx.org/accounts/220986',
 226                     'channel': 'Air and Space Museum',
 227                     'ext': 'mp3',
 228                     'upload_date': '20211222',
 229                     'episode': 'Episode 8',
 230                     'release_date': '20211223',
 231                     'season': 'Season 5',
 232                     'modified_date': '20220104'
 233                 }
 234             }
 235
 236             ]
 237         }, {
 238             # Story with only split audio
 239             'url': 'https://beta.prx.org/stories/326414',
 240             'info_dict': {
 241                 'id': '326414',
 242                 'title': 'Massachusetts v EPA',
 243                 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
 244                 'timestamp': 1592509124,
 245                 'modified_timestamp': 1592510457,
 246                 'duration': 3088,
 247                 'tags': 'count:0',
 248                 'series': 'Outside/In',
 249                 'series_id': '36252',
 250                 'channel_id': '206',
 251                 'channel_url': 'https://beta.prx.org/accounts/206',
 252                 'channel': 'New Hampshire Public Radio',
 253             },
 254             'playlist_count': 4
 255         }, {
 256             # Story with single combined audio
 257             'url': 'https://beta.prx.org/stories/400404',
 258             'info_dict': {
 259                 'id': '400404',
 260                 'title': 'Cafe Chill (Episode 2022-01)',
 261                 'thumbnails': 'count:1',
 262                 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
 263                 'timestamp': 1641233952,
 264                 'modified_timestamp': 1641234248,
 265                 'duration': 3540,
 266                 'series': 'Café Chill',
 267                 'series_id': '37762',
 268                 'channel_id': '5767',
 269                 'channel_url': 'https://beta.prx.org/accounts/5767',
 270                 'channel': 'C89.5 - KNHC Seattle',
 271                 'ext': 'mp3',
 272                 'tags': 'count:0',
 273                 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
 274                 'upload_date': '20220103',
 275                 'modified_date': '20220103'
 276             }
 277         }, {
 278             'url': 'https://listen.prx.org/stories/399200',
 279             'only_matching': True
 280         }
 281     ]
 282
 283     def _extract_audio_pieces(self, audio_response):
 284         return [{
 285             'format_id': str_or_none(piece_response.get('id')),
 286             'format_note': str_or_none(piece_response.get('label')),
 287             'filesize': int_or_none(piece_response.get('size')),
 288             'duration': int_or_none(piece_response.get('duration')),
 289             'ext': mimetype2ext(piece_response.get('contentType')),
 290             'asr': int_or_none(piece_response.get('frequency'), scale=1000),
 291             'abr': int_or_none(piece_response.get('bitRate')),
 292             'url': self._extract_file_link(piece_response),
 293             'vcodec': 'none'
 294         } for piece_response in sorted(
 295             self._get_prx_embed_response(audio_response, 'items') or [],
 296             key=lambda p: int_or_none(p.get('position')))]
 297
 298     def _extract_story(self, story_response):
 299         info = self._extract_story_info(story_response)
 300         if not info:
 301             return
 302         audio_pieces = self._extract_audio_pieces(
 303             self._get_prx_embed_response(story_response, 'audio'))
 304         if len(audio_pieces) == 1:
 305             return {
 306                 'formats': audio_pieces,
 307                 **info
 308             }
 309
 310         entries = [{
 311             **info,
 312             'id': '%s_part%d' % (info['id'], (idx + 1)),
 313             'formats': [fmt],
 314         } for idx, fmt in enumerate(audio_pieces)]
 315         return {
 316             '_type': 'multi_video',
 317             'entries': entries,
 318             **info
 319         }
 320
 321     def _real_extract(self, url):
 322         story_id = self._match_id(url)
 323         response = self._call_api(story_id, f'stories/{story_id}')
 324         return self._extract_story(response)
 325
 326
 327 class PRXSeriesIE(PRXBaseIE):
 328     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
 329     _TESTS = [
 330         {
 331             'url': 'https://beta.prx.org/series/36252',
 332             'info_dict': {
 333                 'id': '36252',
 334                 'title': 'Outside/In',
 335                 'thumbnails': 'count:1',
 336                 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
 337                 'timestamp': 1470684964,
 338                 'modified_timestamp': 1582308830,
 339                 'channel_id': '206',
 340                 'channel_url': 'https://beta.prx.org/accounts/206',
 341                 'channel': 'New Hampshire Public Radio',
 342                 'series': 'Outside/In',
 343                 'series_id': '36252'
 344             },
 345             'playlist_mincount': 39
 346         }, {
 347             # Blank series
 348             'url': 'https://beta.prx.org/series/25038',
 349             'info_dict': {
 350                 'id': '25038',
 351                 'title': '25038',
 352                 'timestamp': 1207612800,
 353                 'modified_timestamp': 1207612800,
 354                 'channel_id': '206',
 355                 'channel_url': 'https://beta.prx.org/accounts/206',
 356                 'channel': 'New Hampshire Public Radio',
 357                 'series': '25038',
 358                 'series_id': '25038'
 359             },
 360             'playlist_count': 0
 361         }
 362     ]
 363
 364     def _extract_series(self, series_response):
 365         info = self._extract_series_info(series_response)
 366         return {
 367             '_type': 'playlist',
 368             'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
 369             **info
 370         }
 371
 372     def _real_extract(self, url):
 373         series_id = self._match_id(url)
 374         response = self._call_api(series_id, f'series/{series_id}')
 375         return self._extract_series(response)
 376
 377
 378 class PRXAccountIE(PRXBaseIE):
 379     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
 380     _TESTS = [{
 381         'url': 'https://beta.prx.org/accounts/206',
 382         'info_dict': {
 383             'id': '206',
 384             'title': 'New Hampshire Public Radio',
 385             'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
 386             'channel_id': '206',
 387             'channel_url': 'https://beta.prx.org/accounts/206',
 388             'channel': 'New Hampshire Public Radio',
 389             'thumbnails': 'count:1'
 390         },
 391         'playlist_mincount': 380
 392     }]
 393
 394     def _extract_account(self, account_response):
 395         info = self._extract_account_info(account_response)
 396         series = self._entries(
 397             info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
 398         stories = self._entries(
 399             info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
 400         return {
 401             '_type': 'playlist',
 402             'entries': itertools.chain(series, stories),
 403             **info
 404         }
 405
 406     def _real_extract(self, url):
 407         account_id = self._match_id(url)
 408         response = self._call_api(account_id, f'accounts/{account_id}')
 409         return self._extract_account(response)
 410
 411
 412 class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
 413     IE_DESC = 'PRX Stories Search'
 414     IE_NAME = 'prxstories:search'
 415     _SEARCH_KEY = 'prxstories'
 416
 417     def _search_results(self, query):
 418         yield from self._entries(
 419             f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
 420
 421
 422 class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
 423     IE_DESC = 'PRX Series Search'
 424     IE_NAME = 'prxseries:search'
 425     _SEARCH_KEY = 'prxseries'
 426
 427     def _search_results(self, query):
 428         yield from self._entries(
 429             f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})