yt_dlp/extractor/prx.py

   1 import itertools
   2 from .common import InfoExtractor, SearchInfoExtractor
   3 from ..utils import (
   4     urljoin,
   5     traverse_obj,
   6     int_or_none,
   7     mimetype2ext,
   8     clean_html,
   9     url_or_none,
  10     unified_timestamp,
  11     str_or_none,
  12 )
  13
  14
  15 class PRXBaseIE(InfoExtractor):
  16     PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx.org/%s'
  17
  18     def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
  19         return self._download_json(
  20             urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
  21
  22     @staticmethod
  23     def _get_prx_embed_response(response, section):
  24         return traverse_obj(response, ('_embedded', f'prx:{section}'))
  25
  26     @staticmethod
  27     def _extract_file_link(response):
  28         return url_or_none(traverse_obj(
  29             response, ('_links', 'enclosure', 'href'), expected_type=str))
  30
  31     @classmethod
  32     def _extract_image(cls, image_response):
  33         if not isinstance(image_response, dict):
  34             return
  35         return {
  36             'id': str_or_none(image_response.get('id')),
  37             'filesize': image_response.get('size'),
  38             'width': image_response.get('width'),
  39             'height': image_response.get('height'),
  40             'url': cls._extract_file_link(image_response)
  41         }
  42
  43     @classmethod
  44     def _extract_base_info(cls, response):
  45         if not isinstance(response, dict):
  46             return
  47         item_id = str_or_none(response.get('id'))
  48         if not item_id:
  49             return
  50         thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
  51         description = (
  52             clean_html(response.get('description'))
  53             or response.get('shortDescription'))
  54         return {
  55             'id': item_id,
  56             'title': response.get('title') or item_id,
  57             'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
  58             'description': description,
  59             'release_timestamp': unified_timestamp(response.get('releasedAt')),
  60             'timestamp': unified_timestamp(response.get('createdAt')),
  61             'modified_timestamp': unified_timestamp(response.get('updatedAt')),
  62             'duration': int_or_none(response.get('duration')),
  63             'tags': response.get('tags'),
  64             'episode_number': int_or_none(response.get('episodeIdentifier')),
  65             'season_number': int_or_none(response.get('seasonIdentifier'))
  66         }
  67
  68     @classmethod
  69     def _extract_series_info(cls, series_response):
  70         base_info = cls._extract_base_info(series_response)
  71         if not base_info:
  72             return
  73         account_info = cls._extract_account_info(
  74             cls._get_prx_embed_response(series_response, 'account')) or {}
  75         return {
  76             **base_info,
  77             'channel_id': account_info.get('channel_id'),
  78             'channel_url': account_info.get('channel_url'),
  79             'channel': account_info.get('channel'),
  80             'series': base_info.get('title'),
  81             'series_id': base_info.get('id'),
  82         }
  83
  84     @classmethod
  85     def _extract_account_info(cls, account_response):
  86         base_info = cls._extract_base_info(account_response)
  87         if not base_info:
  88             return
  89         name = account_response.get('name')
  90         return {
  91             **base_info,
  92             'title': name,
  93             'channel_id': base_info.get('id'),
  94             'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
  95             'channel': name,
  96         }
  97
  98     @classmethod
  99     def _extract_story_info(cls, story_response):
 100         base_info = cls._extract_base_info(story_response)
 101         if not base_info:
 102             return
 103         series = cls._extract_series_info(
 104             cls._get_prx_embed_response(story_response, 'series')) or {}
 105         account = cls._extract_account_info(
 106             cls._get_prx_embed_response(story_response, 'account')) or {}
 107         return {
 108             **base_info,
 109             'series': series.get('series'),
 110             'series_id': series.get('series_id'),
 111             'channel_id': account.get('channel_id'),
 112             'channel_url': account.get('channel_url'),
 113             'channel': account.get('channel')
 114         }
 115
 116     def _entries(self, item_id, endpoint, entry_func, query=None):
 117         """
 118         Extract entries from paginated list API
 119         @param entry_func: Function to generate entry from response item
 120         """
 121         total = 0
 122         for page in itertools.count(1):
 123             response = self._call_api(f'{item_id}: page {page}', endpoint, query={
 124                 **(query or {}),
 125                 'page': page,
 126                 'per': 100
 127             })
 128             items = self._get_prx_embed_response(response, 'items')
 129             if not response or not items:
 130                 break
 131
 132             yield from filter(None, map(entry_func, items))
 133
 134             total += response['count']
 135             if total >= response['total']:
 136                 break
 137
 138     def _story_playlist_entry(self, response):
 139         story = self._extract_story_info(response)
 140         if not story:
 141             return
 142         story.update({
 143             '_type': 'url',
 144             'url': 'https://beta.prx.org/stories/%s' % story['id'],
 145             'ie_key': PRXStoryIE.ie_key()
 146         })
 147         return story
 148
 149     def _series_playlist_entry(self, response):
 150         series = self._extract_series_info(response)
 151         if not series:
 152             return
 153         series.update({
 154             '_type': 'url',
 155             'url': 'https://beta.prx.org/series/%s' % series['id'],
 156             'ie_key': PRXSeriesIE.ie_key()
 157         })
 158         return series
 159
 160
 161 class PRXStoryIE(PRXBaseIE):
 162     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
 163
 164     _TESTS = [
 165         {
 166             # Story with season and episode details
 167             'url': 'https://beta.prx.org/stories/399200',
 168             'info_dict': {
 169                 'id': '399200',
 170                 'title': 'Fly Me To The Moon',
 171                 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 172                 'release_timestamp': 1640250000,
 173                 'timestamp': 1640208972,
 174                 'modified_timestamp': 1641318202,
 175                 'duration': 1004,
 176                 'tags': 'count:7',
 177                 'episode_number': 8,
 178                 'season_number': 5,
 179                 'series': 'AirSpace',
 180                 'series_id': '38057',
 181                 'channel_id': '220986',
 182                 'channel_url': 'https://beta.prx.org/accounts/220986',
 183                 'channel': 'Air and Space Museum',
 184             },
 185             'playlist': [{
 186                 'info_dict': {
 187                     'id': '399200_part1',
 188                     'title': 'Fly Me To The Moon',
 189                     'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 190                     'release_timestamp': 1640250000,
 191                     'timestamp': 1640208972,
 192                     'modified_timestamp': 1641318202,
 193                     'duration': 530,
 194                     'tags': 'count:7',
 195                     'episode_number': 8,
 196                     'season_number': 5,
 197                     'series': 'AirSpace',
 198                     'series_id': '38057',
 199                     'channel_id': '220986',
 200                     'channel_url': 'https://beta.prx.org/accounts/220986',
 201                     'channel': 'Air and Space Museum',
 202                     'ext': 'mp3',
 203                     'upload_date': '20211222',
 204                     'episode': 'Episode 8',
 205                     'release_date': '20211223',
 206                     'season': 'Season 5',
 207                     'modified_date': '20220104'
 208                 }
 209             }, {
 210                 'info_dict': {
 211                     'id': '399200_part2',
 212                     'title': 'Fly Me To The Moon',
 213                     'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 214                     'release_timestamp': 1640250000,
 215                     'timestamp': 1640208972,
 216                     'modified_timestamp': 1641318202,
 217                     'duration': 474,
 218                     'tags': 'count:7',
 219                     'episode_number': 8,
 220                     'season_number': 5,
 221                     'series': 'AirSpace',
 222                     'series_id': '38057',
 223                     'channel_id': '220986',
 224                     'channel_url': 'https://beta.prx.org/accounts/220986',
 225                     'channel': 'Air and Space Museum',
 226                     'ext': 'mp3',
 227                     'upload_date': '20211222',
 228                     'episode': 'Episode 8',
 229                     'release_date': '20211223',
 230                     'season': 'Season 5',
 231                     'modified_date': '20220104'
 232                 }
 233             }
 234
 235             ]
 236         }, {
 237             # Story with only split audio
 238             'url': 'https://beta.prx.org/stories/326414',
 239             'info_dict': {
 240                 'id': '326414',
 241                 'title': 'Massachusetts v EPA',
 242                 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
 243                 'timestamp': 1592509124,
 244                 'modified_timestamp': 1592510457,
 245                 'duration': 3088,
 246                 'tags': 'count:0',
 247                 'series': 'Outside/In',
 248                 'series_id': '36252',
 249                 'channel_id': '206',
 250                 'channel_url': 'https://beta.prx.org/accounts/206',
 251                 'channel': 'New Hampshire Public Radio',
 252             },
 253             'playlist_count': 4
 254         }, {
 255             # Story with single combined audio
 256             'url': 'https://beta.prx.org/stories/400404',
 257             'info_dict': {
 258                 'id': '400404',
 259                 'title': 'Cafe Chill (Episode 2022-01)',
 260                 'thumbnails': 'count:1',
 261                 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
 262                 'timestamp': 1641233952,
 263                 'modified_timestamp': 1641234248,
 264                 'duration': 3540,
 265                 'series': 'Café Chill',
 266                 'series_id': '37762',
 267                 'channel_id': '5767',
 268                 'channel_url': 'https://beta.prx.org/accounts/5767',
 269                 'channel': 'C89.5 - KNHC Seattle',
 270                 'ext': 'mp3',
 271                 'tags': 'count:0',
 272                 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
 273                 'upload_date': '20220103',
 274                 'modified_date': '20220103'
 275             }
 276         }, {
 277             'url': 'https://listen.prx.org/stories/399200',
 278             'only_matching': True
 279         }
 280     ]
 281
 282     def _extract_audio_pieces(self, audio_response):
 283         return [{
 284             'format_id': str_or_none(piece_response.get('id')),
 285             'format_note': str_or_none(piece_response.get('label')),
 286             'filesize': int_or_none(piece_response.get('size')),
 287             'duration': int_or_none(piece_response.get('duration')),
 288             'ext': mimetype2ext(piece_response.get('contentType')),
 289             'asr': int_or_none(piece_response.get('frequency'), scale=1000),
 290             'abr': int_or_none(piece_response.get('bitRate')),
 291             'url': self._extract_file_link(piece_response),
 292             'vcodec': 'none'
 293         } for piece_response in sorted(
 294             self._get_prx_embed_response(audio_response, 'items') or [],
 295             key=lambda p: int_or_none(p.get('position')))]
 296
 297     def _extract_story(self, story_response):
 298         info = self._extract_story_info(story_response)
 299         if not info:
 300             return
 301         audio_pieces = self._extract_audio_pieces(
 302             self._get_prx_embed_response(story_response, 'audio'))
 303         if len(audio_pieces) == 1:
 304             return {
 305                 'formats': audio_pieces,
 306                 **info
 307             }
 308
 309         entries = [{
 310             **info,
 311             'id': '%s_part%d' % (info['id'], (idx + 1)),
 312             'formats': [fmt],
 313         } for idx, fmt in enumerate(audio_pieces)]
 314         return {
 315             '_type': 'multi_video',
 316             'entries': entries,
 317             **info
 318         }
 319
 320     def _real_extract(self, url):
 321         story_id = self._match_id(url)
 322         response = self._call_api(story_id, f'stories/{story_id}')
 323         return self._extract_story(response)
 324
 325
 326 class PRXSeriesIE(PRXBaseIE):
 327     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
 328     _TESTS = [
 329         {
 330             'url': 'https://beta.prx.org/series/36252',
 331             'info_dict': {
 332                 'id': '36252',
 333                 'title': 'Outside/In',
 334                 'thumbnails': 'count:1',
 335                 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
 336                 'timestamp': 1470684964,
 337                 'modified_timestamp': 1582308830,
 338                 'channel_id': '206',
 339                 'channel_url': 'https://beta.prx.org/accounts/206',
 340                 'channel': 'New Hampshire Public Radio',
 341                 'series': 'Outside/In',
 342                 'series_id': '36252'
 343             },
 344             'playlist_mincount': 39
 345         }, {
 346             # Blank series
 347             'url': 'https://beta.prx.org/series/25038',
 348             'info_dict': {
 349                 'id': '25038',
 350                 'title': '25038',
 351                 'timestamp': 1207612800,
 352                 'modified_timestamp': 1207612800,
 353                 'channel_id': '206',
 354                 'channel_url': 'https://beta.prx.org/accounts/206',
 355                 'channel': 'New Hampshire Public Radio',
 356                 'series': '25038',
 357                 'series_id': '25038'
 358             },
 359             'playlist_count': 0
 360         }
 361     ]
 362
 363     def _extract_series(self, series_response):
 364         info = self._extract_series_info(series_response)
 365         return {
 366             '_type': 'playlist',
 367             'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
 368             **info
 369         }
 370
 371     def _real_extract(self, url):
 372         series_id = self._match_id(url)
 373         response = self._call_api(series_id, f'series/{series_id}')
 374         return self._extract_series(response)
 375
 376
 377 class PRXAccountIE(PRXBaseIE):
 378     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
 379     _TESTS = [{
 380         'url': 'https://beta.prx.org/accounts/206',
 381         'info_dict': {
 382             'id': '206',
 383             'title': 'New Hampshire Public Radio',
 384             'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
 385             'channel_id': '206',
 386             'channel_url': 'https://beta.prx.org/accounts/206',
 387             'channel': 'New Hampshire Public Radio',
 388             'thumbnails': 'count:1'
 389         },
 390         'playlist_mincount': 380
 391     }]
 392
 393     def _extract_account(self, account_response):
 394         info = self._extract_account_info(account_response)
 395         series = self._entries(
 396             info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
 397         stories = self._entries(
 398             info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
 399         return {
 400             '_type': 'playlist',
 401             'entries': itertools.chain(series, stories),
 402             **info
 403         }
 404
 405     def _real_extract(self, url):
 406         account_id = self._match_id(url)
 407         response = self._call_api(account_id, f'accounts/{account_id}')
 408         return self._extract_account(response)
 409
 410
 411 class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
 412     IE_DESC = 'PRX Stories Search'
 413     IE_NAME = 'prxstories:search'
 414     _SEARCH_KEY = 'prxstories'
 415
 416     def _search_results(self, query):
 417         yield from self._entries(
 418             f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
 419
 420
 421 class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
 422     IE_DESC = 'PRX Series Search'
 423     IE_NAME = 'prxseries:search'
 424     _SEARCH_KEY = 'prxseries'
 425
 426     def _search_results(self, query):
 427         yield from self._entries(
 428             f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})