yt_dlp/extractor/prx.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
import itertools

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    int_or_none,
    mimetype2ext,
    str_or_none,
    traverse_obj,
    unified_timestamp,
    url_or_none,
    urljoin,
)
  16
  17
  18 class PRXBaseIE(InfoExtractor):
  19     PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx.org/%s'
  20
  21     def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
  22         return self._download_json(
  23             urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
  24
  25     @staticmethod
  26     def _get_prx_embed_response(response, section):
  27         return traverse_obj(response, ('_embedded', f'prx:{section}'))
  28
  29     @staticmethod
  30     def _extract_file_link(response):
  31         return url_or_none(traverse_obj(
  32             response, ('_links', 'enclosure', 'href'), expected_type=str))
  33
  34     @classmethod
  35     def _extract_image(cls, image_response):
  36         if not isinstance(image_response, dict):
  37             return
  38         return {
  39             'id': str_or_none(image_response.get('id')),
  40             'filesize': image_response.get('size'),
  41             'width': image_response.get('width'),
  42             'height': image_response.get('height'),
  43             'url': cls._extract_file_link(image_response)
  44         }
  45
  46     @classmethod
  47     def _extract_base_info(cls, response):
  48         if not isinstance(response, dict):
  49             return
  50         item_id = str_or_none(response.get('id'))
  51         if not item_id:
  52             return
  53         thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
  54         description = (
  55             clean_html(response.get('description'))
  56             or response.get('shortDescription'))
  57         return {
  58             'id': item_id,
  59             'title': response.get('title') or item_id,
  60             'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
  61             'description': description,
  62             'release_timestamp': unified_timestamp(response.get('releasedAt')),
  63             'timestamp': unified_timestamp(response.get('createdAt')),
  64             'modified_timestamp': unified_timestamp(response.get('updatedAt')),
  65             'duration': int_or_none(response.get('duration')),
  66             'tags': response.get('tags'),
  67             'episode_number': int_or_none(response.get('episodeIdentifier')),
  68             'season_number': int_or_none(response.get('seasonIdentifier'))
  69         }
  70
  71     @classmethod
  72     def _extract_series_info(cls, series_response):
  73         base_info = cls._extract_base_info(series_response)
  74         if not base_info:
  75             return
  76         account_info = cls._extract_account_info(
  77             cls._get_prx_embed_response(series_response, 'account')) or {}
  78         return {
  79             **base_info,
  80             'channel_id': account_info.get('channel_id'),
  81             'channel_url': account_info.get('channel_url'),
  82             'channel': account_info.get('channel'),
  83             'series': base_info.get('title'),
  84             'series_id': base_info.get('id'),
  85         }
  86
  87     @classmethod
  88     def _extract_account_info(cls, account_response):
  89         base_info = cls._extract_base_info(account_response)
  90         if not base_info:
  91             return
  92         name = account_response.get('name')
  93         return {
  94             **base_info,
  95             'title': name,
  96             'channel_id': base_info.get('id'),
  97             'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
  98             'channel': name,
  99         }
 100
 101     @classmethod
 102     def _extract_story_info(cls, story_response):
 103         base_info = cls._extract_base_info(story_response)
 104         if not base_info:
 105             return
 106         series = cls._extract_series_info(
 107             cls._get_prx_embed_response(story_response, 'series')) or {}
 108         account = cls._extract_account_info(
 109             cls._get_prx_embed_response(story_response, 'account')) or {}
 110         return {
 111             **base_info,
 112             'series': series.get('series'),
 113             'series_id': series.get('series_id'),
 114             'channel_id': account.get('channel_id'),
 115             'channel_url': account.get('channel_url'),
 116             'channel': account.get('channel')
 117         }
 118
 119     def _entries(self, item_id, endpoint, entry_func, query=None):
 120         """
 121         Extract entries from paginated list API
 122         @param entry_func: Function to generate entry from response item
 123         """
 124         total = 0
 125         for page in itertools.count(1):
 126             response = self._call_api(f'{item_id}: page {page}', endpoint, query={
 127                 **(query or {}),
 128                 'page': page,
 129                 'per': 100
 130             })
 131             items = self._get_prx_embed_response(response, 'items')
 132             if not response or not items:
 133                 break
 134
 135             yield from filter(None, map(entry_func, items))
 136
 137             total += response['count']
 138             if total >= response['total']:
 139                 break
 140
 141     def _story_playlist_entry(self, response):
 142         story = self._extract_story_info(response)
 143         if not story:
 144             return
 145         story.update({
 146             '_type': 'url',
 147             'url': 'https://beta.prx.org/stories/%s' % story['id'],
 148             'ie_key': PRXStoryIE.ie_key()
 149         })
 150         return story
 151
 152     def _series_playlist_entry(self, response):
 153         series = self._extract_series_info(response)
 154         if not series:
 155             return
 156         series.update({
 157             '_type': 'url',
 158             'url': 'https://beta.prx.org/series/%s' % series['id'],
 159             'ie_key': PRXSeriesIE.ie_key()
 160         })
 161         return series
 162
 163
 164 class PRXStoryIE(PRXBaseIE):
 165     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
 166
 167     _TESTS = [
 168         {
 169             # Story with season and episode details
 170             'url': 'https://beta.prx.org/stories/399200',
 171             'info_dict': {
 172                 'id': '399200',
 173                 'title': 'Fly Me To The Moon',
 174                 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 175                 'release_timestamp': 1640250000,
 176                 'timestamp': 1640208972,
 177                 'modified_timestamp': 1641318202,
 178                 'duration': 1004,
 179                 'tags': 'count:7',
 180                 'episode_number': 8,
 181                 'season_number': 5,
 182                 'series': 'AirSpace',
 183                 'series_id': '38057',
 184                 'channel_id': '220986',
 185                 'channel_url': 'https://beta.prx.org/accounts/220986',
 186                 'channel': 'Air and Space Museum',
 187             },
 188             'playlist': [{
 189                 'info_dict': {
 190                     'id': '399200_part1',
 191                     'title': 'Fly Me To The Moon',
 192                     'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 193                     'release_timestamp': 1640250000,
 194                     'timestamp': 1640208972,
 195                     'modified_timestamp': 1641318202,
 196                     'duration': 530,
 197                     'tags': 'count:7',
 198                     'episode_number': 8,
 199                     'season_number': 5,
 200                     'series': 'AirSpace',
 201                     'series_id': '38057',
 202                     'channel_id': '220986',
 203                     'channel_url': 'https://beta.prx.org/accounts/220986',
 204                     'channel': 'Air and Space Museum',
 205                     'ext': 'mp3',
 206                     'upload_date': '20211222',
 207                     'episode': 'Episode 8',
 208                     'release_date': '20211223',
 209                     'season': 'Season 5',
 210                     'modified_date': '20220104'
 211                 }
 212             }, {
 213                 'info_dict': {
 214                     'id': '399200_part2',
 215                     'title': 'Fly Me To The Moon',
 216                     'description': 'md5:43230168390b95d3322048d8a56bf2bb',
 217                     'release_timestamp': 1640250000,
 218                     'timestamp': 1640208972,
 219                     'modified_timestamp': 1641318202,
 220                     'duration': 474,
 221                     'tags': 'count:7',
 222                     'episode_number': 8,
 223                     'season_number': 5,
 224                     'series': 'AirSpace',
 225                     'series_id': '38057',
 226                     'channel_id': '220986',
 227                     'channel_url': 'https://beta.prx.org/accounts/220986',
 228                     'channel': 'Air and Space Museum',
 229                     'ext': 'mp3',
 230                     'upload_date': '20211222',
 231                     'episode': 'Episode 8',
 232                     'release_date': '20211223',
 233                     'season': 'Season 5',
 234                     'modified_date': '20220104'
 235                 }
 236             }
 237
 238             ]
 239         }, {
 240             # Story with only split audio
 241             'url': 'https://beta.prx.org/stories/326414',
 242             'info_dict': {
 243                 'id': '326414',
 244                 'title': 'Massachusetts v EPA',
 245                 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
 246                 'timestamp': 1592509124,
 247                 'modified_timestamp': 1592510457,
 248                 'duration': 3088,
 249                 'tags': 'count:0',
 250                 'series': 'Outside/In',
 251                 'series_id': '36252',
 252                 'channel_id': '206',
 253                 'channel_url': 'https://beta.prx.org/accounts/206',
 254                 'channel': 'New Hampshire Public Radio',
 255             },
 256             'playlist_count': 4
 257         }, {
 258             # Story with single combined audio
 259             'url': 'https://beta.prx.org/stories/400404',
 260             'info_dict': {
 261                 'id': '400404',
 262                 'title': 'Cafe Chill (Episode 2022-01)',
 263                 'thumbnails': 'count:1',
 264                 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
 265                 'timestamp': 1641233952,
 266                 'modified_timestamp': 1641234248,
 267                 'duration': 3540,
 268                 'series': 'Café Chill',
 269                 'series_id': '37762',
 270                 'channel_id': '5767',
 271                 'channel_url': 'https://beta.prx.org/accounts/5767',
 272                 'channel': 'C89.5 - KNHC Seattle',
 273                 'ext': 'mp3',
 274                 'tags': 'count:0',
 275                 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
 276                 'upload_date': '20220103',
 277                 'modified_date': '20220103'
 278             }
 279         }, {
 280             'url': 'https://listen.prx.org/stories/399200',
 281             'only_matching': True
 282         }
 283     ]
 284
 285     def _extract_audio_pieces(self, audio_response):
 286         return [{
 287             'format_id': str_or_none(piece_response.get('id')),
 288             'format_note': str_or_none(piece_response.get('label')),
 289             'filesize': int_or_none(piece_response.get('size')),
 290             'duration': int_or_none(piece_response.get('duration')),
 291             'ext': mimetype2ext(piece_response.get('contentType')),
 292             'asr': int_or_none(piece_response.get('frequency'), scale=1000),
 293             'abr': int_or_none(piece_response.get('bitRate')),
 294             'url': self._extract_file_link(piece_response),
 295             'vcodec': 'none'
 296         } for piece_response in sorted(
 297             self._get_prx_embed_response(audio_response, 'items') or [],
 298             key=lambda p: int_or_none(p.get('position')))]
 299
 300     def _extract_story(self, story_response):
 301         info = self._extract_story_info(story_response)
 302         if not info:
 303             return
 304         audio_pieces = self._extract_audio_pieces(
 305             self._get_prx_embed_response(story_response, 'audio'))
 306         if len(audio_pieces) == 1:
 307             return {
 308                 'formats': audio_pieces,
 309                 **info
 310             }
 311
 312         entries = [{
 313             **info,
 314             'id': '%s_part%d' % (info['id'], (idx + 1)),
 315             'formats': [fmt],
 316         } for idx, fmt in enumerate(audio_pieces)]
 317         return {
 318             '_type': 'multi_video',
 319             'entries': entries,
 320             **info
 321         }
 322
 323     def _real_extract(self, url):
 324         story_id = self._match_id(url)
 325         response = self._call_api(story_id, f'stories/{story_id}')
 326         return self._extract_story(response)
 327
 328
 329 class PRXSeriesIE(PRXBaseIE):
 330     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
 331     _TESTS = [
 332         {
 333             'url': 'https://beta.prx.org/series/36252',
 334             'info_dict': {
 335                 'id': '36252',
 336                 'title': 'Outside/In',
 337                 'thumbnails': 'count:1',
 338                 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
 339                 'timestamp': 1470684964,
 340                 'modified_timestamp': 1582308830,
 341                 'channel_id': '206',
 342                 'channel_url': 'https://beta.prx.org/accounts/206',
 343                 'channel': 'New Hampshire Public Radio',
 344                 'series': 'Outside/In',
 345                 'series_id': '36252'
 346             },
 347             'playlist_mincount': 39
 348         }, {
 349             # Blank series
 350             'url': 'https://beta.prx.org/series/25038',
 351             'info_dict': {
 352                 'id': '25038',
 353                 'title': '25038',
 354                 'timestamp': 1207612800,
 355                 'modified_timestamp': 1207612800,
 356                 'channel_id': '206',
 357                 'channel_url': 'https://beta.prx.org/accounts/206',
 358                 'channel': 'New Hampshire Public Radio',
 359                 'series': '25038',
 360                 'series_id': '25038'
 361             },
 362             'playlist_count': 0
 363         }
 364     ]
 365
 366     def _extract_series(self, series_response):
 367         info = self._extract_series_info(series_response)
 368         return {
 369             '_type': 'playlist',
 370             'entries': self._entries(info['id'], 'series/%s/stories' % info['id'], self._story_playlist_entry),
 371             **info
 372         }
 373
 374     def _real_extract(self, url):
 375         series_id = self._match_id(url)
 376         response = self._call_api(series_id, f'series/{series_id}')
 377         return self._extract_series(response)
 378
 379
 380 class PRXAccountIE(PRXBaseIE):
 381     _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
 382     _TESTS = [{
 383         'url': 'https://beta.prx.org/accounts/206',
 384         'info_dict': {
 385             'id': '206',
 386             'title': 'New Hampshire Public Radio',
 387             'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
 388             'channel_id': '206',
 389             'channel_url': 'https://beta.prx.org/accounts/206',
 390             'channel': 'New Hampshire Public Radio',
 391             'thumbnails': 'count:1'
 392         },
 393         'playlist_mincount': 380
 394     }]
 395
 396     def _extract_account(self, account_response):
 397         info = self._extract_account_info(account_response)
 398         series = self._entries(
 399             info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
 400         stories = self._entries(
 401             info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
 402         return {
 403             '_type': 'playlist',
 404             'entries': itertools.chain(series, stories),
 405             **info
 406         }
 407
 408     def _real_extract(self, url):
 409         account_id = self._match_id(url)
 410         response = self._call_api(account_id, f'accounts/{account_id}')
 411         return self._extract_account(response)
 412
 413
 414 class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
 415     IE_DESC = 'PRX Stories Search'
 416     IE_NAME = 'prxstories:search'
 417     _SEARCH_KEY = 'prxstories'
 418
 419     def _search_results(self, query):
 420         yield from self._entries(
 421             f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
 422
 423
 424 class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
 425     IE_DESC = 'PRX Series Search'
 426     IE_NAME = 'prxseries:search'
 427     _SEARCH_KEY = 'prxseries'
 428
 429     def _search_results(self, query):
 430         yield from self._entries(
 431             f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})