yt_dlp/extractor/crowdbunker.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     int_or_none,
   6     try_get,
   7     unified_strdate,
   8 )
   9
  10
  11 class CrowdBunkerIE(InfoExtractor):
  12     _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)'
  13
  14     _TESTS = [{
  15         'url': 'https://crowdbunker.com/v/0z4Kms8pi8I',
  16         'info_dict': {
  17             'id': '0z4Kms8pi8I',
  18             'ext': 'mp4',
  19             'title': '117) Pass vax et solutions',
  20             'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c',
  21             'view_count': int,
  22             'duration': 5386,
  23             'uploader': 'Jérémie Mercier',
  24             'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ',
  25             'like_count': int,
  26             'upload_date': '20211218',
  27             'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg'
  28         },
  29         'params': {'skip_download': True}
  30     }]
  31
  32     def _real_extract(self, url):
  33         id = self._match_id(url)
  34         data_json = self._download_json(f'https://api.divulg.org/post/{id}/details',
  35                                         id, headers={'accept': 'application/json, text/plain, */*'})
  36         video_json = data_json['video']
  37         formats, subtitles = [], {}
  38         for sub in video_json.get('captions') or []:
  39             sub_url = try_get(sub, lambda x: x['file']['url'])
  40             if not sub_url:
  41                 continue
  42             subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({
  43                 'url': sub_url,
  44             })
  45
  46         mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
  47         if mpd_url:
  48             fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id)
  49             formats.extend(fmts)
  50             subtitles = self._merge_subtitles(subtitles, subs)
  51         m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
  52         if m3u8_url:
  53             fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, id)
  54             formats.extend(fmts)
  55             subtitles = self._merge_subtitles(subtitles, subs)
  56
  57         thumbnails = [{
  58             'url': image['url'],
  59             'height': int_or_none(image.get('height')),
  60             'width': int_or_none(image.get('width')),
  61         } for image in video_json.get('thumbnails') or [] if image.get('url')]
  62
  63         return {
  64             'id': id,
  65             'title': video_json.get('title'),
  66             'description': video_json.get('description'),
  67             'view_count': video_json.get('viewCount'),
  68             'duration': video_json.get('duration'),
  69             'uploader': try_get(data_json, lambda x: x['channel']['name']),
  70             'uploader_id': try_get(data_json, lambda x: x['channel']['id']),
  71             'like_count': data_json.get('likesCount'),
  72             'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')),
  73             'thumbnails': thumbnails,
  74             'formats': formats,
  75             'subtitles': subtitles,
  76         }
  77
  78
  79 class CrowdBunkerChannelIE(InfoExtractor):
  80     _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)'
  81
  82     _TESTS = [{
  83         'url': 'https://crowdbunker.com/@Milan_UHRIN',
  84         'playlist_mincount': 14,
  85         'info_dict': {
  86             'id': 'Milan_UHRIN',
  87         },
  88     }]
  89
  90     def _entries(self, id):
  91         last = None
  92
  93         for page in itertools.count():
  94             channel_json = self._download_json(
  95                 f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'},
  96                 query={'after': last} if last else {}, note=f'Downloading Page {page}')
  97             for item in channel_json.get('items') or []:
  98                 v_id = item.get('uid')
  99                 if not v_id:
 100                     continue
 101                 yield self.url_result(
 102                     'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id)
 103             last = channel_json.get('last')
 104             if not last:
 105                 break
 106
 107     def _real_extract(self, url):
 108         id = self._match_id(url)
 109         return self.playlist_result(self._entries(id), playlist_id=id)