jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/crowdbunker.py
Reject entire playlists faster with `--match-filter`
[yt-dlp.git] / yt_dlp / extractor / crowdbunker.py
1 import itertools
2
3 from .common import InfoExtractor
4 from ..utils import (
5 int_or_none,
6 try_get,
7 unified_strdate,
8 )
9
10
class CrowdBunkerIE(InfoExtractor):
    """Extractor for single CrowdBunker video pages (``crowdbunker.com/v/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)'

    _TESTS = [{
        'url': 'https://crowdbunker.com/v/0z4Kms8pi8I',
        'info_dict': {
            'id': '0z4Kms8pi8I',
            'ext': 'mp4',
            'title': '117) Pass vax et solutions',
            'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c',
            'view_count': int,
            'duration': 5386,
            'uploader': 'Jérémie Mercier',
            'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ',
            'like_count': int,
            'upload_date': '20211218',
            'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg',
        },
        'params': {'skip_download': True},
    }]

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        Downloads the post details JSON from ``api.divulg.org`` and collects
        captions, DASH formats, HLS formats and thumbnails from its ``video``
        object. The ``video`` key is accessed by index, so it is required in
        the API response; every other field is looked up defensively.
        """
        video_id = self._match_id(url)
        data_json = self._download_json(
            f'https://api.divulg.org/post/{video_id}/details', video_id,
            headers={'accept': 'application/json, text/plain, */*'})
        video_json = data_json['video']

        formats, subtitles = [], {}

        # Captions listed directly in the API response; entries without a
        # file URL are skipped, and a missing language code falls back to 'fr'.
        for sub in video_json.get('captions') or []:
            sub_url = try_get(sub, lambda x: x['file']['url'])
            if not sub_url:
                continue
            subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({
                'url': sub_url,
            })

        mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
        if mpd_url:
            fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id)
            formats.extend(fmts)
            subtitles = self._merge_subtitles(subtitles, subs)

        m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
        if m3u8_url:
            # The HLS manifest must be read from its own URL, not the DASH
            # one: mpd_url may be None here and is never an m3u8 playlist.
            fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
            formats.extend(fmts)
            subtitles = self._merge_subtitles(subtitles, subs)

        # Only thumbnails that actually carry a URL are kept.
        thumbnails = [{
            'url': image['url'],
            'height': int_or_none(image.get('height')),
            'width': int_or_none(image.get('width')),
        } for image in video_json.get('thumbnails') or [] if image.get('url')]

        self._sort_formats(formats)
        return {
            'id': video_id,
            'title': video_json.get('title'),
            'description': video_json.get('description'),
            'view_count': video_json.get('viewCount'),
            'duration': video_json.get('duration'),
            'uploader': try_get(data_json, lambda x: x['channel']['name']),
            'uploader_id': try_get(data_json, lambda x: x['channel']['id']),
            'like_count': data_json.get('likesCount'),
            # Prefer the publication date; fall back to the creation date.
            'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')),
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
78
79
class CrowdBunkerChannelIE(InfoExtractor):
    """Extractor for CrowdBunker channel pages (``crowdbunker.com/@<name>``)."""

    _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)'

    _TESTS = [{
        'url': 'https://crowdbunker.com/@Milan_UHRIN',
        'playlist_mincount': 14,
        'info_dict': {
            'id': 'Milan_UHRIN',
        },
    }]

    def _entries(self, id):
        """Lazily yield a URL result for every post of channel *id*.

        Pages are requested one after another; the ``last`` value returned by
        a page is sent as the ``after`` query parameter of the next request,
        and iteration ends once a page returns no ``last`` value.
        """
        cursor = None

        for page_num in itertools.count():
            # The first request carries no cursor at all.
            query = {'after': cursor} if cursor else {}
            response = self._download_json(
                f'https://api.divulg.org/organization/{id}/posts', id,
                headers={'accept': 'application/json, text/plain, */*'},
                query=query, note=f'Downloading Page {page_num}')

            posts = response.get('items') or []
            for post in posts:
                uid = post.get('uid')
                # Posts without a uid cannot be turned into a video URL.
                if uid:
                    yield self.url_result(
                        'https://crowdbunker.com/v/%s' % uid,
                        ie=CrowdBunkerIE.ie_key(), video_id=uid)

            cursor = response.get('last')
            if not cursor:
                return

    def _real_extract(self, url):
        """Return a playlist of all videos of the channel named in *url*."""
        channel_id = self._match_id(url)
        entries = self._entries(channel_id)
        return self.playlist_result(entries, playlist_id=channel_id)