]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/crowdbunker.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / crowdbunker.py
1 import itertools
2
3 from .common import InfoExtractor
4 from ..utils import (
5 int_or_none,
6 try_get,
7 unified_strdate,
8 )
9
10
class CrowdBunkerIE(InfoExtractor):
    """Extractor for single crowdbunker.com videos (``/v/<id>`` URLs)."""

    _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P<id>[^/?#$&]+)'

    _TESTS = [{
        'url': 'https://crowdbunker.com/v/0z4Kms8pi8I',
        'info_dict': {
            'id': '0z4Kms8pi8I',
            'ext': 'mp4',
            'title': '117) Pass vax et solutions',
            'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c',
            'view_count': int,
            'duration': 5386,
            'uploader': 'Jérémie Mercier',
            'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ',
            'like_count': int,
            'upload_date': '20211218',
            'thumbnail': 'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg',
        },
        'params': {'skip_download': True},
    }]

    def _real_extract(self, url):
        """Fetch the post details JSON for *url* and build the info dict.

        Formats are collected from the DASH and HLS manifests referenced by
        the ``video`` object (whichever are present); subtitles come from the
        ``captions`` list and are merged with any subtitles found in the
        manifests.
        """
        video_id = self._match_id(url)
        data_json = self._download_json(
            f'https://api.divulg.org/post/{video_id}/details', video_id,
            headers={'accept': 'application/json, text/plain, */*'})
        video_json = data_json['video']

        formats, subtitles = [], {}
        for sub in video_json.get('captions') or []:
            sub_url = try_get(sub, lambda x: x['file']['url'])
            if not sub_url:
                continue
            # `or` rather than a .get() default, so an explicit null
            # languageCode also falls back to 'fr' instead of a None key.
            subtitles.setdefault(sub.get('languageCode') or 'fr', []).append({
                'url': sub_url,
            })

        mpd_url = try_get(video_json, lambda x: x['dashManifest']['url'])
        if mpd_url:
            fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, video_id)
            formats.extend(fmts)
            subtitles = self._merge_subtitles(subtitles, subs)

        m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url'])
        if m3u8_url:
            # The HLS manifest URL must be used here; the DASH URL was
            # previously passed by mistake.
            fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id)
            formats.extend(fmts)
            subtitles = self._merge_subtitles(subtitles, subs)

        # Only thumbnails that actually carry a URL are kept.
        thumbnails = [{
            'url': image['url'],
            'height': int_or_none(image.get('height')),
            'width': int_or_none(image.get('width')),
        } for image in video_json.get('thumbnails') or [] if image.get('url')]

        return {
            'id': video_id,
            'title': video_json.get('title'),
            'description': video_json.get('description'),
            'view_count': video_json.get('viewCount'),
            'duration': video_json.get('duration'),
            'uploader': try_get(data_json, lambda x: x['channel']['name']),
            'uploader_id': try_get(data_json, lambda x: x['channel']['id']),
            'like_count': data_json.get('likesCount'),
            # Prefer the publication date, falling back to the creation date.
            'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')),
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }
77
78
class CrowdBunkerChannelIE(InfoExtractor):
    """Playlist extractor for crowdbunker.com channel pages (``/@<name>`` URLs)."""

    _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)'

    _TESTS = [{
        'url': 'https://crowdbunker.com/@Milan_UHRIN',
        'playlist_mincount': 14,
        'info_dict': {
            'id': 'Milan_UHRIN',
        },
    }]

    def _entries(self, id):
        """Yield a ``url_result`` for each post of channel *id*, page by page.

        Each response's ``last`` value is sent as the ``after`` query
        parameter of the next request; iteration ends as soon as a response
        carries no ``last`` value.
        """
        posts_url = f'https://api.divulg.org/organization/{id}/posts'
        cursor = None

        for page_num in itertools.count():
            # The first request (no cursor yet) is sent without a query.
            query = {'after': cursor} if cursor else {}
            request_headers = {'accept': 'application/json, text/plain, */*'}
            page_json = self._download_json(
                posts_url, id, headers=request_headers,
                query=query, note=f'Downloading Page {page_num}')

            for post in page_json.get('items') or []:
                uid = post.get('uid')
                if uid:
                    # Items without a uid are skipped.
                    yield self.url_result(
                        'https://crowdbunker.com/v/%s' % uid, ie=CrowdBunkerIE.ie_key(), video_id=uid)

            cursor = page_json.get('last')
            if not cursor:
                return

    def _real_extract(self, url):
        """Return a playlist whose entries are produced lazily by ``_entries``."""
        channel_id = self._match_id(url)
        return self.playlist_result(self._entries(channel_id), playlist_id=channel_id)