]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/weibo.py
[ie/weibo] Fix extractor and support user extraction (#7657)
[yt-dlp.git] / yt_dlp / extractor / weibo.py
CommitLineData
42a1012c 1import random
69b03f84 2import itertools
3import urllib.parse
447a5a71 4
69b03f84 5from .common import InfoExtractor
447a5a71 6from ..utils import (
69b03f84 7 int_or_none,
8 make_archive_id,
9 mimetype2ext,
10 parse_resolution,
11 str_or_none,
6a41a12d 12 strip_jsonp,
69b03f84 13 traverse_obj,
14 url_or_none,
5c97ec5f 15 urlencode_postdata,
69b03f84 16 urljoin,
447a5a71 17)
29ac31af 18
95104372 19
class WeiboBaseIE(InfoExtractor):
    """Shared helpers for the Weibo extractors.

    Provides the guest-cookie handshake, a JSON download wrapper that
    retries after that handshake, and the parsers that turn a Weibo status
    object into a yt-dlp info dict.
    """

    def _update_visitor_cookies(self, video_id):
        """Perform the two-request first-visit handshake on passport.weibo.com.

        The first request generates a visitor record; its ``data.tid`` and
        ``data.confidence`` values are fed into the second ("incarnate")
        request, which per its progress note is what hands out guest cookies.

        Raises ``KeyError`` if the first response lacks ``data.tid`` or
        ``data.confidence``.
        """
        visitor_data = self._download_json(
            'https://passport.weibo.com/visitor/genvisitor', video_id,
            note='Generating first-visit guest request',
            transform_source=strip_jsonp,
            data=urlencode_postdata({
                'cb': 'gen_callback',
                'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}',
            }))

        self._download_webpage(
            'https://passport.weibo.com/visitor/visitor', video_id,
            note='Running first-visit callback to get guest cookies',
            query={
                'a': 'incarnate',
                't': visitor_data['data']['tid'],
                'w': 2,
                # Zero-padded to three digits, e.g. 95 -> '095'
                'c': '%03d' % visitor_data['data']['confidence'],
                'cb': 'cross_domain',
                'from': 'weibo',
                '_rand': random.random(),
            })

    def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
        """Download ``url`` and parse it as JSON, redoing the request once if
        it ended up on passport.weibo.com.

        Extra positional/keyword arguments are forwarded unchanged to the
        underlying download helpers.

        Returns the parsed JSON, or ``False`` when ``fatal`` is false and a
        download did not succeed.
        """
        response = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
        # NOTE(review): with fatal=False the download helpers are expected to
        # return False instead of a (content, handle) tuple; unpacking that
        # blindly would raise TypeError, so bail out explicitly.
        if response is False:
            return False
        webpage, urlh = response

        # Landing on the passport host means the request was redirected to the
        # visitor check: obtain guest cookies, then repeat the request.
        if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
            self._update_visitor_cookies(video_id)
            webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
            if webpage is False:
                return False
        return self._parse_json(webpage, video_id, fatal=fatal)

    def _extract_formats(self, video_info):
        """Build the formats list from ``video_info['page_info']['media_info']``.

        Entries of ``playback_list`` that carry a valid ``play_info.url`` are
        preferred. If there are none, every URL-valued field of ``media_info``
        containing ``label=`` is used instead, enriched with the matching
        ``video_details`` entry when one exists.
        """
        media_info = traverse_obj(video_info, ('page_info', 'media_info'))
        formats = traverse_obj(media_info, (
            'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
                'url': 'url',
                'format': ('quality_desc', {str}),
                'format_id': ('label', {str}),
                'ext': ('mime', {mimetype2ext}),
                'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),  # treat 0 as unknown
                'vcodec': ('video_codecs', {str}),
                'fps': ('fps', {int_or_none}),
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
                'filesize': ('size', {int_or_none}),
                'acodec': ('audio_codecs', {str}),
                'asr': ('audio_sample_rate', {int_or_none}),
                'audio_channels': ('audio_channels', {int_or_none}),
            }))
        if not formats:  # fallback, should be barely used
            for url in set(traverse_obj(media_info, (..., {url_or_none}))):
                if 'label=' not in url:  # filter out non-video urls
                    continue
                format_id, resolution = self._search_regex(
                    r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
                    group=(1, 2), default=(None, None))
                # With get_all=False the lookup yields None when nothing
                # matches; fall back to {} so the ** expansion below is safe.
                details = traverse_obj(media_info, (
                    'video_details', lambda _, v: v['label'].startswith(format_id), {
                        # Same field name as the primary path uses for the size
                        'filesize': ('size', {int_or_none}),
                        'tbr': ('bitrate', {int_or_none}),
                    }), get_all=False) or {}
                formats.append({
                    'url': url,
                    'format_id': format_id,
                    **parse_resolution(resolution),
                    **details,
                })
        return formats

    def _parse_video_info(self, video_info, video_id=None):
        """Convert a Weibo status object into an info dict.

        ``video_id`` is only a fallback: the ID is taken from the first of
        ``id`` / ``id_str`` / ``mid`` present in ``video_info``, so callers
        that do not know the ID still get a usable result.
        """
        video_id = traverse_obj(
            video_info, (('id', 'id_str', 'mid'), {str_or_none}), get_all=False) or video_id
        return {
            'id': video_id,
            'extractor_key': WeiboIE.ie_key(),
            'extractor': WeiboIE.IE_NAME,
            'formats': self._extract_formats(video_info),
            'http_headers': {'Referer': 'https://weibo.com/'},
            # Use the resolved ID and never emit an archive entry built from None
            '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)] if video_id else None,
            **traverse_obj(video_info, {
                'display_id': ('mblogid', {str_or_none}),
                'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
                'description': ('text_raw', {str}),
                'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
                'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
                'thumbnail': ('page_info', 'page_pic', {url_or_none}),
                'uploader': ('user', 'screen_name', {str}),
                'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
                'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
                'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
                'like_count': ('attitudes_count', {int_or_none}),
                'repost_count': ('reposts_count', {int_or_none}),
            }, get_all=False),
            'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
        }
113
114
class WeiboIE(WeiboBaseIE):
    """Extractor for a single Weibo status (weibo.com/<uid>/<id> or m.weibo.cn/status/<id>)."""

    _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
    _TESTS = [{
        'url': 'https://weibo.com/7827771738/N4xlMvjhI',
        'info_dict': {
            'id': '4910815147462302',
            'ext': 'mp4',
            'display_id': 'N4xlMvjhI',
            'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
            'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
            'duration': 918,
            'timestamp': 1686312819,
            'upload_date': '20230609',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '睡前视频基地',
            'uploader_id': '7827771738',
            'uploader_url': 'https://weibo.com/u/7827771738',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
        },
    }, {
        'url': 'https://m.weibo.cn/status/4189191225395228',
        'info_dict': {
            'id': '4189191225395228',
            'ext': 'mp4',
            'display_id': 'FBqgOmDxO',
            'title': '柴犬柴犬的秒拍视频',
            'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
            'duration': 53,
            'timestamp': 1514264429,
            'upload_date': '20171226',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '柴犬柴犬',
            'uploader_id': '5926682210',
            'uploader_url': 'https://weibo.com/u/5926682210',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
        }
    }, {
        'url': 'https://weibo.com/0/4224132150961381',
        'note': 'no playback_list example',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the status JSON for the ID in ``url`` and parse it into an info dict."""
        status_id = self._match_id(url)
        status_url = f'https://weibo.com/ajax/statuses/show?id={status_id}'
        status = self._weibo_download_json(status_url, status_id)
        return self._parse_video_info(status)
95104372 167
447a5a71 168
class WeiboVideoIE(WeiboBaseIE):
    """Extractor for weibo.com/tv/show/<oid> pages; resolves the page to its status and defers to WeiboIE."""

    _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
    _TESTS = [{
        'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
        'info_dict': {
            'id': '4797700463137878',
            'ext': 'mp4',
            'display_id': 'LEZDodaiW',
            'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了',
            'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ​​​',
            'duration': 76,
            'timestamp': 1659344278,
            'upload_date': '20220801',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '君子爱财陈平安',
            'uploader_id': '3905382233',
            'uploader_url': 'https://weibo.com/u/3905382233',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
        }
    }]

    def _real_extract(self, url):
        """Look up the status ``mid`` behind a tv/show page and hand it to WeiboIE.

        Raises ``KeyError`` if the component response lacks
        ``data.Component_Play_Playinfo`` or its ``mid``.
        """
        oid = self._match_id(url)

        # The component API takes the object id both in the POST body and,
        # with the colon percent-encoded, in the `page` query value.
        payload = f'data={{"Component_Play_Playinfo":{{"oid":"{oid}"}}}}'.encode()
        api_url = f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{oid.replace(":", "%3A")}'

        response = self._weibo_download_json(
            api_url, oid, headers={'Referer': url}, data=payload)
        play_info = response['data']['Component_Play_Playinfo']
        return self.url_result(f'https://weibo.com/0/{play_info["mid"]}', WeiboIE)
5eca00a2 200
447a5a71 201
class WeiboUserIE(WeiboBaseIE):
    """Playlist extractor for the videos of a Weibo user (weibo.com/u/<uid>)."""

    _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://weibo.com/u/2066652961?tabtype=video',
        'info_dict': {
            'id': '2066652961',
            'title': '萧影殿下的视频',
            'description': '萧影殿下的全部视频',
            'uploader': '萧影殿下',
        },
        'playlist_mincount': 195,
    }]

    def _fetch_page(self, uid, cursor=0, page=1):
        """Return the ``data`` object of one getWaterFallContent page.

        ``page`` is used only for the progress note; ``cursor`` selects the page.
        """
        payload = self._weibo_download_json(
            'https://weibo.com/ajax/profile/getWaterFallContent', uid,
            note=f'Downloading videos page {page}',
            query={'uid': uid, 'cursor': cursor})
        return payload['data']

    def _entries(self, uid, first_page):
        """Yield an info dict per video, following ``next_cursor`` from page to page.

        ``first_page`` is the already-downloaded first page. Iteration stops
        once ``next_cursor`` is missing, zero, non-numeric or negative.
        """
        page_data, page_num = first_page, 1
        while True:
            for item in traverse_obj(page_data, ('list', ..., {dict})):
                yield self._parse_video_info(item)

            next_cursor = page_data.get('next_cursor')
            if (int_or_none(next_cursor) or -1) < 0:
                return

            page_num += 1
            page_data = self._fetch_page(uid, next_cursor, page_num)

    def _real_extract(self, url):
        """Build the playlist; title/description/uploader are set only when a screen name is found."""
        uid = self._match_id(url)
        first_page = self._fetch_page(uid)

        # First screen name found among the listed items, if any
        uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)
        metainfo = {}
        if uploader:
            metainfo = {
                'title': f'{uploader}的视频',
                'description': f'{uploader}的全部视频',
                'uploader': uploader,
            }

        return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)