]>
Commit | Line | Data |
---|---|---|
15b252df | 1 | import json |
42a1012c | 2 | import random |
69b03f84 | 3 | import itertools |
4 | import urllib.parse | |
447a5a71 | 5 | |
69b03f84 | 6 | from .common import InfoExtractor |
447a5a71 | 7 | from ..utils import ( |
69b03f84 | 8 | int_or_none, |
9 | make_archive_id, | |
10 | mimetype2ext, | |
11 | parse_resolution, | |
12 | str_or_none, | |
6a41a12d | 13 | strip_jsonp, |
69b03f84 | 14 | traverse_obj, |
15 | url_or_none, | |
5c97ec5f | 16 | urlencode_postdata, |
69b03f84 | 17 | urljoin, |
447a5a71 | 18 | ) |
29ac31af | 19 | |
95104372 | 20 | |
class WeiboBaseIE(InfoExtractor):
    """Shared helpers for the Weibo extractors.

    Provides the guest ("visitor") cookie handshake, a JSON download wrapper
    that retries after that handshake, and the translation of a Weibo status
    object into an info dict.
    """

    def _update_visitor_cookies(self, visitor_url, video_id):
        """Perform the two-request guest handshake on passport.weibo.com.

        visitor_url is sent as the Referer of both requests; video_id is only
        used for progress/error reporting.
        """
        headers = {'Referer': visitor_url}
        # Report the same Chrome major version as the configured User-Agent;
        # fall back to '90' when the User-Agent is not Chrome-like.
        chrome_ver = self._search_regex(
            r'Chrome/(\d+)', self.get_param('http_headers')['User-Agent'], 'user agent version', default='90')
        visitor_data = self._download_json(
            'https://passport.weibo.com/visitor/genvisitor', video_id,
            note='Generating first-visit guest request',
            headers=headers, transform_source=strip_jsonp,
            data=urlencode_postdata({
                'cb': 'gen_callback',
                # Browser fingerprint, serialized as compact JSON
                'fp': json.dumps({
                    'os': '1',
                    'browser': f'Chrome{chrome_ver},0,0,0',
                    'fonts': 'undefined',
                    'screenInfo': '1920*1080*24',
                    'plugins': '',
                }, separators=(',', ':'))}))['data']

        # The response body is discarded; the request is made for its side
        # effect (per the note below: obtaining guest cookies).
        self._download_webpage(
            'https://passport.weibo.com/visitor/visitor', video_id,
            note='Running first-visit callback to get guest cookies',
            headers=headers, query={
                'a': 'incarnate',
                't': visitor_data['tid'],
                'w': 3 if visitor_data.get('new_tid') else 2,
                'c': f'{visitor_data.get("confidence", 100):03d}',
                'gc': '',
                'cb': 'cross_domain',
                'from': 'weibo',
                '_rand': random.random(),
            })

    def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
        """Download and parse JSON from url, completing the guest handshake if needed.

        When the request ends up on passport.weibo.com, the visitor cookies
        are refreshed and the original request is issued once more.

        Returns the parsed JSON. With fatal=False a failed download is
        returned as-is (a falsy value) instead of raising.
        """
        response = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
        # NOTE(review): relies on the InfoExtractor convention that non-fatal
        # download failures return False rather than a (content, handle) tuple.
        if response is False:
            return False
        webpage, urlh = response
        if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
            self._update_visitor_cookies(urlh.url, video_id)
            webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
            if webpage is False:
                return False
        return self._parse_json(webpage, video_id, fatal=fatal)

    def _extract_formats(self, video_info):
        """Build the list of format dicts from a status object's media info.

        Formats are read from 'playback_list' when present; otherwise every
        URL-valued entry of the media info that carries a 'label=' query
        parameter is used.
        """
        media_info = traverse_obj(video_info, ('page_info', 'media_info'))
        formats = traverse_obj(media_info, (
            'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
                'url': 'url',
                'format': ('quality_desc', {str}),
                'format_id': ('label', {str}),
                'ext': ('mime', {mimetype2ext}),
                'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
                'vcodec': ('video_codecs', {str}),
                'fps': ('fps', {int_or_none}),
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
                'filesize': ('size', {int_or_none}),
                'acodec': ('audio_codecs', {str}),
                'asr': ('audio_sample_rate', {int_or_none}),
                'audio_channels': ('audio_channels', {int_or_none}),
            }))
        if formats:
            return formats

        # Fallback, should be barely used
        for url in set(traverse_obj(media_info, (..., {url_or_none}))):
            if 'label=' not in url:  # filter out non-video urls
                continue
            format_id, resolution = self._search_regex(
                r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
                group=(1, 2), default=(None, None))
            # No matching 'video_details' entry yields None, which cannot be
            # spread into the dict literal - hence the `or {}`.
            details = traverse_obj(media_info, (
                'video_details', lambda _, v: v['label'].startswith(format_id), {
                    # 'filesize' (not 'size') to agree with the primary path above
                    'filesize': ('size', {int_or_none}),
                    'tbr': ('bitrate', {int_or_none}),
                },
            ), get_all=False) or {}
            formats.append({
                'url': url,
                'format_id': format_id,
                **parse_resolution(resolution),
                **details,
            })
        return formats

    def _parse_video_info(self, video_info, video_id=None):
        """Convert a Weibo status object into an info dict.

        video_id is only a fallback: the id found in video_info ('id',
        'id_str' or 'mid', first one present) takes precedence.
        """
        # Resolve the id up front so that the legacy archive id below is built
        # from the real post id; callers usually do not pass video_id at all.
        video_id = traverse_obj(
            video_info, (('id', 'id_str', 'mid'), {str_or_none}), get_all=False) or video_id
        return {
            'id': video_id,
            'extractor_key': WeiboIE.ie_key(),
            'extractor': WeiboIE.IE_NAME,
            'formats': self._extract_formats(video_info),
            'http_headers': {'Referer': 'https://weibo.com/'},
            '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)] if video_id else None,
            # `or {}`: an all-empty result comes back as None
            **(traverse_obj(video_info, {
                'display_id': ('mblogid', {str_or_none}),
                'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
                'description': ('text_raw', {str}),
                'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
                'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
                'thumbnail': ('page_info', 'page_pic', {url_or_none}),
                'uploader': ('user', 'screen_name', {str}),
                'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
                'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
                'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
                'like_count': ('attitudes_count', {int_or_none}),
                'repost_count': ('reposts_count', {int_or_none}),
            }, get_all=False) or {}),
            'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
        }
123 | ||
124 | ||
class WeiboIE(WeiboBaseIE):
    # Matches single posts on both the mobile site (m.weibo.cn/status/<id>)
    # and the desktop site (weibo.com/<digits>/<id>)
    _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
    _TESTS = [{
        # Desktop URL addressed by the short alphanumeric post id
        'url': 'https://weibo.com/7827771738/N4xlMvjhI',
        'info_dict': {
            'id': '4910815147462302',
            'ext': 'mp4',
            'display_id': 'N4xlMvjhI',
            'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
            'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
            'duration': 918,
            'timestamp': 1686312819,
            'upload_date': '20230609',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '睡前视频基地',
            'uploader_id': '7827771738',
            'uploader_url': 'https://weibo.com/u/7827771738',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
            'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
        },
    }, {
        # Mobile URL addressed by the numeric post id
        'url': 'https://m.weibo.cn/status/4189191225395228',
        'info_dict': {
            'id': '4189191225395228',
            'ext': 'mp4',
            'display_id': 'FBqgOmDxO',
            'title': '柴犬柴犬的秒拍视频',
            'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
            'duration': 53,
            'timestamp': 1514264429,
            'upload_date': '20171226',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '柴犬柴犬',
            'uploader_id': '5926682210',
            'uploader_url': 'https://weibo.com/u/5926682210',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
        },
    }, {
        'url': 'https://weibo.com/0/4224132150961381',
        'note': 'no playback_list example',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the status JSON for the post in url and convert it to an info dict."""
        video_id = self._match_id(url)

        api_url = f'https://weibo.com/ajax/statuses/show?id={video_id}'
        status = self._weibo_download_json(api_url, video_id)
        return self._parse_video_info(status)
95104372 | 177 | |
447a5a71 | 178 | |
class WeiboVideoIE(WeiboBaseIE):
    # weibo.com/tv/show/<digits>:<digits> pages; these are resolved to the
    # underlying post and handed over to WeiboIE
    _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
    _TESTS = [{
        'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
        'info_dict': {
            'id': '4797700463137878',
            'ext': 'mp4',
            'display_id': 'LEZDodaiW',
            'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了',
            'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ',
            'duration': 76,
            'timestamp': 1659344278,
            'upload_date': '20220801',
            'thumbnail': r're:https://.*\.jpg',
            'uploader': '君子爱财陈平安',
            'uploader_id': '3905382233',
            'uploader_url': 'https://weibo.com/u/3905382233',
            'view_count': int,
            'like_count': int,
            'repost_count': int,
        },
    }]

    def _real_extract(self, url):
        """Look up the post ('mid') behind a tv/show page and delegate to WeiboIE."""
        video_id = self._match_id(url)

        # Request body: a form field `data` holding a small JSON document
        payload = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()
        # The id contains a colon, which is sent percent-encoded in the query
        escaped_id = video_id.replace(':', '%3A')
        response = self._weibo_download_json(
            f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{escaped_id}',
            video_id, headers={'Referer': url}, data=payload)

        play_info = response['data']['Component_Play_Playinfo']
        mid = play_info['mid']
        return self.url_result(f'https://weibo.com/0/{mid}', WeiboIE)
5eca00a2 | 210 | |
447a5a71 | 211 | |
class WeiboUserIE(WeiboBaseIE):
    # User profile pages (weibo.com/u/<uid>); extracted as a playlist
    _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://weibo.com/u/2066652961?tabtype=video',
        'info_dict': {
            'id': '2066652961',
            'title': '萧影殿下的视频',
            'description': '萧影殿下的全部视频',
            'uploader': '萧影殿下',
        },
        'playlist_mincount': 195,
    }]

    def _fetch_page(self, uid, cursor=0, page=1):
        """Download one page of the user's video list and return its 'data' member.

        cursor is sent to the API as the paging position; page is only used
        in the progress note.
        """
        payload = self._weibo_download_json(
            'https://weibo.com/ajax/profile/getWaterFallContent',
            uid, note=f'Downloading videos page {page}',
            query={'uid': uid, 'cursor': cursor})
        return payload['data']

    def _entries(self, uid, first_page):
        """Lazily yield an info dict for every video, following 'next_cursor'.

        first_page is the already downloaded first page; further pages are
        only fetched once all entries of the current one have been consumed.
        """
        page_num = 1
        response = first_page
        while True:
            for video_info in traverse_obj(response, ('list', ..., {dict})):
                yield self._parse_video_info(video_info)

            cursor = response.get('next_cursor')
            # A missing, non-numeric, zero or negative cursor ends the listing
            if (int_or_none(cursor) or -1) < 0:
                return

            page_num += 1
            response = self._fetch_page(uid, cursor, page_num)

    def _real_extract(self, url):
        """Return a playlist of all videos of the user named in url."""
        uid = self._match_id(url)
        first_page = self._fetch_page(uid)
        uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)

        # Playlist metadata is only available when the first page names an uploader
        metainfo = {}
        if uploader:
            metainfo['title'] = f'{uploader}的视频'
            metainfo['description'] = f'{uploader}的全部视频'
            metainfo['uploader'] = uploader

        return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)