jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/weibo.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / weibo.py
1 import itertools
2 import json
3 import random
4 import urllib.parse
5
6 from .common import InfoExtractor
7 from ..utils import (
8 int_or_none,
9 make_archive_id,
10 mimetype2ext,
11 parse_resolution,
12 str_or_none,
13 strip_jsonp,
14 traverse_obj,
15 url_or_none,
16 urlencode_postdata,
17 urljoin,
18 )
19
20
class WeiboBaseIE(InfoExtractor):
    """Shared helpers for the Weibo extractors.

    Provides the guest-cookie bootstrap, a JSON download wrapper that
    retries after that bootstrap, and the conversion of a Weibo status
    object into a yt-dlp info dict.
    """

    def _update_visitor_cookies(self, visitor_url, video_id):
        """Run the two-step passport.weibo.com visitor flow.

        First posts a browser fingerprint to ``genvisitor``, then calls
        ``visitor`` with the returned ``tid``. The responses are only used
        for their side effect on the cookie jar (presumably they set the
        guest cookies needed by the AJAX endpoints).

        @param visitor_url  URL sent as ``Referer`` on both requests
        @param video_id     id used for log messages
        """
        headers = {'Referer': visitor_url}
        chrome_ver = self._search_regex(
            r'Chrome/(\d+)', self.get_param('http_headers')['User-Agent'], 'user agent version', default='90')
        visitor_data = self._download_json(
            'https://passport.weibo.com/visitor/genvisitor', video_id,
            note='Generating first-visit guest request',
            headers=headers, transform_source=strip_jsonp,
            data=urlencode_postdata({
                'cb': 'gen_callback',
                'fp': json.dumps({
                    'os': '1',
                    'browser': f'Chrome{chrome_ver},0,0,0',
                    'fonts': 'undefined',
                    'screenInfo': '1920*1080*24',
                    'plugins': '',
                }, separators=(',', ':')),
            }))['data']

        # Coerce before formatting: ':03d' raises on a string or null value,
        # so fall back to 100 whenever the field is missing or unusable.
        confidence = int_or_none(visitor_data.get('confidence'), default=100)

        self._download_webpage(
            'https://passport.weibo.com/visitor/visitor', video_id,
            note='Running first-visit callback to get guest cookies',
            headers=headers, query={
                'a': 'incarnate',
                't': visitor_data['tid'],
                'w': 3 if visitor_data.get('new_tid') else 2,
                'c': f'{confidence:03d}',
                'gc': '',
                'cb': 'cross_domain',
                'from': 'weibo',
                '_rand': random.random(),
            })

    def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs):
        """Download and parse JSON, bootstrapping guest cookies if needed.

        If the request ends up on ``passport.weibo.com`` (i.e. it was
        redirected to the visitor flow), the guest cookies are generated and
        the original request is repeated once.

        Extra positional/keyword arguments are forwarded to the underlying
        webpage download calls. Returns the parsed JSON; with
        ``fatal=False`` a failed download yields ``False`` instead of
        raising.
        """
        response = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs)
        if response is False:
            # Non-fatal download failure: there is nothing to unpack or parse
            return False
        webpage, urlh = response

        if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com':
            self._update_visitor_cookies(urlh.url, video_id)
            webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs)
            if webpage is False:
                return False
        return self._parse_json(webpage, video_id, fatal=fatal)

    def _extract_formats(self, video_info):
        """Build the formats list from ``page_info.media_info`` of a status.

        Prefers the structured ``playback_list``; when that yields nothing,
        falls back to scanning the direct URL values of ``media_info``.
        """
        media_info = traverse_obj(video_info, ('page_info', 'media_info'))
        formats = traverse_obj(media_info, (
            'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', {
                'url': 'url',
                'format': ('quality_desc', {str}),
                'format_id': ('label', {str}),
                'ext': ('mime', {mimetype2ext}),
                'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}),
                'vcodec': ('video_codecs', {str}),
                'fps': ('fps', {int_or_none}),
                'width': ('width', {int_or_none}),
                'height': ('height', {int_or_none}),
                'filesize': ('size', {int_or_none}),
                'acodec': ('audio_codecs', {str}),
                'asr': ('audio_sample_rate', {int_or_none}),
                'audio_channels': ('audio_channels', {int_or_none}),
            }))
        if not formats:  # fallback, should be barely used
            # dict.fromkeys de-duplicates like set() but keeps a stable order
            for url in dict.fromkeys(traverse_obj(media_info, (..., {url_or_none}))):
                if 'label=' not in url:  # filter out non-video urls
                    continue
                format_id, resolution = self._search_regex(
                    r'label=(\w+)&template=(\d+x\d+)', url, 'format info',
                    group=(1, 2), default=(None, None))
                # Size/bitrate live in a separate list keyed by label; a
                # lookup is only meaningful when the label was recognised.
                details = traverse_obj(media_info, (
                    'video_details', lambda _, v: v['label'].startswith(format_id), {
                        'filesize': ('size', {int_or_none}),
                        'tbr': ('bitrate', {int_or_none}),
                    },
                ), get_all=False) if format_id else None
                formats.append({
                    'url': url,
                    'format_id': format_id,
                    **parse_resolution(resolution),
                    # No matching entry gives None, which cannot be unpacked
                    **(details or {}),
                })
        return formats

    def _parse_video_info(self, video_info, video_id=None):
        """Convert a Weibo status object into an info dict.

        @param video_info  status JSON as returned by the Weibo AJAX API
        @param video_id    fallback id, used only when the status itself
                           carries none of ``id`` / ``id_str`` / ``mid``
        """
        # Resolve the id up front so that the legacy archive id is derived
        # from the real status id rather than from the optional argument
        # (which callers may leave as None).
        video_id = traverse_obj(
            video_info, (('id', 'id_str', 'mid'), {str_or_none}), get_all=False) or video_id
        return {
            'id': video_id,
            'extractor_key': WeiboIE.ie_key(),
            'extractor': WeiboIE.IE_NAME,
            'formats': self._extract_formats(video_info),
            'http_headers': {'Referer': 'https://weibo.com/'},
            '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)] if video_id else [],
            **traverse_obj(video_info, {
                'display_id': ('mblogid', {str_or_none}),
                'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}),
                'description': ('text_raw', {str}),
                'duration': ('page_info', 'media_info', 'duration', {int_or_none}),
                'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}),
                'thumbnail': ('page_info', 'page_pic', {url_or_none}),
                'uploader': ('user', 'screen_name', {str}),
                'uploader_id': ('user', ('id', 'id_str'), {str_or_none}),
                'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}),
                'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}),
                'like_count': ('attitudes_count', {int_or_none}),
                'repost_count': ('reposts_count', {int_or_none}),
            }, get_all=False),
            'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None,
        }
123
124
class WeiboIE(WeiboBaseIE):
    """Extractor for a single Weibo status (desktop and mobile URLs)."""

    _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)'
    _TESTS = [
        {
            'url': 'https://weibo.com/7827771738/N4xlMvjhI',
            'info_dict': {
                'id': '4910815147462302',
                'ext': 'mp4',
                'display_id': 'N4xlMvjhI',
                'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】',
                'description': 'md5:e2637a7673980d68694ea7c43cf12a5f',
                'duration': 918,
                'timestamp': 1686312819,
                'upload_date': '20230609',
                'thumbnail': r're:https://.*\.jpg',
                'uploader': '睡前视频基地',
                'uploader_id': '7827771738',
                'uploader_url': 'https://weibo.com/u/7827771738',
                'view_count': int,
                'like_count': int,
                'repost_count': int,
                'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'],
            },
        },
        {
            'url': 'https://m.weibo.cn/status/4189191225395228',
            'info_dict': {
                'id': '4189191225395228',
                'ext': 'mp4',
                'display_id': 'FBqgOmDxO',
                'title': '柴犬柴犬的秒拍视频',
                'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f',
                'duration': 53,
                'timestamp': 1514264429,
                'upload_date': '20171226',
                'thumbnail': r're:https://.*\.jpg',
                'uploader': '柴犬柴犬',
                'uploader_id': '5926682210',
                'uploader_url': 'https://weibo.com/u/5926682210',
                'view_count': int,
                'like_count': int,
                'repost_count': int,
            },
        },
        {
            'url': 'https://weibo.com/0/4224132150961381',
            'note': 'no playback_list example',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Fetch the status JSON for the id in *url* and turn it into an info dict."""
        video_id = self._match_id(url)
        api_url = f'https://weibo.com/ajax/statuses/show?id={video_id}'
        status = self._weibo_download_json(api_url, video_id)
        return self._parse_video_info(status)
177
178
class WeiboVideoIE(WeiboBaseIE):
    """Extractor for ``weibo.com/tv/show/<oid>`` pages; delegates to WeiboIE."""

    _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)'
    _TESTS = [
        {
            'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow',
            'info_dict': {
                'id': '4797700463137878',
                'ext': 'mp4',
                'display_id': 'LEZDodaiW',
                'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了',
                'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ​​​',
                'duration': 76,
                'timestamp': 1659344278,
                'upload_date': '20220801',
                'thumbnail': r're:https://.*\.jpg',
                'uploader': '君子爱财陈平安',
                'uploader_id': '3905382233',
                'uploader_url': 'https://weibo.com/u/3905382233',
                'view_count': int,
                'like_count': int,
                'repost_count': int,
            },
        },
    ]

    def _real_extract(self, url):
        """Look up the status ``mid`` behind a tv/show id and hand off to WeiboIE."""
        video_id = self._match_id(url)

        # The component API expects the colon of the oid percent-encoded in
        # the query, while the POST payload carries the oid verbatim.
        escaped_id = video_id.replace(":", "%3A")
        payload = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode()

        response = self._weibo_download_json(
            f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{escaped_id}',
            video_id, headers={'Referer': url}, data=payload)
        mid = response['data']['Component_Play_Playinfo']['mid']
        return self.url_result(f'https://weibo.com/0/{mid}', WeiboIE)
210
211
class WeiboUserIE(WeiboBaseIE):
    """Playlist extractor for the entries listed on a Weibo user page."""

    _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://weibo.com/u/2066652961?tabtype=video',
            'info_dict': {
                'id': '2066652961',
                'title': '萧影殿下的视频',
                'description': '萧影殿下的全部视频',
                'uploader': '萧影殿下',
            },
            'playlist_mincount': 195,
        },
    ]

    def _fetch_page(self, uid, cursor=0, page=1):
        """Download one page of the user's waterfall listing and return its ``data`` object."""
        payload = self._weibo_download_json(
            'https://weibo.com/ajax/profile/getWaterFallContent',
            uid, note=f'Downloading videos page {page}',
            query={'uid': uid, 'cursor': cursor})
        return payload['data']

    def _entries(self, uid, first_page):
        """Yield an info dict per listed entry, following ``next_cursor`` until it runs out.

        *first_page* is the already-downloaded first page, so it is not
        requested a second time.
        """
        page_number = 1
        response = first_page
        while True:
            yield from map(self._parse_video_info, traverse_obj(response, ('list', ..., {dict})))

            next_cursor = response.get('next_cursor')
            numeric_cursor = int_or_none(next_cursor)
            # A missing, zero or negative cursor marks the last page
            if not numeric_cursor or numeric_cursor < 0:
                return

            page_number += 1
            response = self._fetch_page(uid, next_cursor, page_number)

    def _real_extract(self, url):
        """Build the playlist for the user id in *url*.

        Title, description and uploader are only set when a screen name
        can be read from the first page.
        """
        uid = self._match_id(url)
        first_page = self._fetch_page(uid)
        uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False)

        metainfo = {}
        if uploader:
            metainfo.update(
                title=f'{uploader}的视频',
                description=f'{uploader}的全部视频',
                uploader=uploader,
            )

        return self.playlist_result(self._entries(uid, first_page), uid, **metainfo)