]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/douyutv.py
[extractor/vk] VKPlay, VKPlayLive: Add extractors (#7358)
[yt-dlp.git] / yt_dlp / extractor / douyutv.py
CommitLineData
6f4e4132
YCH
1import time
2import hashlib
7274f3d0 3import re
f14c2333 4import urllib
6f4e4132 5
a172d962 6from .common import InfoExtractor
3b4b82d4
YCH
7from ..utils import (
8 ExtractorError,
9 unescapeHTML,
7274f3d0
YCH
10 unified_strdate,
11 urljoin,
3b4b82d4 12)
a172d962 13
a172d962 14
class DouyuTVIE(InfoExtractor):
    """Extractor for live rooms on douyu.com / douyutv.com."""

    IE_DESC = '斗鱼'
    # Matches plain room URLs (numeric id or alias), nested paths such as
    # /t/<name>, and topic pages of the form /topic/<name>?rid=<room id>.
    _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)'
    _TESTS = [{
        'url': 'http://www.douyutv.com/iseven',
        'info_dict': {
            'id': '17732',
            'display_id': 'iseven',
            'ext': 'flv',
            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': r're:.*m7show@163\.com.*',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': '7师傅',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.douyutv.com/85982',
        'info_dict': {
            'id': '85982',
            'display_id': '85982',
            'ext': 'flv',
            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': 'douyu小漠',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Room not found',
    }, {
        'url': 'http://www.douyutv.com/17732',
        'info_dict': {
            'id': '17732',
            'display_id': '17732',
            'ext': 'flv',
            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': r're:.*m7show@163\.com.*',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': '7师傅',
            'is_live': True,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.douyu.com/topic/ydxc?rid=6560603',
        'info_dict': {
            'id': '6560603',
            'display_id': '6560603',
            'ext': 'flv',
            'title': 're:^阿余:新年快乐恭喜发财! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
            'description': 're:.*直播时间.*',
            'thumbnail': r're:^https?://.*\.png',
            'uploader': '阿涛皎月Carry',
            'live_status': 'is_live',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.douyu.com/xiaocang',
        'only_matching': True,
    }, {
        # \"room_id\"
        'url': 'http://www.douyu.com/t/lpl',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve the room id, query the room API and build the info dict.

        The id captured from the URL is kept as ``display_id``. When it is
        not purely numeric, the page is downloaded and the numeric room id
        is read from its embedded ``"room_id"`` field.

        Raises ExtractorError (expected) when the API reports the room as
        offline.
        """
        video_id = self._match_id(url)

        if video_id.isdigit():
            room_id = video_id
        else:
            # Alias / vanity URL: look the numeric id up in the page source
            page = self._download_webpage(url, video_id)
            room_id = self._html_search_regex(
                r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')

        # Grab metadata from API
        params = {
            'aid': 'wp',
            'client_sys': 'wp',
            'time': int(time.time()),
        }
        # 'auth' is the MD5 of "<path>?<query><fixed suffix>". The path is
        # built from room_id so that it matches the path of the request
        # below; for alias URLs video_id is the alias, not the numeric id.
        params['auth'] = hashlib.md5(
            f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest()
        room = self._download_json(
            f'http://www.douyutv.com/api/v1/room/{room_id}', video_id,
            note='Downloading room info', query=params)['data']

        # 1 = live, 2 = offline
        if room.get('show_status') == '2':
            raise ExtractorError('Live stream is offline', expected=True)

        # Keep only the "live/..." part of the reported HLS URL and resolve
        # it against the hls3-akm host
        video_url = urljoin(
            'https://hls3-akm.douyucdn.cn/',
            self._search_regex(r'(live/.*)', room['hls_url'], 'URL'))
        formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id)

        return {
            'id': room_id,
            'display_id': video_id,
            'title': unescapeHTML(room['room_name']),
            'description': room.get('show_details'),
            'thumbnail': room.get('room_src'),
            'uploader': room.get('nickname'),
            'is_live': True,
            'subtitles': subs,
            'formats': formats,
        }
7274f3d0
YCH
133
134
class DouyuShowIE(InfoExtractor):
    """Extractor for recorded shows under v.douyu.com/show/ (and vmobile)."""

    _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'

    _TESTS = [{
        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
        'md5': '0c2cfd068ee2afe657801269b2d86214',
        'info_dict': {
            'id': 'rjNBdvnVXNzvE2yw',
            'ext': 'mp4',
            'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
            'duration': 7150.08,
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': '陈一发儿',
            'uploader_id': 'XrZwYelr5wbK',
            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
            'upload_date': '20170402',
        },
    }, {
        'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a recorded show from its page and the getInfo endpoint.

        Title, duration and thumbnail come from the ``$ROOM`` JSON embedded
        in the page; the m3u8 URL comes from ``/video/getInfo``; upload date
        and uploader details are scraped from the page HTML.

        Raises ExtractorError when no video info was obtained after five
        attempts.
        """
        # Rewrite vmobile.* to v.*; the rewritten URL is the one downloaded
        # and sent as Referer below
        url = url.replace('vmobile.', 'v.')
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        room_json = self._search_regex(
            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info')
        room_info = self._parse_json(room_json, video_id)

        # Sometimes Douyu rejects our request. Let's try it more times
        video_info = None
        for _attempt in range(5):
            try:
                video_info = self._download_json(
                    'https://vmobile.douyu.com/video/getInfo', video_id,
                    query={'vid': video_id},
                    headers={
                        'Referer': url,
                        'x-requested-with': 'XMLHttpRequest',
                    })
            except ExtractorError:
                # Wait a second before the next attempt
                self._sleep(1, video_id)
            else:
                break

        if not video_info:
            raise ExtractorError('Can\'t fetch video info')

        formats = self._extract_m3u8_formats(
            video_info['data']['video_url'], video_id,
            entry_protocol='m3u8_native', ext='mp4')

        # Non-fatal: a missing date is passed on to unified_strdate as-is
        raw_upload_date = self._html_search_regex(
            r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
            'upload date', fatal=False)
        upload_date = unified_strdate(raw_upload_date)

        # The author link carries the uploader id, the following <strong>
        # title attribute the display name
        author_match = re.search(
            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
            webpage)
        if author_match:
            uploader_id, uploader = author_match.groups()
            uploader_url = urljoin(url, '/author/' + uploader_id)
        else:
            uploader = uploader_id = uploader_url = None

        return {
            'id': video_id,
            'title': room_info['name'],
            'formats': formats,
            'duration': room_info.get('duration'),
            'thumbnail': room_info.get('pic'),
            'upload_date': upload_date,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        }