]>
Commit | Line | Data |
---|---|---|
1 | # encoding: utf-8 | |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | import json | |
6 | import base64 | |
7 | import zlib | |
8 | import xml.etree.ElementTree | |
9 | ||
10 | from hashlib import sha1 | |
11 | from math import pow, sqrt, floor | |
12 | from .subtitles import SubtitlesInfoExtractor | |
13 | from ..utils import ( | |
14 | ExtractorError, | |
15 | compat_urllib_parse, | |
16 | compat_urllib_request, | |
17 | bytes_to_intlist, | |
18 | intlist_to_bytes, | |
19 | unified_strdate, | |
20 | clean_html, | |
21 | urlencode_postdata, | |
22 | ) | |
23 | from ..aes import ( | |
24 | aes_cbc_decrypt, | |
25 | inc, | |
26 | ) | |
27 | ||
28 | ||
class CrunchyrollIE(SubtitlesInfoExtractor):
    """Information extractor for crunchyroll.com media pages.

    Accepts URLs on the ``www`` and ``m`` hosts, logs in first when
    credentials are configured, queries the site's XML endpoint once per
    quality advertised on the page, and decrypts/converts the subtitle
    scripts linked from the page to SRT or ASS.
    """

    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
    _TEST = {
        'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
        #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
        'info_dict': {
            'id': '645513',
            'ext': 'flv',
            'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
            'description': 'md5:2d17137920c64f2f49981a7797d275ef',
            'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
            'uploader': 'Yomiuri Telecasting Corporation (YTV)',
            'upload_date': '20131013',
        },
        'params': {
            # rtmp
            'skip_download': True,
        },
    }

    # Page quality label -> (video_encode_quality, video_format) request values.
    _FORMAT_IDS = {
        '360': ('60', '106'),
        '480': ('61', '106'),
        '720': ('62', '106'),
        '1080': ('80', '108'),
    }

    def _login(self):
        """Submit the login form when a username is configured.

        Returns immediately (without any request) if no username is
        available from ``_get_login_info``.
        """
        (username, password) = self._get_login_info()
        if username is None:
            return
        self.report_login()
        login_url = 'https://www.crunchyroll.com/?a=formhandler'
        data = urlencode_postdata({
            'formname': 'RpcApiUser_Login',
            'name': username,
            'password': password,
        })
        login_request = compat_urllib_request.Request(login_url, data)
        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self._download_webpage(login_request, None, False, 'Wrong login info')

    def _real_initialize(self):
        """Perform the (optional) login before any extraction."""
        self._login()

    def _decrypt_subtitles(self, data, iv, id):
        """Decrypt and decompress one subtitle payload.

        data -- encrypted subtitle bytes
        iv   -- initialisation vector bytes
        id   -- subtitle id (anything ``int()`` accepts); the AES key is
                derived from it

        Returns the zlib-decompressed plaintext as bytes.
        """
        data = bytes_to_intlist(data)
        iv = bytes_to_intlist(iv)
        id = int(id)

        def obfuscate_key_aux(count, modulo, start):
            # Additive sequence: each new term is the sum of the previous two.
            output = list(start)
            for _ in range(count):
                output.append(output[-1] + output[-2])
            # cut off start values
            output = output[2:]
            return [x % modulo + 33 for x in output]

        def obfuscate_key(key):
            num1 = int(floor(pow(2, 25) * sqrt(6.9)))
            num2 = (num1 ^ key) << 5
            num3 = key ^ num1
            num4 = num3 ^ (num3 >> 3) ^ num2
            prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
            shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
            # Extend 160 Bit hash to 256 Bit
            return shaHash + [0] * 12

        key = obfuscate_key(id)
        decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
        return zlib.decompress(decrypted_data)

    def _convert_subtitles_to_srt(self, subtitles):
        """Convert a decrypted subtitle script (XML text) to SRT text.

        Events whose text is empty after HTML cleaning are dropped. Cues are
        numbered consecutively from 1 in output order, so dropped events do
        not leave gaps in the numbering.
        """
        events = re.findall(
            r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>',
            subtitles)
        cues = []
        for start, end, text in events:
            text = clean_html(text)
            text = text.replace('\\N', '\n')
            if not text:
                continue
            cues.append('%d\n%s --> %s\n%s\n\n' % (
                len(cues) + 1,
                start.replace('.', ','),
                end.replace('.', ','),
                text,
            ))
        return ''.join(cues)

    def _convert_subtitles_to_ass(self, subtitles):
        """Convert a decrypted subtitle script (XML text) to ASS text.

        Returns an empty string when the root element has no child elements.
        Raises ``KeyError`` if the root, a style or an event lacks one of the
        attributes that is copied into the output.
        """
        sub_root = xml.etree.ElementTree.fromstring(subtitles)
        # Explicit child count instead of the deprecated Element truth test.
        if len(sub_root) == 0:
            return ''

        def ass_bool(strvalue):
            # The script uses '1' for true; ASS style flags use '-1'.
            return '-1' if strvalue == '1' else '0'

        bool_fields = ('bold', 'italic', 'underline', 'strikeout')
        # Attribute order matches the "[V4+ Styles]" Format line below.
        style_fields = (
            'name', 'font_name', 'font_size',
            'primary_colour', 'secondary_colour', 'outline_colour', 'back_colour',
            'bold', 'italic', 'underline', 'strikeout',
            'scale_x', 'scale_y', 'spacing', 'angle',
            'border_style', 'outline', 'shadow', 'alignment',
            'margin_l', 'margin_r', 'margin_v', 'encoding',
        )
        # Attribute order matches the "[Events]" Format line (after Layer).
        event_fields = (
            'start', 'end', 'style', 'name',
            'margin_l', 'margin_r', 'margin_v', 'effect', 'text',
        )

        lines = [
            '[Script Info]',
            'Title: %s' % sub_root.attrib["title"],
            'ScriptType: v4.00+',
            'WrapStyle: %s' % sub_root.attrib["wrap_style"],
            'PlayResX: %s' % sub_root.attrib["play_res_x"],
            'PlayResY: %s' % sub_root.attrib["play_res_y"],
            'ScaledBorderAndShadow: yes',
            '',
            '[V4+ Styles]',
            'Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding',
        ]

        for style in sub_root.findall('./styles/style'):
            values = [
                ass_bool(style.attrib[field]) if field in bool_fields else style.attrib[field]
                for field in style_fields
            ]
            lines.append('Style: ' + ','.join(values))

        lines.extend([
            '',
            '[Events]',
            'Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text',
        ])

        for event in sub_root.findall('./events/event'):
            values = [event.attrib[field] for field in event_fields]
            lines.append('Dialogue: 0,' + ','.join(values))

        return '\n'.join(lines) + '\n'

    def _real_extract(self, url):
        """Extract metadata, formats and subtitles for one media URL.

        Returns the info dict, or ``None`` after listing subtitles when the
        ``listsubtitles`` option is set.

        Raises ``ExtractorError`` when the page carries a trailer notice or
        an error item in its messaging box.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('video_id')

        if mobj.group('prefix') == 'm':
            # The mobile page only serves to find the canonical desktop URL.
            mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
            webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
        else:
            webpage_url = 'http://www.' + mobj.group('url')

        webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
        note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')
        if note_m:
            raise ExtractorError(note_m)

        mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
        if mobj:
            msg = json.loads(mobj.group('msg'))
            if msg.get('type') == 'error':
                raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)

        video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
        video_title = re.sub(r' {2,}', ' ', video_title)
        # An empty description is reported as None.
        video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') or None
        video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
        if video_upload_date:
            video_upload_date = unified_strdate(video_upload_date)
        video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)

        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
        playerdata_req = compat_urllib_request.Request(playerdata_url)
        # POST bodies must be bytes: Python 3's urllib rejects text strings.
        playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}).encode('utf-8')
        playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
        playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')

        stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
        video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)

        formats = []
        for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):
            format_ids = self._FORMAT_IDS.get(fmt)
            if format_ids is None:
                # Quality without known request parameters: skip instead of
                # aborting the whole extraction with a KeyError.
                continue
            stream_quality, stream_format = format_ids
            video_format = fmt + 'p'
            streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
            # urlencode doesn't work!
            streamdata_body = (
                'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality
                + '&media%5Fid=' + stream_id
                + '&video%5Fformat=' + stream_format)
            # Encode to bytes (see above); Content-Length is the byte count.
            streamdata_req.data = streamdata_body.encode('utf-8')
            streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
            streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
            streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for ' + video_format)
            video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url')
            video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path')
            formats.append({
                'url': video_url,
                'play_path': video_play_path,
                'ext': 'flv',
                'format': video_format,
                'format_id': video_format,
            })

        subtitles = {}
        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
        for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
            sub_page = self._download_webpage(
                'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
                video_id, note='Downloading subtitles for ' + sub_name)
            script_id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
            iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
            data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
            # Skip scripts with any missing part; the searches are non-fatal.
            if not script_id or not iv or not data:
                continue
            script_id = int(script_id)
            iv = base64.b64decode(iv)
            data = base64.b64decode(data)

            subtitle = self._decrypt_subtitles(data, iv, script_id).decode('utf-8')
            lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
            if not lang_code:
                continue
            if sub_format == 'ass':
                subtitles[lang_code] = self._convert_subtitles_to_ass(subtitle)
            else:
                subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, subtitles)
            return

        return {
            'id': video_id,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumbnail,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'subtitles': subtitles,
            'formats': formats,
        }