yt_dlp/extractor/crunchyroll.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import re
   6 import json
   7 import zlib
   8
   9 from hashlib import sha1
  10 from math import pow, sqrt, floor
  11 from .common import InfoExtractor
  12 from .vrv import VRVBaseIE
  13 from ..compat import (
  14     compat_b64decode,
  15     compat_etree_Element,
  16     compat_etree_fromstring,
  17     compat_str,
  18     compat_urllib_parse_urlencode,
  19     compat_urllib_request,
  20     compat_urlparse,
  21 )
  22 from ..utils import (
  23     ExtractorError,
  24     bytes_to_intlist,
  25     extract_attributes,
  26     float_or_none,
  27     format_field,
  28     intlist_to_bytes,
  29     int_or_none,
  30     join_nonempty,
  31     lowercase_escape,
  32     merge_dicts,
  33     qualities,
  34     remove_end,
  35     sanitized_Request,
  36     traverse_obj,
  37     try_get,
  38     xpath_text,
  39 )
  40 from ..aes import (
  41     aes_cbc_decrypt,
  42 )
  43
  44
  45 class CrunchyrollBaseIE(InfoExtractor):
  46     _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
  47     _API_BASE = 'https://api.crunchyroll.com'
  48     _NETRC_MACHINE = 'crunchyroll'
  49
  50     def _call_rpc_api(self, method, video_id, note=None, data=None):
  51         data = data or {}
  52         data['req'] = 'RpcApi' + method
  53         data = compat_urllib_parse_urlencode(data).encode('utf-8')
  54         return self._download_xml(
  55             'https://www.crunchyroll.com/xml/',
  56             video_id, note, fatal=False, data=data, headers={
  57                 'Content-Type': 'application/x-www-form-urlencoded',
  58             })
  59
  60     def _perform_login(self, username, password):
  61         if self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  62             return
  63
  64         upsell_response = self._download_json(
  65             f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
  66             query={
  67                 'sess_id': 1,
  68                 'device_id': 'whatvalueshouldbeforweb',
  69                 'device_type': 'com.crunchyroll.static',
  70                 'access_token': 'giKq5eY27ny3cqz',
  71                 'referer': self._LOGIN_URL
  72             })
  73         if upsell_response['code'] != 'ok':
  74             raise ExtractorError('Could not get session id')
  75         session_id = upsell_response['data']['session_id']
  76
  77         login_response = self._download_json(
  78             f'{self._API_BASE}/login.1.json', None, 'Logging in',
  79             data=compat_urllib_parse_urlencode({
  80                 'account': username,
  81                 'password': password,
  82                 'session_id': session_id
  83             }).encode('ascii'))
  84         if login_response['code'] != 'ok':
  85             raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
  86         if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  87             raise ExtractorError('Login succeeded but did not set etp_rt cookie')
  88
  89     # Beta-specific, but needed for redirects
  90     def _get_beta_embedded_json(self, webpage, display_id):
  91         initial_state = self._parse_json(self._search_regex(
  92             r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
  93         app_config = self._parse_json(self._search_regex(
  94             r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
  95         return initial_state, app_config
  96
  97     def _redirect_to_beta(self, webpage, iekey, video_id):
  98         if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  99             raise ExtractorError('Received a beta page from non-beta url when not logged in.')
 100         initial_state, app_config = self._get_beta_embedded_json(webpage, video_id)
 101         url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname']
 102         self.to_screen(f'{video_id}: Redirected to beta site - {url}')
 103         return self.url_result(f'{url}', iekey, video_id)
 104
 105     @staticmethod
 106     def _add_skip_wall(url):
 107         parsed_url = compat_urlparse.urlparse(url)
 108         qs = compat_urlparse.parse_qs(parsed_url.query)
 109         # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message:
 110         # > This content may be inappropriate for some people.
 111         # > Are you sure you want to continue?
 112         # since it's not disabled by default in crunchyroll account's settings.
 113         # See https://github.com/ytdl-org/youtube-dl/issues/7202.
 114         qs['skip_wall'] = ['1']
 115         return compat_urlparse.urlunparse(
 116             parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
 117
 118
class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE):
    """Extractor for individual Crunchyroll episode pages whose URL carries a numeric media id."""

    IE_NAME = 'crunchyroll'
    # Named groups: ``prefix`` (www or m), ``url`` (host and path up to and
    # including the numeric id, without the scheme) and ``id`` (the numeric media id).
    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)'
    _TESTS = [{
        'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
        'info_dict': {
            'id': '645513',
            'ext': 'mp4',
            'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
            'description': 'md5:2d17137920c64f2f49981a7797d275ef',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Yomiuri Telecasting Corporation (YTV)',
            'upload_date': '20131013',
            'url': 're:(?!.*&amp)',
        },
        'params': {
            # rtmp
            'skip_download': True,
        },
        'skip': 'Video gone',
    }, {
        'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
        'info_dict': {
            'id': '589804',
            'ext': 'flv',
            'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
            'description': 'md5:2fbc01f90b87e8e9137296f37b461c12',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Danny Choo Network',
            'upload_date': '20120213',
        },
        'params': {
            # rtmp
            'skip_download': True,
        },
        'skip': 'Video gone',
    }, {
        'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409',
        'info_dict': {
            'id': '702409',
            'ext': 'mp4',
            'title': compat_str,
            'description': compat_str,
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Re:Zero Partners',
            'timestamp': 1462098900,
            'upload_date': '20160501',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589',
        'info_dict': {
            'id': '727589',
            'ext': 'mp4',
            'title': compat_str,
            'description': compat_str,
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'Kadokawa Pictures Inc.',
            'timestamp': 1484130900,
            'upload_date': '20170111',
            'series': compat_str,
            'season': "KONOSUBA -God's blessing on this wonderful world! 2",
            'season_number': 2,
            'episode': 'Give Me Deliverance From This Judicial Injustice!',
            'episode_number': 1,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
        'only_matching': True,
    }, {
        # geo-restricted (US), 18+ maturity wall, non-premium available
        'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
        'only_matching': True,
    }, {
        # A description with double quotes
        'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080',
        'info_dict': {
            'id': '535080',
            'ext': 'mp4',
            'title': compat_str,
            'description': compat_str,
            'uploader': 'Marvelous AQL Inc.',
            'timestamp': 1255512600,
            'upload_date': '20091014',
        },
        'params': {
            # Just test metadata extraction
            'skip_download': True,
        },
    }, {
        # make sure we can extract an uploader name that's not a link
        'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899',
        'info_dict': {
            'id': '606899',
            'ext': 'mp4',
            'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors',
            'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"',
            'uploader': 'Geneon Entertainment',
            'upload_date': '20120717',
        },
        'params': {
            # just test metadata extraction
            'skip_download': True,
        },
        'skip': 'Video gone',
    }, {
        # A video with a vastly different season name compared to the series name
        'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532',
        'info_dict': {
            'id': '590532',
            'ext': 'mp4',
            'title': compat_str,
            'description': compat_str,
            'uploader': 'TV TOKYO',
            'timestamp': 1330956000,
            'upload_date': '20120305',
            'series': 'Nyarko-san: Another Crawling Chaos',
            'season': 'Haiyoru! Nyaruani (ONA)',
        },
        'params': {
            # Just test metadata extraction
            'skip_download': True,
        },
    }, {
        'url': 'http://www.crunchyroll.com/media-723735',
        'only_matching': True,
    }, {
        'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921',
        'only_matching': True,
    }]

    # Maps a resolution (string, without the trailing "p") to the
    # (stream quality, stream format) pair that _real_extract sends to the RPC API.
    _FORMAT_IDS = {
        '360': ('60', '106'),
        '480': ('61', '106'),
        '720': ('62', '106'),
        '1080': ('80', '108'),
    }
 263
 264     def _download_webpage(self, url_or_request, *args, **kwargs):
 265         request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
 266                    else sanitized_Request(url_or_request))
 267         # Accept-Language must be set explicitly to accept any language to avoid issues
 268         # similar to https://github.com/ytdl-org/youtube-dl/issues/6797.
 269         # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction
 270         # should be imposed or not (from what I can see it just takes the first language
 271         # ignoring the priority and requires it to correspond the IP). By the way this causes
 272         # Crunchyroll to not work in georestriction cases in some browsers that don't place
 273         # the locale lang first in header. However allowing any language seems to workaround the issue.
 274         request.add_header('Accept-Language', '*')
 275         return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs)
 276
 277     def _decrypt_subtitles(self, data, iv, id):
 278         data = bytes_to_intlist(compat_b64decode(data))
 279         iv = bytes_to_intlist(compat_b64decode(iv))
 280         id = int(id)
 281
 282         def obfuscate_key_aux(count, modulo, start):
 283             output = list(start)
 284             for _ in range(count):
 285                 output.append(output[-1] + output[-2])
 286             # cut off start values
 287             output = output[2:]
 288             output = list(map(lambda x: x % modulo + 33, output))
 289             return output
 290
 291         def obfuscate_key(key):
 292             num1 = int(floor(pow(2, 25) * sqrt(6.9)))
 293             num2 = (num1 ^ key) << 5
 294             num3 = key ^ num1
 295             num4 = num3 ^ (num3 >> 3) ^ num2
 296             prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
 297             shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
 298             # Extend 160 Bit hash to 256 Bit
 299             return shaHash + [0] * 12
 300
 301         key = obfuscate_key(id)
 302
 303         decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
 304         return zlib.decompress(decrypted_data)
 305
 306     def _convert_subtitles_to_srt(self, sub_root):
 307         output = ''
 308
 309         for i, event in enumerate(sub_root.findall('./events/event'), 1):
 310             start = event.attrib['start'].replace('.', ',')
 311             end = event.attrib['end'].replace('.', ',')
 312             text = event.attrib['text'].replace('\\N', '\n')
 313             output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
 314         return output
 315
 316     def _convert_subtitles_to_ass(self, sub_root):
 317         output = ''
 318
 319         def ass_bool(strvalue):
 320             assvalue = '0'
 321             if strvalue == '1':
 322                 assvalue = '-1'
 323             return assvalue
 324
 325         output = '[Script Info]\n'
 326         output += 'Title: %s\n' % sub_root.attrib['title']
 327         output += 'ScriptType: v4.00+\n'
 328         output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style']
 329         output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x']
 330         output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y']
 331         output += """
 332 [V4+ Styles]
 333 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
 334 """
 335         for style in sub_root.findall('./styles/style'):
 336             output += 'Style: ' + style.attrib['name']
 337             output += ',' + style.attrib['font_name']
 338             output += ',' + style.attrib['font_size']
 339             output += ',' + style.attrib['primary_colour']
 340             output += ',' + style.attrib['secondary_colour']
 341             output += ',' + style.attrib['outline_colour']
 342             output += ',' + style.attrib['back_colour']
 343             output += ',' + ass_bool(style.attrib['bold'])
 344             output += ',' + ass_bool(style.attrib['italic'])
 345             output += ',' + ass_bool(style.attrib['underline'])
 346             output += ',' + ass_bool(style.attrib['strikeout'])
 347             output += ',' + style.attrib['scale_x']
 348             output += ',' + style.attrib['scale_y']
 349             output += ',' + style.attrib['spacing']
 350             output += ',' + style.attrib['angle']
 351             output += ',' + style.attrib['border_style']
 352             output += ',' + style.attrib['outline']
 353             output += ',' + style.attrib['shadow']
 354             output += ',' + style.attrib['alignment']
 355             output += ',' + style.attrib['margin_l']
 356             output += ',' + style.attrib['margin_r']
 357             output += ',' + style.attrib['margin_v']
 358             output += ',' + style.attrib['encoding']
 359             output += '\n'
 360
 361         output += """
 362 [Events]
 363 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 364 """
 365         for event in sub_root.findall('./events/event'):
 366             output += 'Dialogue: 0'
 367             output += ',' + event.attrib['start']
 368             output += ',' + event.attrib['end']
 369             output += ',' + event.attrib['style']
 370             output += ',' + event.attrib['name']
 371             output += ',' + event.attrib['margin_l']
 372             output += ',' + event.attrib['margin_r']
 373             output += ',' + event.attrib['margin_v']
 374             output += ',' + event.attrib['effect']
 375             output += ',' + event.attrib['text']
 376             output += '\n'
 377
 378         return output
 379
 380     def _extract_subtitles(self, subtitle):
 381         sub_root = compat_etree_fromstring(subtitle)
 382         return [{
 383             'ext': 'srt',
 384             'data': self._convert_subtitles_to_srt(sub_root),
 385         }, {
 386             'ext': 'ass',
 387             'data': self._convert_subtitles_to_ass(sub_root),
 388         }]
 389
 390     def _get_subtitles(self, video_id, webpage):
 391         subtitles = {}
 392         for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
 393             sub_doc = self._call_rpc_api(
 394                 'Subtitle_GetXml', video_id,
 395                 'Downloading subtitles for ' + sub_name, data={
 396                     'subtitle_script_id': sub_id,
 397                 })
 398             if not isinstance(sub_doc, compat_etree_Element):
 399                 continue
 400             sid = sub_doc.get('id')
 401             iv = xpath_text(sub_doc, 'iv', 'subtitle iv')
 402             data = xpath_text(sub_doc, 'data', 'subtitle data')
 403             if not sid or not iv or not data:
 404                 continue
 405             subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8')
 406             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
 407             if not lang_code:
 408                 continue
 409             subtitles[lang_code] = self._extract_subtitles(subtitle)
 410         return subtitles
 411
 412     def _real_extract(self, url):
 413         mobj = self._match_valid_url(url)
 414         video_id = mobj.group('id')
 415
 416         if mobj.group('prefix') == 'm':
 417             mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
 418             webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
 419         else:
 420             webpage_url = 'http://www.' + mobj.group('url')
 421
 422         webpage = self._download_webpage(
 423             self._add_skip_wall(webpage_url), video_id,
 424             headers=self.geo_verification_headers())
 425         if re.search(r'<div id="preload-data">', webpage):
 426             return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id)
 427         note_m = self._html_search_regex(
 428             r'<div class="showmedia-trailer-notice">(.+?)</div>',
 429             webpage, 'trailer-notice', default='')
 430         if note_m:
 431             raise ExtractorError(note_m, expected=True)
 432
 433         mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
 434         if mobj:
 435             msg = json.loads(mobj.group('msg'))
 436             if msg.get('type') == 'error':
 437                 raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
 438
 439         if 'To view this, please log in to verify you are 18 or older.' in webpage:
 440             self.raise_login_required()
 441
 442         media = self._parse_json(self._search_regex(
 443             r'vilos\.config\.media\s*=\s*({.+?});',
 444             webpage, 'vilos media', default='{}'), video_id)
 445         media_metadata = media.get('metadata') or {}
 446
 447         language = self._search_regex(
 448             r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1',
 449             webpage, 'language', default=None, group='lang')
 450
 451         video_title = self._html_search_regex(
 452             (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>',
 453              r'<title>(.+?),\s+-\s+.+? Crunchyroll'),
 454             webpage, 'video_title', default=None)
 455         if not video_title:
 456             video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage))
 457         video_title = re.sub(r' {2,}', ' ', video_title)
 458         video_description = (self._parse_json(self._html_search_regex(
 459             r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
 460             webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
 461
 462         thumbnails = []
 463         thumbnail_url = (self._parse_json(self._html_search_regex(
 464             r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>',
 465             webpage, 'thumbnail_url', default='{}'), video_id)).get('image')
 466         if thumbnail_url:
 467             thumbnails.append({
 468                 'url': thumbnail_url,
 469                 'width': 1920,
 470                 'height': 1080
 471             })
 472
 473         if video_description:
 474             video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
 475         video_uploader = self._html_search_regex(
 476             # try looking for both an uploader that's a link and one that's not
 477             [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
 478             webpage, 'video_uploader', default=False)
 479
 480         requested_languages = self._configuration_arg('language')
 481         requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')]
 482         language_preference = qualities((requested_languages or [language or ''])[::-1])
 483         hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1])
 484
 485         formats = []
 486         for stream in media.get('streams', []):
 487             audio_lang = stream.get('audio_lang') or ''
 488             hardsub_lang = stream.get('hardsub_lang') or ''
 489             if (requested_languages and audio_lang.lower() not in requested_languages
 490                     or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs):
 491                 continue
 492             vrv_formats = self._extract_vrv_formats(
 493                 stream.get('url'), video_id, stream.get('format'),
 494                 audio_lang, hardsub_lang)
 495             for f in vrv_formats:
 496                 f['language_preference'] = language_preference(audio_lang)
 497                 f['quality'] = hardsub_preference(hardsub_lang)
 498             formats.extend(vrv_formats)
 499         if not formats:
 500             available_fmts = []
 501             for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
 502                 attrs = extract_attributes(a)
 503                 href = attrs.get('href')
 504                 if href and '/freetrial' in href:
 505                     continue
 506                 available_fmts.append(fmt)
 507             if not available_fmts:
 508                 for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
 509                     available_fmts = re.findall(p, webpage)
 510                     if available_fmts:
 511                         break
 512             if not available_fmts:
 513                 available_fmts = self._FORMAT_IDS.keys()
 514             video_encode_ids = []
 515
 516             for fmt in available_fmts:
 517                 stream_quality, stream_format = self._FORMAT_IDS[fmt]
 518                 video_format = fmt + 'p'
 519                 stream_infos = []
 520                 streamdata = self._call_rpc_api(
 521                     'VideoPlayer_GetStandardConfig', video_id,
 522                     'Downloading media info for %s' % video_format, data={
 523                         'media_id': video_id,
 524                         'video_format': stream_format,
 525                         'video_quality': stream_quality,
 526                         'current_page': url,
 527                     })
 528                 if isinstance(streamdata, compat_etree_Element):
 529                     stream_info = streamdata.find('./{default}preload/stream_info')
 530                     if stream_info is not None:
 531                         stream_infos.append(stream_info)
 532                 stream_info = self._call_rpc_api(
 533                     'VideoEncode_GetStreamInfo', video_id,
 534                     'Downloading stream info for %s' % video_format, data={
 535                         'media_id': video_id,
 536                         'video_format': stream_format,
 537                         'video_encode_quality': stream_quality,
 538                     })
 539                 if isinstance(stream_info, compat_etree_Element):
 540                     stream_infos.append(stream_info)
 541                 for stream_info in stream_infos:
 542                     video_encode_id = xpath_text(stream_info, './video_encode_id')
 543                     if video_encode_id in video_encode_ids:
 544                         continue
 545                     video_encode_ids.append(video_encode_id)
 546
 547                     video_file = xpath_text(stream_info, './file')
 548                     if not video_file:
 549                         continue
 550                     if video_file.startswith('http'):
 551                         formats.extend(self._extract_m3u8_formats(
 552                             video_file, video_id, 'mp4', entry_protocol='m3u8_native',
 553                             m3u8_id='hls', fatal=False))
 554                         continue
 555
 556                     video_url = xpath_text(stream_info, './host')
 557                     if not video_url:
 558                         continue
 559                     metadata = stream_info.find('./metadata')
 560                     format_info = {
 561                         'format': video_format,
 562                         'height': int_or_none(xpath_text(metadata, './height')),
 563                         'width': int_or_none(xpath_text(metadata, './width')),
 564                     }
 565
 566                     if '.fplive.net/' in video_url:
 567                         video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
 568                         parsed_video_url = compat_urlparse.urlparse(video_url)
 569                         direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
 570                             netloc='v.lvlt.crcdn.net',
 571                             path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
 572                         if self._is_valid_url(direct_video_url, video_id, video_format):
 573                             format_info.update({
 574                                 'format_id': 'http-' + video_format,
 575                                 'url': direct_video_url,
 576                             })
 577                             formats.append(format_info)
 578                             continue
 579
 580                     format_info.update({
 581                         'format_id': 'rtmp-' + video_format,
 582                         'url': video_url,
 583                         'play_path': video_file,
 584                         'ext': 'flv',
 585                     })
 586                     formats.append(format_info)
 587         self._sort_formats(formats)
 588
 589         metadata = self._call_rpc_api(
 590             'VideoPlayer_GetMediaMetadata', video_id,
 591             note='Downloading media info', data={
 592                 'media_id': video_id,
 593             })
 594
 595         subtitles = {}
 596         for subtitle in media.get('subtitles', []):
 597             subtitle_url = subtitle.get('url')
 598             if not subtitle_url:
 599                 continue
 600             subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
 601                 'url': subtitle_url,
 602                 'ext': subtitle.get('format', 'ass'),
 603             })
 604         if not subtitles:
 605             subtitles = self.extract_subtitles(video_id, webpage)
 606
 607         # webpage provide more accurate data than series_title from XML
 608         series = self._html_search_regex(
 609             r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
 610             webpage, 'series', fatal=False)
 611
 612         season = episode = episode_number = duration = None
 613
 614         if isinstance(metadata, compat_etree_Element):
 615             season = xpath_text(metadata, 'series_title')
 616             episode = xpath_text(metadata, 'episode_title')
 617             episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
 618             duration = float_or_none(media_metadata.get('duration'), 1000)
 619
 620         if not episode:
 621             episode = media_metadata.get('title')
 622         if not episode_number:
 623             episode_number = int_or_none(media_metadata.get('episode_number'))
 624         thumbnail_url = try_get(media, lambda x: x['thumbnail']['url'])
 625         if thumbnail_url:
 626             thumbnails.append({
 627                 'url': thumbnail_url,
 628                 'width': 640,
 629                 'height': 360
 630             })
 631
 632         season_number = int_or_none(self._search_regex(
 633             r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
 634             webpage, 'season number', default=None))
 635
 636         info = self._search_json_ld(webpage, video_id, default={})
 637
 638         return merge_dicts({
 639             'id': video_id,
 640             'title': video_title,
 641             'description': video_description,
 642             'duration': duration,
 643             'thumbnails': thumbnails,
 644             'uploader': video_uploader,
 645             'series': series,
 646             'season': season,
 647             'season_number': season_number,
 648             'episode': episode,
 649             'episode_number': episode_number,
 650             'subtitles': subtitles,
 651             'formats': formats,
 652         }, info)
 653
 654
 655 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
 656     IE_NAME = 'crunchyroll:playlist'
 657     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
 658
 659     _TESTS = [{
 660         'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
 661         'info_dict': {
 662             'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
 663             'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
 664         },
 665         'playlist_count': 13,
 666     }, {
 667         # geo-restricted (US), 18+ maturity wall, non-premium available
 668         'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
 669         'info_dict': {
 670             'id': 'cosplay-complex-ova',
 671             'title': 'Cosplay Complex OVA'
 672         },
 673         'playlist_count': 3,
 674         'skip': 'Georestricted',
 675     }, {
 676         # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
 677         'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
 678         'only_matching': True,
 679     }, {
 680         'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers',
 681         'only_matching': True,
 682     }]
 683
 684     def _real_extract(self, url):
 685         show_id = self._match_id(url)
 686
 687         webpage = self._download_webpage(
 688             # https:// gives a 403, but http:// does not
 689             self._add_skip_wall(url).replace('https://', 'http://'), show_id,
 690             headers=self.geo_verification_headers())
 691         if re.search(r'<div id="preload-data">', webpage):
 692             return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id)
 693         title = self._html_search_meta('name', webpage, default=None)
 694
 695         episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"'
 696         season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)'
 697         paths = re.findall(f'(?s){episode_re}|{season_re}', webpage)
 698
 699         entries, current_season = [], None
 700         for ep_id, ep, season in paths:
 701             if season:
 702                 current_season = season
 703                 continue
 704             entries.append(self.url_result(
 705                 f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season))
 706
 707         return {
 708             '_type': 'playlist',
 709             'id': show_id,
 710             'title': title,
 711             'entries': reversed(entries),
 712         }
 713
 714
 715 class CrunchyrollBetaBaseIE(CrunchyrollBaseIE):
 716     params = None
 717
 718     def _get_params(self, lang):
 719         if not CrunchyrollBetaBaseIE.params:
 720             initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(
 721                 f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
 722             api_domain = app_config['cxApiParams']['apiDomain']
 723             basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii')
 724             auth_response = self._download_json(
 725                 f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie',
 726                 headers={
 727                     'Authorization': 'Basic ' + basic_token
 728                 }, data='grant_type=etp_rt_cookie'.encode('ascii'))
 729             policy_response = self._download_json(
 730                 f'{api_domain}/index/v2', None, note='Retrieving signed policy',
 731                 headers={
 732                     'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
 733                 })
 734             bucket = policy_response['cms']['bucket']
 735             params = {
 736                 'Policy': policy_response['cms']['policy'],
 737                 'Signature': policy_response['cms']['signature'],
 738                 'Key-Pair-Id': policy_response['cms']['key_pair_id']
 739             }
 740             locale = traverse_obj(initial_state, ('localization', 'locale'))
 741             if locale:
 742                 params['locale'] = locale
 743             CrunchyrollBetaBaseIE.params = (api_domain, bucket, params)
 744         return CrunchyrollBetaBaseIE.params
 745
 746     def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey):
 747         initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id)
 748         content_data = initial_state['content']['byId'][internal_id]
 749         if is_episode:
 750             video_id = content_data['external_id'].split('.')[1]
 751             series_id = content_data['episode_metadata']['series_slug_title']
 752         else:
 753             series_id = content_data['slug_title']
 754         series_id = re.sub(r'-{2,}', '-', series_id)
 755         url = f'https://www.crunchyroll.com/{lang}{series_id}'
 756         if is_episode:
 757             url = url + f'/{display_id}-{video_id}'
 758         self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}')
 759         return self.url_result(url, iekey, display_id)
 760
 761
 762 class CrunchyrollBetaIE(CrunchyrollBetaBaseIE):
 763     IE_NAME = 'crunchyroll:beta'
 764     _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
 765     _TESTS = [{
 766         'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
 767         'info_dict': {
 768             'id': '696363',
 769             'ext': 'mp4',
 770             'timestamp': 1459610100,
 771             'description': 'md5:a022fbec4fbb023d43631032c91ed64b',
 772             'uploader': 'Toei Animation',
 773             'title': 'World Trigger Episode 73 – To the Future',
 774             'upload_date': '20160402',
 775             'episode_number': 73,
 776             'series': 'World Trigger',
 777             'average_rating': 4.9,
 778             'episode': 'To the Future',
 779             'season': 'World Trigger',
 780             'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg',
 781             'season_number': 1,
 782         },
 783         'params': {'skip_download': 'm3u8'},
 784         'expected_warnings': ['Unable to download XML']
 785     }, {
 786         'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn',
 787         'info_dict': {
 788             'id': '648781',
 789             'ext': 'mp4',
 790             'episode_number': 1,
 791             'timestamp': 1389173400,
 792             'series': 'Love, Chunibyo & Other Delusions - Heart Throb -',
 793             'description': 'md5:5579d1a0355cc618558ba23d27067a62',
 794             'uploader': 'TBS',
 795             'episode': 'Wicked Lord Shingan... Reborn',
 796             'average_rating': 4.9,
 797             'season': 'Love, Chunibyo & Other Delusions - Heart Throb -',
 798             'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg',
 799             'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn',
 800             'season_number': 2,
 801             'upload_date': '20140108',
 802         },
 803         'params': {'skip_download': 'm3u8'},
 804         'expected_warnings': ['Unable to download XML']
 805     }, {
 806         'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/',
 807         'only_matching': True,
 808     }]
 809
 810     def _real_extract(self, url):
 811         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 812
 813         if not self._get_cookies(url).get('etp_rt'):
 814             return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key())
 815
 816         api_domain, bucket, params = self._get_params(lang)
 817
 818         episode_response = self._download_json(
 819             f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
 820             note='Retrieving episode metadata',
 821             query=params)
 822         if episode_response.get('is_premium_only') and not episode_response.get('playback'):
 823             raise ExtractorError('This video is for premium members only.', expected=True)
 824         stream_response = self._download_json(
 825             episode_response['playback'], display_id,
 826             note='Retrieving stream info')
 827
 828         thumbnails = []
 829         for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')):
 830             for thumbnail_data in thumbnails_data:
 831                 thumbnails.append({
 832                     'url': thumbnail_data.get('source'),
 833                     'width': thumbnail_data.get('width'),
 834                     'height': thumbnail_data.get('height'),
 835                 })
 836         subtitles = {}
 837         for lang, subtitle_data in stream_response.get('subtitles').items():
 838             subtitles[lang] = [{
 839                 'url': subtitle_data.get('url'),
 840                 'ext': subtitle_data.get('format')
 841             }]
 842
 843         requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
 844         hardsub_preference = qualities(requested_hardsubs[::-1])
 845         requested_formats = self._configuration_arg('format') or ['adaptive_hls']
 846
 847         formats = []
 848         for stream_type, streams in stream_response.get('streams', {}).items():
 849             if stream_type not in requested_formats:
 850                 continue
 851             for stream in streams.values():
 852                 hardsub_lang = stream.get('hardsub_locale') or ''
 853                 if hardsub_lang.lower() not in requested_hardsubs:
 854                     continue
 855                 format_id = join_nonempty(
 856                     stream_type,
 857                     format_field(stream, 'hardsub_locale', 'hardsub-%s'))
 858                 if not stream.get('url'):
 859                     continue
 860                 if stream_type.split('_')[-1] == 'hls':
 861                     adaptive_formats = self._extract_m3u8_formats(
 862                         stream['url'], display_id, 'mp4', m3u8_id=format_id,
 863                         note='Downloading %s information' % format_id,
 864                         fatal=False)
 865                 elif stream_type.split('_')[-1] == 'dash':
 866                     adaptive_formats = self._extract_mpd_formats(
 867                         stream['url'], display_id, mpd_id=format_id,
 868                         note='Downloading %s information' % format_id,
 869                         fatal=False)
 870                 for f in adaptive_formats:
 871                     if f.get('acodec') != 'none':
 872                         f['language'] = stream_response.get('audio_locale')
 873                     f['quality'] = hardsub_preference(hardsub_lang.lower())
 874                 formats.extend(adaptive_formats)
 875         self._sort_formats(formats)
 876
 877         return {
 878             'id': internal_id,
 879             'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
 880             'description': episode_response.get('description').replace(r'\r\n', '\n'),
 881             'duration': float_or_none(episode_response.get('duration_ms'), 1000),
 882             'thumbnails': thumbnails,
 883             'series': episode_response.get('series_title'),
 884             'series_id': episode_response.get('series_id'),
 885             'season': episode_response.get('season_title'),
 886             'season_id': episode_response.get('season_id'),
 887             'season_number': episode_response.get('season_number'),
 888             'episode': episode_response.get('title'),
 889             'episode_number': episode_response.get('sequence_number'),
 890             'subtitles': subtitles,
 891             'formats': formats
 892         }
 893
 894
 895 class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE):
 896     IE_NAME = 'crunchyroll:playlist:beta'
 897     _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
 898     _TESTS = [{
 899         'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
 900         'info_dict': {
 901             'id': 'girl-friend-beta',
 902             'title': 'Girl Friend BETA',
 903         },
 904         'playlist_mincount': 10,
 905     }, {
 906         'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--',
 907         'info_dict': {
 908             'id': 'love-chunibyo-other-delusions-heart-throb-',
 909             'title': 'Love, Chunibyo & Other Delusions - Heart Throb -',
 910         },
 911         'playlist_mincount': 10,
 912     }, {
 913         'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA',
 914         'only_matching': True,
 915     }]
 916
 917     def _real_extract(self, url):
 918         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 919
 920         if not self._get_cookies(url).get('etp_rt'):
 921             return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key())
 922
 923         api_domain, bucket, params = self._get_params(lang)
 924
 925         series_response = self._download_json(
 926             f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
 927             note='Retrieving series metadata', query=params)
 928
 929         seasons_response = self._download_json(
 930             f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
 931             note='Retrieving season list', query=params)
 932
 933         def entries():
 934             for season in seasons_response['items']:
 935                 episodes_response = self._download_json(
 936                     f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
 937                     note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
 938                 for episode in episodes_response['items']:
 939                     episode_id = episode['id']
 940                     episode_display_id = episode['slug_title']
 941                     yield {
 942                         '_type': 'url',
 943                         'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
 944                         'ie_key': CrunchyrollBetaIE.ie_key(),
 945                         'id': episode_id,
 946                         'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
 947                         'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
 948                         'duration': float_or_none(episode.get('duration_ms'), 1000),
 949                         'series': episode.get('series_title'),
 950                         'series_id': episode.get('series_id'),
 951                         'season': episode.get('season_title'),
 952                         'season_id': episode.get('season_id'),
 953                         'season_number': episode.get('season_number'),
 954                         'episode': episode.get('title'),
 955                         'episode_number': episode.get('sequence_number')
 956                     }
 957
 958         return self.playlist_result(entries(), internal_id, series_response.get('title'))