yt_dlp/extractor/crunchyroll.py

   1 import base64
   2 import json
   3 import re
   4 import urllib.request
   5 import xml.etree.ElementTree
   6 import zlib
   7 from hashlib import sha1
   8 from math import floor, pow, sqrt
   9
  10 from .common import InfoExtractor
  11 from .vrv import VRVBaseIE
  12 from ..aes import aes_cbc_decrypt
  13 from ..compat import (
  14     compat_b64decode,
  15     compat_etree_fromstring,
  16     compat_str,
  17     compat_urllib_parse_urlencode,
  18     compat_urlparse,
  19 )
  20 from ..utils import (
  21     ExtractorError,
  22     bytes_to_intlist,
  23     extract_attributes,
  24     float_or_none,
  25     format_field,
  26     int_or_none,
  27     intlist_to_bytes,
  28     join_nonempty,
  29     lowercase_escape,
  30     merge_dicts,
  31     qualities,
  32     remove_end,
  33     sanitized_Request,
  34     traverse_obj,
  35     try_get,
  36     xpath_text,
  37 )
  38
  39
  40 class CrunchyrollBaseIE(InfoExtractor):
  41     _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login'
  42     _API_BASE = 'https://api.crunchyroll.com'
  43     _NETRC_MACHINE = 'crunchyroll'
  44
  45     def _call_rpc_api(self, method, video_id, note=None, data=None):
  46         data = data or {}
  47         data['req'] = 'RpcApi' + method
  48         data = compat_urllib_parse_urlencode(data).encode('utf-8')
  49         return self._download_xml(
  50             'https://www.crunchyroll.com/xml/',
  51             video_id, note, fatal=False, data=data, headers={
  52                 'Content-Type': 'application/x-www-form-urlencoded',
  53             })
  54
  55     def _perform_login(self, username, password):
  56         if self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  57             return
  58
  59         upsell_response = self._download_json(
  60             f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
  61             query={
  62                 'sess_id': 1,
  63                 'device_id': 'whatvalueshouldbeforweb',
  64                 'device_type': 'com.crunchyroll.static',
  65                 'access_token': 'giKq5eY27ny3cqz',
  66                 'referer': self._LOGIN_URL
  67             })
  68         if upsell_response['code'] != 'ok':
  69             raise ExtractorError('Could not get session id')
  70         session_id = upsell_response['data']['session_id']
  71
  72         login_response = self._download_json(
  73             f'{self._API_BASE}/login.1.json', None, 'Logging in',
  74             data=compat_urllib_parse_urlencode({
  75                 'account': username,
  76                 'password': password,
  77                 'session_id': session_id
  78             }).encode('ascii'))
  79         if login_response['code'] != 'ok':
  80             raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
  81         if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  82             raise ExtractorError('Login succeeded but did not set etp_rt cookie')
  83
  84     # Beta-specific, but needed for redirects
  85     def _get_beta_embedded_json(self, webpage, display_id):
  86         initial_state = self._parse_json(self._search_regex(
  87             r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id)
  88         app_config = self._parse_json(self._search_regex(
  89             r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id)
  90         return initial_state, app_config
  91
  92     def _redirect_to_beta(self, webpage, iekey, video_id):
  93         if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
  94             raise ExtractorError('Received a beta page from non-beta url when not logged in.')
  95         initial_state, app_config = self._get_beta_embedded_json(webpage, video_id)
  96         url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname']
  97         self.to_screen(f'{video_id}: Redirected to beta site - {url}')
  98         return self.url_result(f'{url}', iekey, video_id)
  99
 100     @staticmethod
 101     def _add_skip_wall(url):
 102         parsed_url = compat_urlparse.urlparse(url)
 103         qs = compat_urlparse.parse_qs(parsed_url.query)
 104         # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message:
 105         # > This content may be inappropriate for some people.
 106         # > Are you sure you want to continue?
 107         # since it's not disabled by default in crunchyroll account's settings.
 108         # See https://github.com/ytdl-org/youtube-dl/issues/7202.
 109         qs['skip_wall'] = ['1']
 110         return compat_urlparse.urlunparse(
 111             parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
 112
 113
 114 class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE):
 115     IE_NAME = 'crunchyroll'
 116     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?!series/|watch/)(?:[^/]+/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)'
 117     _TESTS = [{
 118         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
 119         'info_dict': {
 120             'id': '645513',
 121             'ext': 'mp4',
 122             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
 123             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
 124             'thumbnail': r're:^https?://.*\.jpg$',
 125             'uploader': 'Yomiuri Telecasting Corporation (YTV)',
 126             'upload_date': '20131013',
 127             'url': 're:(?!.*&amp)',
 128         },
 129         'params': {
 130             # rtmp
 131             'skip_download': True,
 132         },
 133         'skip': 'Video gone',
 134     }, {
 135         'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
 136         'info_dict': {
 137             'id': '589804',
 138             'ext': 'flv',
 139             'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
 140             'description': 'md5:2fbc01f90b87e8e9137296f37b461c12',
 141             'thumbnail': r're:^https?://.*\.jpg$',
 142             'uploader': 'Danny Choo Network',
 143             'upload_date': '20120213',
 144         },
 145         'params': {
 146             # rtmp
 147             'skip_download': True,
 148         },
 149         'skip': 'Video gone',
 150     }, {
 151         'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409',
 152         'info_dict': {
 153             'id': '702409',
 154             'ext': 'mp4',
 155             'title': compat_str,
 156             'description': compat_str,
 157             'thumbnail': r're:^https?://.*\.jpg$',
 158             'uploader': 'Re:Zero Partners',
 159             'timestamp': 1462098900,
 160             'upload_date': '20160501',
 161         },
 162         'params': {
 163             # m3u8 download
 164             'skip_download': True,
 165         },
 166     }, {
 167         'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589',
 168         'info_dict': {
 169             'id': '727589',
 170             'ext': 'mp4',
 171             'title': compat_str,
 172             'description': compat_str,
 173             'thumbnail': r're:^https?://.*\.jpg$',
 174             'uploader': 'Kadokawa Pictures Inc.',
 175             'timestamp': 1484130900,
 176             'upload_date': '20170111',
 177             'series': compat_str,
 178             'season': "KONOSUBA -God's blessing on this wonderful world! 2",
 179             'season_number': 2,
 180             'episode': 'Give Me Deliverance From This Judicial Injustice!',
 181             'episode_number': 1,
 182         },
 183         'params': {
 184             # m3u8 download
 185             'skip_download': True,
 186         },
 187     }, {
 188         'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
 189         'only_matching': True,
 190     }, {
 191         # geo-restricted (US), 18+ maturity wall, non-premium available
 192         'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
 193         'only_matching': True,
 194     }, {
 195         # A description with double quotes
 196         'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080',
 197         'info_dict': {
 198             'id': '535080',
 199             'ext': 'mp4',
 200             'title': compat_str,
 201             'description': compat_str,
 202             'uploader': 'Marvelous AQL Inc.',
 203             'timestamp': 1255512600,
 204             'upload_date': '20091014',
 205         },
 206         'params': {
 207             # Just test metadata extraction
 208             'skip_download': True,
 209         },
 210     }, {
 211         # make sure we can extract an uploader name that's not a link
 212         'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899',
 213         'info_dict': {
 214             'id': '606899',
 215             'ext': 'mp4',
 216             'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors',
 217             'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"',
 218             'uploader': 'Geneon Entertainment',
 219             'upload_date': '20120717',
 220         },
 221         'params': {
 222             # just test metadata extraction
 223             'skip_download': True,
 224         },
 225         'skip': 'Video gone',
 226     }, {
 227         # A video with a vastly different season name compared to the series name
 228         'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532',
 229         'info_dict': {
 230             'id': '590532',
 231             'ext': 'mp4',
 232             'title': compat_str,
 233             'description': compat_str,
 234             'uploader': 'TV TOKYO',
 235             'timestamp': 1330956000,
 236             'upload_date': '20120305',
 237             'series': 'Nyarko-san: Another Crawling Chaos',
 238             'season': 'Haiyoru! Nyaruani (ONA)',
 239         },
 240         'params': {
 241             # Just test metadata extraction
 242             'skip_download': True,
 243         },
 244     }, {
 245         'url': 'http://www.crunchyroll.com/media-723735',
 246         'only_matching': True,
 247     }, {
 248         'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921',
 249         'only_matching': True,
 250     }]
 251
 252     _FORMAT_IDS = {
 253         '360': ('60', '106'),
 254         '480': ('61', '106'),
 255         '720': ('62', '106'),
 256         '1080': ('80', '108'),
 257     }
 258
 259     def _download_webpage(self, url_or_request, *args, **kwargs):
 260         request = (url_or_request if isinstance(url_or_request, urllib.request.Request)
 261                    else sanitized_Request(url_or_request))
 262         # Accept-Language must be set explicitly to accept any language to avoid issues
 263         # similar to https://github.com/ytdl-org/youtube-dl/issues/6797.
 264         # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction
 265         # should be imposed or not (from what I can see it just takes the first language
 266         # ignoring the priority and requires it to correspond the IP). By the way this causes
 267         # Crunchyroll to not work in georestriction cases in some browsers that don't place
 268         # the locale lang first in header. However allowing any language seems to workaround the issue.
 269         request.add_header('Accept-Language', '*')
 270         return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs)
 271
 272     def _decrypt_subtitles(self, data, iv, id):
 273         data = bytes_to_intlist(compat_b64decode(data))
 274         iv = bytes_to_intlist(compat_b64decode(iv))
 275         id = int(id)
 276
 277         def obfuscate_key_aux(count, modulo, start):
 278             output = list(start)
 279             for _ in range(count):
 280                 output.append(output[-1] + output[-2])
 281             # cut off start values
 282             output = output[2:]
 283             output = list(map(lambda x: x % modulo + 33, output))
 284             return output
 285
 286         def obfuscate_key(key):
 287             num1 = int(floor(pow(2, 25) * sqrt(6.9)))
 288             num2 = (num1 ^ key) << 5
 289             num3 = key ^ num1
 290             num4 = num3 ^ (num3 >> 3) ^ num2
 291             prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
 292             shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
 293             # Extend 160 Bit hash to 256 Bit
 294             return shaHash + [0] * 12
 295
 296         key = obfuscate_key(id)
 297
 298         decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
 299         return zlib.decompress(decrypted_data)
 300
 301     def _convert_subtitles_to_srt(self, sub_root):
 302         output = ''
 303
 304         for i, event in enumerate(sub_root.findall('./events/event'), 1):
 305             start = event.attrib['start'].replace('.', ',')
 306             end = event.attrib['end'].replace('.', ',')
 307             text = event.attrib['text'].replace('\\N', '\n')
 308             output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
 309         return output
 310
 311     def _convert_subtitles_to_ass(self, sub_root):
 312         output = ''
 313
 314         def ass_bool(strvalue):
 315             assvalue = '0'
 316             if strvalue == '1':
 317                 assvalue = '-1'
 318             return assvalue
 319
 320         output = '[Script Info]\n'
 321         output += 'Title: %s\n' % sub_root.attrib['title']
 322         output += 'ScriptType: v4.00+\n'
 323         output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style']
 324         output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x']
 325         output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y']
 326         output += """
 327 [V4+ Styles]
 328 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
 329 """
 330         for style in sub_root.findall('./styles/style'):
 331             output += 'Style: ' + style.attrib['name']
 332             output += ',' + style.attrib['font_name']
 333             output += ',' + style.attrib['font_size']
 334             output += ',' + style.attrib['primary_colour']
 335             output += ',' + style.attrib['secondary_colour']
 336             output += ',' + style.attrib['outline_colour']
 337             output += ',' + style.attrib['back_colour']
 338             output += ',' + ass_bool(style.attrib['bold'])
 339             output += ',' + ass_bool(style.attrib['italic'])
 340             output += ',' + ass_bool(style.attrib['underline'])
 341             output += ',' + ass_bool(style.attrib['strikeout'])
 342             output += ',' + style.attrib['scale_x']
 343             output += ',' + style.attrib['scale_y']
 344             output += ',' + style.attrib['spacing']
 345             output += ',' + style.attrib['angle']
 346             output += ',' + style.attrib['border_style']
 347             output += ',' + style.attrib['outline']
 348             output += ',' + style.attrib['shadow']
 349             output += ',' + style.attrib['alignment']
 350             output += ',' + style.attrib['margin_l']
 351             output += ',' + style.attrib['margin_r']
 352             output += ',' + style.attrib['margin_v']
 353             output += ',' + style.attrib['encoding']
 354             output += '\n'
 355
 356         output += """
 357 [Events]
 358 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 359 """
 360         for event in sub_root.findall('./events/event'):
 361             output += 'Dialogue: 0'
 362             output += ',' + event.attrib['start']
 363             output += ',' + event.attrib['end']
 364             output += ',' + event.attrib['style']
 365             output += ',' + event.attrib['name']
 366             output += ',' + event.attrib['margin_l']
 367             output += ',' + event.attrib['margin_r']
 368             output += ',' + event.attrib['margin_v']
 369             output += ',' + event.attrib['effect']
 370             output += ',' + event.attrib['text']
 371             output += '\n'
 372
 373         return output
 374
 375     def _extract_subtitles(self, subtitle):
 376         sub_root = compat_etree_fromstring(subtitle)
 377         return [{
 378             'ext': 'srt',
 379             'data': self._convert_subtitles_to_srt(sub_root),
 380         }, {
 381             'ext': 'ass',
 382             'data': self._convert_subtitles_to_ass(sub_root),
 383         }]
 384
 385     def _get_subtitles(self, video_id, webpage):
 386         subtitles = {}
 387         for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
 388             sub_doc = self._call_rpc_api(
 389                 'Subtitle_GetXml', video_id,
 390                 'Downloading subtitles for ' + sub_name, data={
 391                     'subtitle_script_id': sub_id,
 392                 })
 393             if not isinstance(sub_doc, xml.etree.ElementTree.Element):
 394                 continue
 395             sid = sub_doc.get('id')
 396             iv = xpath_text(sub_doc, 'iv', 'subtitle iv')
 397             data = xpath_text(sub_doc, 'data', 'subtitle data')
 398             if not sid or not iv or not data:
 399                 continue
 400             subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8')
 401             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
 402             if not lang_code:
 403                 continue
 404             subtitles[lang_code] = self._extract_subtitles(subtitle)
 405         return subtitles
 406
 407     def _real_extract(self, url):
 408         mobj = self._match_valid_url(url)
 409         video_id = mobj.group('id')
 410
 411         if mobj.group('prefix') == 'm':
 412             mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
 413             webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
 414         else:
 415             webpage_url = 'http://www.' + mobj.group('url')
 416
 417         webpage = self._download_webpage(
 418             self._add_skip_wall(webpage_url), video_id,
 419             headers=self.geo_verification_headers())
 420         if re.search(r'<div id="preload-data">', webpage):
 421             return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id)
 422         note_m = self._html_search_regex(
 423             r'<div class="showmedia-trailer-notice">(.+?)</div>',
 424             webpage, 'trailer-notice', default='')
 425         if note_m:
 426             raise ExtractorError(note_m, expected=True)
 427
 428         mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
 429         if mobj:
 430             msg = json.loads(mobj.group('msg'))
 431             if msg.get('type') == 'error':
 432                 raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
 433
 434         if 'To view this, please log in to verify you are 18 or older.' in webpage:
 435             self.raise_login_required()
 436
 437         media = self._parse_json(self._search_regex(
 438             r'vilos\.config\.media\s*=\s*({.+?});',
 439             webpage, 'vilos media', default='{}'), video_id)
 440         media_metadata = media.get('metadata') or {}
 441
 442         language = self._search_regex(
 443             r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1',
 444             webpage, 'language', default=None, group='lang')
 445
 446         video_title = self._html_search_regex(
 447             (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>',
 448              r'<title>(.+?),\s+-\s+.+? Crunchyroll'),
 449             webpage, 'video_title', default=None)
 450         if not video_title:
 451             video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage))
 452         video_title = re.sub(r' {2,}', ' ', video_title)
 453         video_description = (self._parse_json(self._html_search_regex(
 454             r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
 455             webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
 456
 457         thumbnails = []
 458         thumbnail_url = (self._parse_json(self._html_search_regex(
 459             r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>',
 460             webpage, 'thumbnail_url', default='{}'), video_id)).get('image')
 461         if thumbnail_url:
 462             thumbnails.append({
 463                 'url': thumbnail_url,
 464                 'width': 1920,
 465                 'height': 1080
 466             })
 467
 468         if video_description:
 469             video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
 470         video_uploader = self._html_search_regex(
 471             # try looking for both an uploader that's a link and one that's not
 472             [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
 473             webpage, 'video_uploader', default=False)
 474
 475         requested_languages = self._configuration_arg('language')
 476         requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')]
 477         language_preference = qualities((requested_languages or [language or ''])[::-1])
 478         hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1])
 479
 480         formats = []
 481         for stream in media.get('streams', []):
 482             audio_lang = stream.get('audio_lang') or ''
 483             hardsub_lang = stream.get('hardsub_lang') or ''
 484             if (requested_languages and audio_lang.lower() not in requested_languages
 485                     or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs):
 486                 continue
 487             vrv_formats = self._extract_vrv_formats(
 488                 stream.get('url'), video_id, stream.get('format'),
 489                 audio_lang, hardsub_lang)
 490             for f in vrv_formats:
 491                 f['language_preference'] = language_preference(audio_lang)
 492                 f['quality'] = hardsub_preference(hardsub_lang)
 493             formats.extend(vrv_formats)
 494         if not formats:
 495             available_fmts = []
 496             for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
 497                 attrs = extract_attributes(a)
 498                 href = attrs.get('href')
 499                 if href and '/freetrial' in href:
 500                     continue
 501                 available_fmts.append(fmt)
 502             if not available_fmts:
 503                 for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
 504                     available_fmts = re.findall(p, webpage)
 505                     if available_fmts:
 506                         break
 507             if not available_fmts:
 508                 available_fmts = self._FORMAT_IDS.keys()
 509             video_encode_ids = []
 510
 511             for fmt in available_fmts:
 512                 stream_quality, stream_format = self._FORMAT_IDS[fmt]
 513                 video_format = fmt + 'p'
 514                 stream_infos = []
 515                 streamdata = self._call_rpc_api(
 516                     'VideoPlayer_GetStandardConfig', video_id,
 517                     'Downloading media info for %s' % video_format, data={
 518                         'media_id': video_id,
 519                         'video_format': stream_format,
 520                         'video_quality': stream_quality,
 521                         'current_page': url,
 522                     })
 523                 if isinstance(streamdata, xml.etree.ElementTree.Element):
 524                     stream_info = streamdata.find('./{default}preload/stream_info')
 525                     if stream_info is not None:
 526                         stream_infos.append(stream_info)
 527                 stream_info = self._call_rpc_api(
 528                     'VideoEncode_GetStreamInfo', video_id,
 529                     'Downloading stream info for %s' % video_format, data={
 530                         'media_id': video_id,
 531                         'video_format': stream_format,
 532                         'video_encode_quality': stream_quality,
 533                     })
 534                 if isinstance(stream_info, xml.etree.ElementTree.Element):
 535                     stream_infos.append(stream_info)
 536                 for stream_info in stream_infos:
 537                     video_encode_id = xpath_text(stream_info, './video_encode_id')
 538                     if video_encode_id in video_encode_ids:
 539                         continue
 540                     video_encode_ids.append(video_encode_id)
 541
 542                     video_file = xpath_text(stream_info, './file')
 543                     if not video_file:
 544                         continue
 545                     if video_file.startswith('http'):
 546                         formats.extend(self._extract_m3u8_formats(
 547                             video_file, video_id, 'mp4', entry_protocol='m3u8_native',
 548                             m3u8_id='hls', fatal=False))
 549                         continue
 550
 551                     video_url = xpath_text(stream_info, './host')
 552                     if not video_url:
 553                         continue
 554                     metadata = stream_info.find('./metadata')
 555                     format_info = {
 556                         'format': video_format,
 557                         'height': int_or_none(xpath_text(metadata, './height')),
 558                         'width': int_or_none(xpath_text(metadata, './width')),
 559                     }
 560
 561                     if '.fplive.net/' in video_url:
 562                         video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
 563                         parsed_video_url = compat_urlparse.urlparse(video_url)
 564                         direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
 565                             netloc='v.lvlt.crcdn.net',
 566                             path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
 567                         if self._is_valid_url(direct_video_url, video_id, video_format):
 568                             format_info.update({
 569                                 'format_id': 'http-' + video_format,
 570                                 'url': direct_video_url,
 571                             })
 572                             formats.append(format_info)
 573                             continue
 574
 575                     format_info.update({
 576                         'format_id': 'rtmp-' + video_format,
 577                         'url': video_url,
 578                         'play_path': video_file,
 579                         'ext': 'flv',
 580                     })
 581                     formats.append(format_info)
 582         self._sort_formats(formats)
 583
 584         metadata = self._call_rpc_api(
 585             'VideoPlayer_GetMediaMetadata', video_id,
 586             note='Downloading media info', data={
 587                 'media_id': video_id,
 588             })
 589
 590         subtitles = {}
 591         for subtitle in media.get('subtitles', []):
 592             subtitle_url = subtitle.get('url')
 593             if not subtitle_url:
 594                 continue
 595             subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
 596                 'url': subtitle_url,
 597                 'ext': subtitle.get('format', 'ass'),
 598             })
 599         if not subtitles:
 600             subtitles = self.extract_subtitles(video_id, webpage)
 601
 602         # webpage provide more accurate data than series_title from XML
 603         series = self._html_search_regex(
 604             r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
 605             webpage, 'series', fatal=False)
 606
 607         season = episode = episode_number = duration = None
 608
 609         if isinstance(metadata, xml.etree.ElementTree.Element):
 610             season = xpath_text(metadata, 'series_title')
 611             episode = xpath_text(metadata, 'episode_title')
 612             episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
 613             duration = float_or_none(media_metadata.get('duration'), 1000)
 614
 615         if not episode:
 616             episode = media_metadata.get('title')
 617         if not episode_number:
 618             episode_number = int_or_none(media_metadata.get('episode_number'))
 619         thumbnail_url = try_get(media, lambda x: x['thumbnail']['url'])
 620         if thumbnail_url:
 621             thumbnails.append({
 622                 'url': thumbnail_url,
 623                 'width': 640,
 624                 'height': 360
 625             })
 626
 627         season_number = int_or_none(self._search_regex(
 628             r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
 629             webpage, 'season number', default=None))
 630
 631         info = self._search_json_ld(webpage, video_id, default={})
 632
 633         return merge_dicts({
 634             'id': video_id,
 635             'title': video_title,
 636             'description': video_description,
 637             'duration': duration,
 638             'thumbnails': thumbnails,
 639             'uploader': video_uploader,
 640             'series': series,
 641             'season': season,
 642             'season_number': season_number,
 643             'episode': episode,
 644             'episode_number': episode_number,
 645             'subtitles': subtitles,
 646             'formats': formats,
 647         }, info)
 648
 649
 650 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
 651     IE_NAME = 'crunchyroll:playlist'
 652     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{2}(?:-\w{2})?/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
 653
 654     _TESTS = [{
 655         'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
 656         'info_dict': {
 657             'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
 658             'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
 659         },
 660         'playlist_count': 13,
 661     }, {
 662         # geo-restricted (US), 18+ maturity wall, non-premium available
 663         'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
 664         'info_dict': {
 665             'id': 'cosplay-complex-ova',
 666             'title': 'Cosplay Complex OVA'
 667         },
 668         'playlist_count': 3,
 669         'skip': 'Georestricted',
 670     }, {
 671         # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
 672         'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
 673         'only_matching': True,
 674     }, {
 675         'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers',
 676         'only_matching': True,
 677     }]
 678
 679     def _real_extract(self, url):
 680         show_id = self._match_id(url)
 681
 682         webpage = self._download_webpage(
 683             # https:// gives a 403, but http:// does not
 684             self._add_skip_wall(url).replace('https://', 'http://'), show_id,
 685             headers=self.geo_verification_headers())
 686         if re.search(r'<div id="preload-data">', webpage):
 687             return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id)
 688         title = self._html_search_meta('name', webpage, default=None)
 689
 690         episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"'
 691         season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)'
 692         paths = re.findall(f'(?s){episode_re}|{season_re}', webpage)
 693
 694         entries, current_season = [], None
 695         for ep_id, ep, season in paths:
 696             if season:
 697                 current_season = season
 698                 continue
 699             entries.append(self.url_result(
 700                 f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season))
 701
 702         return {
 703             '_type': 'playlist',
 704             'id': show_id,
 705             'title': title,
 706             'entries': reversed(entries),
 707         }
 708
 709
 710 class CrunchyrollBetaBaseIE(CrunchyrollBaseIE):
 711     params = None
 712
 713     def _get_params(self, lang):
 714         if not CrunchyrollBetaBaseIE.params:
 715             initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(
 716                 f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None)
 717             api_domain = app_config['cxApiParams']['apiDomain']
 718             basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii')
 719             auth_response = self._download_json(
 720                 f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie',
 721                 headers={
 722                     'Authorization': 'Basic ' + basic_token
 723                 }, data='grant_type=etp_rt_cookie'.encode('ascii'))
 724             policy_response = self._download_json(
 725                 f'{api_domain}/index/v2', None, note='Retrieving signed policy',
 726                 headers={
 727                     'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']
 728                 })
 729             cms = traverse_obj(policy_response, 'cms_beta', 'cms')
 730             bucket = cms['bucket']
 731             params = {
 732                 'Policy': cms['policy'],
 733                 'Signature': cms['signature'],
 734                 'Key-Pair-Id': cms['key_pair_id']
 735             }
 736             locale = traverse_obj(initial_state, ('localization', 'locale'))
 737             if locale:
 738                 params['locale'] = locale
 739             CrunchyrollBetaBaseIE.params = (api_domain, bucket, params)
 740         return CrunchyrollBetaBaseIE.params
 741
 742     def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey):
 743         initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id)
 744         content_data = initial_state['content']['byId'][internal_id]
 745         if is_episode:
 746             video_id = content_data['external_id'].split('.')[1]
 747             series_id = content_data['episode_metadata']['series_slug_title']
 748         else:
 749             series_id = content_data['slug_title']
 750         series_id = re.sub(r'-{2,}', '-', series_id)
 751         url = f'https://www.crunchyroll.com/{lang}{series_id}'
 752         if is_episode:
 753             url = url + f'/{display_id}-{video_id}'
 754         self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}')
 755         return self.url_result(url, iekey, display_id)
 756
 757
 758 class CrunchyrollBetaIE(CrunchyrollBetaBaseIE):
 759     IE_NAME = 'crunchyroll:beta'
 760     _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{2}(?:-\w{2})?/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
 761     _TESTS = [{
 762         'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future',
 763         'info_dict': {
 764             'id': '696363',
 765             'ext': 'mp4',
 766             'timestamp': 1459610100,
 767             'description': 'md5:a022fbec4fbb023d43631032c91ed64b',
 768             'uploader': 'Toei Animation',
 769             'title': 'World Trigger Episode 73 – To the Future',
 770             'upload_date': '20160402',
 771             'episode_number': 73,
 772             'series': 'World Trigger',
 773             'average_rating': 4.9,
 774             'episode': 'To the Future',
 775             'season': 'World Trigger',
 776             'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg',
 777             'season_number': 1,
 778         },
 779         'params': {'skip_download': 'm3u8'},
 780         'expected_warnings': ['Unable to download XML']
 781     }, {
 782         'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn',
 783         'info_dict': {
 784             'id': '648781',
 785             'ext': 'mp4',
 786             'episode_number': 1,
 787             'timestamp': 1389173400,
 788             'series': 'Love, Chunibyo & Other Delusions - Heart Throb -',
 789             'description': 'md5:5579d1a0355cc618558ba23d27067a62',
 790             'uploader': 'TBS',
 791             'episode': 'Wicked Lord Shingan... Reborn',
 792             'average_rating': 4.9,
 793             'season': 'Love, Chunibyo & Other Delusions - Heart Throb -',
 794             'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg',
 795             'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn',
 796             'season_number': 2,
 797             'upload_date': '20140108',
 798         },
 799         'params': {'skip_download': 'm3u8'},
 800         'expected_warnings': ['Unable to download XML']
 801     }, {
 802         'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/',
 803         'only_matching': True,
 804     }]
 805
 806     def _real_extract(self, url):
 807         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 808
 809         if not self._get_cookies(url).get('etp_rt'):
 810             return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key())
 811
 812         api_domain, bucket, params = self._get_params(lang)
 813
 814         episode_response = self._download_json(
 815             f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id,
 816             note='Retrieving episode metadata', query=params)
 817         if episode_response.get('is_premium_only') and not episode_response.get('playback'):
 818             raise ExtractorError('This video is for premium members only.', expected=True)
 819
 820         stream_response = self._download_json(episode_response['playback'], display_id, note='Retrieving stream info')
 821         get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items()
 822
 823         requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])]
 824         hardsub_preference = qualities(requested_hardsubs[::-1])
 825         requested_formats = self._configuration_arg('format') or ['adaptive_hls']
 826
 827         formats = []
 828         for stream_type, streams in get_streams('streams'):
 829             if stream_type not in requested_formats:
 830                 continue
 831             for stream in streams.values():
 832                 hardsub_lang = stream.get('hardsub_locale') or ''
 833                 if hardsub_lang.lower() not in requested_hardsubs:
 834                     continue
 835                 format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s'))
 836                 if not stream.get('url'):
 837                     continue
 838                 if stream_type.endswith('hls'):
 839                     adaptive_formats = self._extract_m3u8_formats(
 840                         stream['url'], display_id, 'mp4', m3u8_id=format_id,
 841                         fatal=False, note=f'Downloading {format_id} HLS manifest')
 842                 elif stream_type.endswith('dash'):
 843                     adaptive_formats = self._extract_mpd_formats(
 844                         stream['url'], display_id, mpd_id=format_id,
 845                         fatal=False, note=f'Downloading {format_id} MPD manifest')
 846                 for f in adaptive_formats:
 847                     if f.get('acodec') != 'none':
 848                         f['language'] = stream_response.get('audio_locale')
 849                     f['quality'] = hardsub_preference(hardsub_lang.lower())
 850                 formats.extend(adaptive_formats)
 851         self._sort_formats(formats)
 852
 853         return {
 854             'id': internal_id,
 855             'title': '%s Episode %s – %s' % (
 856                 episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')),
 857             'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')),
 858             'duration': float_or_none(episode_response.get('duration_ms'), 1000),
 859             'series': episode_response.get('series_title'),
 860             'series_id': episode_response.get('series_id'),
 861             'season': episode_response.get('season_title'),
 862             'season_id': episode_response.get('season_id'),
 863             'season_number': episode_response.get('season_number'),
 864             'episode': episode_response.get('title'),
 865             'episode_number': episode_response.get('sequence_number'),
 866             'formats': formats,
 867             'thumbnails': [{
 868                 'url': thumb.get('source'),
 869                 'width': thumb.get('width'),
 870                 'height': thumb.get('height'),
 871             } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []],
 872             'subtitles': {
 873                 lang: [{
 874                     'url': subtitle_data.get('url'),
 875                     'ext': subtitle_data.get('format')
 876                 }] for lang, subtitle_data in get_streams('subtitles')
 877             },
 878         }
 879
 880
 881 class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE):
 882     IE_NAME = 'crunchyroll:playlist:beta'
 883     _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{2}(?:-\w{2})?/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)'
 884     _TESTS = [{
 885         'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA',
 886         'info_dict': {
 887             'id': 'girl-friend-beta',
 888             'title': 'Girl Friend BETA',
 889         },
 890         'playlist_mincount': 10,
 891     }, {
 892         'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--',
 893         'info_dict': {
 894             'id': 'love-chunibyo-other-delusions-heart-throb-',
 895             'title': 'Love, Chunibyo & Other Delusions - Heart Throb -',
 896         },
 897         'playlist_mincount': 10,
 898     }, {
 899         'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA',
 900         'only_matching': True,
 901     }, {
 902         'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy',
 903         'only_matching': True,
 904     }]
 905
 906     def _real_extract(self, url):
 907         lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id')
 908
 909         if not self._get_cookies(url).get('etp_rt'):
 910             return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key())
 911
 912         api_domain, bucket, params = self._get_params(lang)
 913
 914         series_response = self._download_json(
 915             f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id,
 916             note='Retrieving series metadata', query=params)
 917
 918         seasons_response = self._download_json(
 919             f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id,
 920             note='Retrieving season list', query=params)
 921
 922         def entries():
 923             for season in seasons_response['items']:
 924                 episodes_response = self._download_json(
 925                     f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id,
 926                     note=f'Retrieving episode list for {season.get("slug_title")}', query=params)
 927                 for episode in episodes_response['items']:
 928                     episode_id = episode['id']
 929                     episode_display_id = episode['slug_title']
 930                     yield {
 931                         '_type': 'url',
 932                         'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}',
 933                         'ie_key': CrunchyrollBetaIE.ie_key(),
 934                         'id': episode_id,
 935                         'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')),
 936                         'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')),
 937                         'duration': float_or_none(episode.get('duration_ms'), 1000),
 938                         'series': episode.get('series_title'),
 939                         'series_id': episode.get('series_id'),
 940                         'season': episode.get('season_title'),
 941                         'season_id': episode.get('season_id'),
 942                         'season_number': episode.get('season_number'),
 943                         'episode': episode.get('title'),
 944                         'episode_number': episode.get('sequence_number')
 945                     }
 946
 947         return self.playlist_result(entries(), internal_id, series_response.get('title'))