yt_dlp/extractor/animelab.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5
   6 from ..utils import (
   7     ExtractorError,
   8     urlencode_postdata,
   9     int_or_none,
  10     str_or_none,
  11     determine_ext,
  12 )
  13
  14 from ..compat import compat_HTTPError
  15
  16
  17 class AnimeLabBaseIE(InfoExtractor):
  18     _LOGIN_REQUIRED = True
  19     _LOGIN_URL = 'https://www.animelab.com/login'
  20     _NETRC_MACHINE = 'animelab'
  21
  22     def _login(self):
  23         def is_logged_in(login_webpage):
  24             return 'Sign In' not in login_webpage
  25
  26         login_page = self._download_webpage(
  27             self._LOGIN_URL, None, 'Downloading login page')
  28
  29         # Check if already logged in
  30         if is_logged_in(login_page):
  31             return
  32
  33         (username, password) = self._get_login_info()
  34         if username is None and self._LOGIN_REQUIRED:
  35             self.raise_login_required('Login is required to access any AnimeLab content')
  36
  37         login_form = {
  38             'email': username,
  39             'password': password,
  40         }
  41
  42         try:
  43             response = self._download_webpage(
  44                 self._LOGIN_URL, None, 'Logging in', 'Wrong login info',
  45                 data=urlencode_postdata(login_form),
  46                 headers={'Content-Type': 'application/x-www-form-urlencoded'})
  47         except ExtractorError as e:
  48             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
  49                 raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)
  50             else:
  51                 raise
  52
  53         # if login was successful
  54         if is_logged_in(response):
  55             return
  56
  57         raise ExtractorError('Unable to login (cannot verify if logged in)')
  58
  59     def _real_initialize(self):
  60         self._login()
  61
  62
  63 class AnimeLabIE(AnimeLabBaseIE):
  64     _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)'
  65
  66     # the following tests require authentication, but a free account will suffice
  67     # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file
  68     # or you can set 'username' and 'password' there
  69     # the tests also select a specific format so that the same video is downloaded
  70     # regardless of whether the user is premium or not (needs testing on a premium account)
  71     _TEST = {
  72         'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42',
  73         'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f',
  74         'info_dict': {
  75             'id': '383',
  76             'ext': 'mp4',
  77             'display_id': 'fullmetal-alchemist-brotherhood-episode-42',
  78             'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive',
  79             'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4',
  80             'series': 'Fullmetal Alchemist: Brotherhood',
  81             'episode': 'Signs of a Counteroffensive',
  82             'episode_number': 42,
  83             'duration': 1469,
  84             'season': 'Season 1',
  85             'season_number': 1,
  86             'season_id': '38',
  87         },
  88         'params': {
  89             'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]',
  90         },
  91         'skip': 'All AnimeLab content requires authentication',
  92     }
  93
  94     def _real_extract(self, url):
  95         display_id = self._match_id(url)
  96
  97         # unfortunately we can get different URLs for the same formats
  98         # e.g. if we are using a "free" account so no dubs available
  99         # (so _remove_duplicate_formats is not effective)
 100         # so we use a dictionary as a workaround
 101         formats = {}
 102         for language_option_url in ('https://www.animelab.com/player/%s/subtitles',
 103                                     'https://www.animelab.com/player/%s/dubbed'):
 104             actual_url = language_option_url % display_id
 105             webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url)
 106
 107             video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id)
 108             position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position'))
 109
 110             raw_data = video_collection[position]['videoEntry']
 111
 112             video_id = str_or_none(raw_data['id'])
 113
 114             # create a title from many sources (while grabbing other info)
 115             # TODO use more fallback sources to get some of these
 116             series = raw_data.get('showTitle')
 117             video_type = raw_data.get('videoEntryType', {}).get('name')
 118             episode_number = raw_data.get('episodeNumber')
 119             episode_name = raw_data.get('name')
 120
 121             title_parts = (series, video_type, episode_number, episode_name)
 122             if None not in title_parts:
 123                 title = '%s - %s %s - %s' % title_parts
 124             else:
 125                 title = episode_name
 126
 127             description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None)
 128
 129             duration = int_or_none(raw_data.get('duration'))
 130
 131             thumbnail_data = raw_data.get('images', [])
 132             thumbnails = []
 133             for thumbnail in thumbnail_data:
 134                 for instance in thumbnail['imageInstances']:
 135                     image_data = instance.get('imageInfo', {})
 136                     thumbnails.append({
 137                         'id': str_or_none(image_data.get('id')),
 138                         'url': image_data.get('fullPath'),
 139                         'width': image_data.get('width'),
 140                         'height': image_data.get('height'),
 141                     })
 142
 143             season_data = raw_data.get('season', {}) or {}
 144             season = str_or_none(season_data.get('name'))
 145             season_number = int_or_none(season_data.get('seasonNumber'))
 146             season_id = str_or_none(season_data.get('id'))
 147
 148             for video_data in raw_data['videoList']:
 149                 current_video_list = {}
 150                 current_video_list['language'] = video_data.get('language', {}).get('languageCode')
 151
 152                 is_hardsubbed = video_data.get('hardSubbed')
 153
 154                 for video_instance in video_data['videoInstances']:
 155                     httpurl = video_instance.get('httpUrl')
 156                     url = httpurl if httpurl else video_instance.get('rtmpUrl')
 157                     if url is None:
 158                         # this video format is unavailable to the user (not premium etc.)
 159                         continue
 160
 161                     current_format = current_video_list.copy()
 162
 163                     format_id_parts = []
 164
 165                     format_id_parts.append(str_or_none(video_instance.get('id')))
 166
 167                     if is_hardsubbed is not None:
 168                         if is_hardsubbed:
 169                             format_id_parts.append('yeshardsubbed')
 170                         else:
 171                             format_id_parts.append('nothardsubbed')
 172
 173                     format_id_parts.append(current_format['language'])
 174
 175                     format_id = '_'.join([x for x in format_id_parts if x is not None])
 176
 177                     ext = determine_ext(url)
 178                     if ext == 'm3u8':
 179                         for format_ in self._extract_m3u8_formats(
 180                                 url, video_id, m3u8_id=format_id, fatal=False):
 181                             formats[format_['format_id']] = format_
 182                         continue
 183                     elif ext == 'mpd':
 184                         for format_ in self._extract_mpd_formats(
 185                                 url, video_id, mpd_id=format_id, fatal=False):
 186                             formats[format_['format_id']] = format_
 187                         continue
 188
 189                     current_format['url'] = url
 190                     quality_data = video_instance.get('videoQuality')
 191                     if quality_data:
 192                         quality = quality_data.get('name') or quality_data.get('description')
 193                     else:
 194                         quality = None
 195
 196                     height = None
 197                     if quality:
 198                         height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None))
 199
 200                     if height is None:
 201                         self.report_warning('Could not get height of video')
 202                     else:
 203                         current_format['height'] = height
 204                     current_format['format_id'] = format_id
 205
 206                     formats[current_format['format_id']] = current_format
 207
 208         formats = list(formats.values())
 209         self._sort_formats(formats)
 210
 211         return {
 212             'id': video_id,
 213             'display_id': display_id,
 214             'title': title,
 215             'description': description,
 216             'series': series,
 217             'episode': episode_name,
 218             'episode_number': int_or_none(episode_number),
 219             'thumbnails': thumbnails,
 220             'duration': duration,
 221             'formats': formats,
 222             'season': season,
 223             'season_number': season_number,
 224             'season_id': season_id,
 225         }
 226
 227
 228 class AnimeLabShowsIE(AnimeLabBaseIE):
 229     _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)'
 230
 231     _TEST = {
 232         'url': 'https://www.animelab.com/shows/attack-on-titan',
 233         'info_dict': {
 234             'id': '45',
 235             'title': 'Attack on Titan',
 236             'description': 'md5:989d95a2677e9309368d5cf39ba91469',
 237         },
 238         'playlist_count': 59,
 239         'skip': 'All AnimeLab content requires authentication',
 240     }
 241
 242     def _real_extract(self, url):
 243         _BASE_URL = 'http://www.animelab.com'
 244         _SHOWS_API_URL = '/api/videoentries/show/videos/'
 245         display_id = self._match_id(url)
 246
 247         webpage = self._download_webpage(url, display_id, 'Downloading requested URL')
 248
 249         show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data')
 250         show_data = self._parse_json(show_data_str, display_id)
 251
 252         show_id = str_or_none(show_data.get('id'))
 253         title = show_data.get('name')
 254         description = show_data.get('shortSynopsis') or show_data.get('longSynopsis')
 255
 256         entries = []
 257         for season in show_data['seasons']:
 258             season_id = season['id']
 259             get_data = urlencode_postdata({
 260                 'seasonId': season_id,
 261                 'limit': 1000,
 262             })
 263             # despite using urlencode_postdata, we are sending a GET request
 264             target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" + get_data.decode('utf-8')
 265             response = self._download_webpage(
 266                 target_url,
 267                 None, 'Season id %s' % season_id)
 268
 269             season_data = self._parse_json(response, display_id)
 270
 271             for video_data in season_data['list']:
 272                 entries.append(self.url_result(
 273                     _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab',
 274                     str_or_none(video_data.get('id')), video_data.get('name')
 275                 ))
 276
 277         return {
 278             '_type': 'playlist',
 279             'id': show_id,
 280             'title': title,
 281             'description': description,
 282             'entries': entries,
 283         }
 284
 285 # TODO implement myqueue