[yt-dlp.git] / yt_dlp / extractor / animelab.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor

from ..utils import (
    ExtractorError,
    urlencode_postdata,
    int_or_none,
    str_or_none,
    determine_ext,
)

from ..compat import compat_HTTPError


class AnimeLabBaseIE(InfoExtractor):
    """Common base for the AnimeLab extractors; holds the shared login logic."""

    _LOGIN_URL = 'https://www.animelab.com/login'
    _NETRC_MACHINE = 'animelab'
    # Login state is written on AnimeLabBaseIE itself (see _is_logged_in),
    # so a positive answer is reused instead of re-downloading the login page.
    _LOGGED_IN = False

    def _is_logged_in(self, login_page=None):
        """Return whether the session is logged in, caching a positive result.

        The check looks for the text 'Sign In' in ``login_page``; when no page
        (or an empty one) is supplied, the login page is downloaded first.
        """
        if self._LOGGED_IN:
            return self._LOGGED_IN

        page = login_page or self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')
        # Absence of the 'Sign In' prompt is treated as "already logged in".
        AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in page
        return self._LOGGED_IN

    def _perform_login(self, username, password):
        """POST the credentials to the login URL unless already logged in.

        Raises ExtractorError (expected) on an HTTP 400 response, and a plain
        ExtractorError when the response page still shows the sign-in prompt.
        """
        if self._is_logged_in():
            return

        post_body = urlencode_postdata({
            'email': username,
            'password': password,
        })
        form_headers = {'Content-Type': 'application/x-www-form-urlencoded'}

        try:
            login_response = self._download_webpage(
                self._LOGIN_URL, None, 'Logging in', 'Wrong login info',
                data=post_body, headers=form_headers)
        except ExtractorError as err:
            rejected = isinstance(err.cause, compat_HTTPError) and err.cause.code == 400
            if not rejected:
                raise
            # HTTP 400 is reported as a credentials problem rather than a bug
            raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)

        if self._is_logged_in(login_response):
            return
        raise ExtractorError('Unable to login (cannot verify if logged in)')

    def _real_initialize(self):
        """Refuse to continue when the session is not logged in."""
        if self._is_logged_in():
            return
        self.raise_login_required('Login is required to access any AnimeLab content')


class AnimeLabIE(AnimeLabBaseIE):
    """Extractor for a single AnimeLab player page (``/player/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)'

    # the following tests require authentication, but a free account will suffice
    # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file
    # or you can set 'username' and 'password' there
    # the tests also select a specific format so that the same video is downloaded
    # regardless of whether the user is premium or not (needs testing on a premium account)
    _TEST = {
        'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42',
        'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f',
        'info_dict': {
            'id': '383',
            'ext': 'mp4',
            'display_id': 'fullmetal-alchemist-brotherhood-episode-42',
            'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive',
            'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4',
            'series': 'Fullmetal Alchemist: Brotherhood',
            'episode': 'Signs of a Counteroffensive',
            'episode_number': 42,
            'duration': 1469,
            'season': 'Season 1',
            'season_number': 1,
            'season_id': '38',
        },
        'params': {
            'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]',
        },
        'skip': 'All AnimeLab content requires authentication',
    }

    def _real_extract(self, url):
        """Build the info dict for one episode.

        Both the ``/subtitles`` and ``/dubbed`` variants of the player page are
        downloaded; formats from both are merged (keyed by format id), while
        the metadata fields returned are those parsed from the last page.
        """
        display_id = self._match_id(url)

        # unfortunately we can get different URLs for the same formats
        # e.g. if we are using a "free" account so no dubs available
        # (so _remove_duplicate_formats is not effective)
        # so we use a dictionary as a workaround
        formats = {}
        for language_option_url in ('https://www.animelab.com/player/%s/subtitles',
                                    'https://www.animelab.com/player/%s/dubbed'):
            actual_url = language_option_url % display_id
            webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url)

            video_collection = self._parse_json(self._search_regex(
                r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);',
                webpage, 'AnimeLab VideoCollection'), display_id)
            position = int_or_none(self._search_regex(
                r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position'))

            raw_data = video_collection[position]['videoEntry']

            video_id = str_or_none(raw_data['id'])

            # create a title from many sources (while grabbing other info)
            # TODO use more fallback sources to get some of these
            series = raw_data.get('showTitle')
            # `or {}` so that a JSON null value does not break the chained .get()
            video_type = (raw_data.get('videoEntryType') or {}).get('name')
            episode_number = raw_data.get('episodeNumber')
            episode_name = raw_data.get('name')

            title_parts = (series, video_type, episode_number, episode_name)
            if None not in title_parts:
                title = '%s - %s %s - %s' % title_parts
            else:
                title = episode_name

            description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None)

            duration = int_or_none(raw_data.get('duration'))

            # Thumbnails are optional metadata: tolerate null/missing containers
            # and drop entries that carry no URL at all.
            thumbnails = []
            for thumbnail in raw_data.get('images') or []:
                for instance in thumbnail.get('imageInstances') or []:
                    image_data = instance.get('imageInfo') or {}
                    image_url = image_data.get('fullPath')
                    if not image_url:
                        continue
                    thumbnails.append({
                        'id': str_or_none(image_data.get('id')),
                        'url': image_url,
                        'width': int_or_none(image_data.get('width')),
                        'height': int_or_none(image_data.get('height')),
                    })

            season_data = raw_data.get('season') or {}
            season = str_or_none(season_data.get('name'))
            season_number = int_or_none(season_data.get('seasonNumber'))
            season_id = str_or_none(season_data.get('id'))

            for video_data in raw_data['videoList']:
                language = (video_data.get('language') or {}).get('languageCode')
                is_hardsubbed = video_data.get('hardSubbed')

                for video_instance in video_data['videoInstances']:
                    # named video_url so the `url` argument is not shadowed
                    video_url = video_instance.get('httpUrl') or video_instance.get('rtmpUrl')
                    if not video_url:
                        # this video format is unavailable to the user (not premium etc.)
                        continue

                    # format id: <instance id>_<yes|not>hardsubbed_<language>,
                    # leaving out any part that is unknown
                    format_id_parts = [str_or_none(video_instance.get('id'))]
                    if is_hardsubbed is not None:
                        format_id_parts.append('yeshardsubbed' if is_hardsubbed else 'nothardsubbed')
                    format_id_parts.append(language)
                    format_id = '_'.join([x for x in format_id_parts if x is not None])

                    ext = determine_ext(video_url)
                    if ext == 'm3u8':
                        for format_ in self._extract_m3u8_formats(
                                video_url, video_id, m3u8_id=format_id, fatal=False):
                            formats[format_['format_id']] = format_
                        continue
                    elif ext == 'mpd':
                        for format_ in self._extract_mpd_formats(
                                video_url, video_id, mpd_id=format_id, fatal=False):
                            formats[format_['format_id']] = format_
                        continue

                    current_format = {
                        'language': language,
                        'url': video_url,
                        'format_id': format_id,
                    }

                    quality_data = video_instance.get('videoQuality') or {}
                    quality = quality_data.get('name') or quality_data.get('description')

                    height = None
                    if quality:
                        # the height is the trailing number of the quality label, e.g. "480p"
                        height = int_or_none(self._search_regex(
                            r'(\d+)p?$', quality, 'Video format height', default=None))

                    if height is None:
                        self.report_warning('Could not get height of video')
                    else:
                        current_format['height'] = height

                    formats[format_id] = current_format

        formats = list(formats.values())
        self._sort_formats(formats)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'series': series,
            'episode': episode_name,
            'episode_number': int_or_none(episode_number),
            'thumbnails': thumbnails,
            'duration': duration,
            'formats': formats,
            'season': season,
            'season_number': season_number,
            'season_id': season_id,
        }


class AnimeLabShowsIE(AnimeLabBaseIE):
    """Extractor for an AnimeLab show page (``/shows/<id>``), returned as a playlist."""

    _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)'

    _TEST = {
        'url': 'https://www.animelab.com/shows/attack-on-titan',
        'info_dict': {
            'id': '45',
            'title': 'Attack on Titan',
            'description': 'md5:989d95a2677e9309368d5cf39ba91469',
        },
        'playlist_count': 59,
        'skip': 'All AnimeLab content requires authentication',
    }

    def _real_extract(self, url):
        """Return a playlist with one player-page entry per video of every season.

        Raises ExtractorError when the embedded show data carries no id.
        """
        # https, like every other AnimeLab URL in this file, so the logged-in
        # session is never used over a plaintext connection
        _BASE_URL = 'https://www.animelab.com'
        _SHOWS_API_URL = '/api/videoentries/show/videos/'
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id, 'Downloading requested URL')

        show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data')
        show_data = self._parse_json(show_data_str, display_id)

        show_id = str_or_none(show_data.get('id'))
        if show_id is None:
            # the id is part of the API URL below, so fail with a clear message
            raise ExtractorError('Unable to extract AnimeLab show id')
        title = show_data.get('name')
        description = show_data.get('shortSynopsis') or show_data.get('longSynopsis')

        season_api_url = _BASE_URL + _SHOWS_API_URL + show_id

        entries = []
        for season in show_data['seasons']:
            season_id = season['id']
            # plain GET request; the parameters are appended as a query string
            response = self._download_webpage(
                season_api_url,
                None, 'Season id %s' % season_id,
                query={
                    'seasonId': season_id,
                    'limit': 1000,
                })

            season_data = self._parse_json(response, display_id)

            for video_data in season_data['list']:
                entries.append(self.url_result(
                    _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab',
                    str_or_none(video_data.get('id')), video_data.get('name')
                ))

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': title,
            'description': description,
            'entries': entries,
        }

# TODO implement myqueue
Commit	Line	Data
ba3c9477	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	from .common import InfoExtractor
	5
	6	from ..utils import (
	7	ExtractorError,
	8	urlencode_postdata,
	9	int_or_none,
	10	str_or_none,
	11	determine_ext,
	12	)
	13
	14	from ..compat import compat_HTTPError
	15
	16
	17	class AnimeLabBaseIE(InfoExtractor):
ba3c9477	18	_LOGIN_URL = 'https://www.animelab.com/login'
ba3c9477	19	_NETRC_MACHINE = 'animelab'
52efa4b3	20	_LOGGED_IN = False
ba3c9477	21
52efa4b3	22	def _is_logged_in(self, login_page=None):
	23	if not self._LOGGED_IN:
	24	if not login_page:
	25	login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page')
	26	AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page
	27	return self._LOGGED_IN
ba3c9477	28
52efa4b3	29	def _perform_login(self, username, password):
52efa4b3	30	if self._is_logged_in():
ba3c9477	31	return
ba3c9477	32
ba3c9477	33	login_form = {
	34	'email': username,
	35	'password': password,
	36	}
	37
	38	try:
	39	response = self._download_webpage(
	40	self._LOGIN_URL, None, 'Logging in', 'Wrong login info',
	41	data=urlencode_postdata(login_form),
	42	headers={'Content-Type': 'application/x-www-form-urlencoded'})
	43	except ExtractorError as e:
	44	if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
	45	raise ExtractorError('Unable to log in (wrong credentials?)', expected=True)
52efa4b3	46	raise
ba3c9477	47
52efa4b3	48	if not self._is_logged_in(response):
52efa4b3	49	raise ExtractorError('Unable to login (cannot verify if logged in)')
ba3c9477	50
ba3c9477	51	def _real_initialize(self):
52efa4b3	52	if not self._is_logged_in():
52efa4b3	53	self.raise_login_required('Login is required to access any AnimeLab content')
ba3c9477	54
	55
	56	class AnimeLabIE(AnimeLabBaseIE):
	57	_VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)'
	58
	59	# the following tests require authentication, but a free account will suffice
	60	# just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file
	61	# or you can set 'username' and 'password' there
	62	# the tests also select a specific format so that the same video is downloaded
	63	# regardless of whether the user is premium or not (needs testing on a premium account)
	64	_TEST = {
	65	'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42',
	66	'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f',
	67	'info_dict': {
	68	'id': '383',
	69	'ext': 'mp4',
	70	'display_id': 'fullmetal-alchemist-brotherhood-episode-42',
	71	'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive',
	72	'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4',
	73	'series': 'Fullmetal Alchemist: Brotherhood',
	74	'episode': 'Signs of a Counteroffensive',
	75	'episode_number': 42,
	76	'duration': 1469,
	77	'season': 'Season 1',
	78	'season_number': 1,
	79	'season_id': '38',
	80	},
	81	'params': {
	82	'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]',
	83	},
	84	'skip': 'All AnimeLab content requires authentication',
	85	}
	86
	87	def _real_extract(self, url):
	88	display_id = self._match_id(url)
	89
	90	# unfortunately we can get different URLs for the same formats
	91	# e.g. if we are using a "free" account so no dubs available
	92	# (so _remove_duplicate_formats is not effective)
	93	# so we use a dictionary as a workaround
	94	formats = {}
	95	for language_option_url in ('https://www.animelab.com/player/%s/subtitles',
	96	'https://www.animelab.com/player/%s/dubbed'):
	97	actual_url = language_option_url % display_id
	98	webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url)
	99
	100	video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id)
	101	position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position'))
	102
	103	raw_data = video_collection[position]['videoEntry']
	104
	105	video_id = str_or_none(raw_data['id'])
	106
	107	# create a title from many sources (while grabbing other info)
	108	# TODO use more fallback sources to get some of these
	109	series = raw_data.get('showTitle')
	110	video_type = raw_data.get('videoEntryType', {}).get('name')
	111	episode_number = raw_data.get('episodeNumber')
	112	episode_name = raw_data.get('name')
	113
	114	title_parts = (series, video_type, episode_number, episode_name)
	115	if None not in title_parts:
	116	title = '%s - %s %s - %s' % title_parts
	117	else:
118	title = episode_name
119
120	description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None)
121
122	duration = int_or_none(raw_data.get('duration'))
123
124	thumbnail_data = raw_data.get('images', [])
125	thumbnails = []
126	for thumbnail in thumbnail_data:
127	for instance in thumbnail['imageInstances']:
128	image_data = instance.get('imageInfo', {})
129	thumbnails.append({
130	'id': str_or_none(image_data.get('id')),
131	'url': image_data.get('fullPath'),
132	'width': image_data.get('width'),
133	'height': image_data.get('height'),
134	})
135
136	season_data = raw_data.get('season', {}) or {}
137	season = str_or_none(season_data.get('name'))
138	season_number = int_or_none(season_data.get('seasonNumber'))
139	season_id = str_or_none(season_data.get('id'))
140
141	for video_data in raw_data['videoList']:
142	current_video_list = {}
143	current_video_list['language'] = video_data.get('language', {}).get('languageCode')
144
145	is_hardsubbed = video_data.get('hardSubbed')
146
147	for video_instance in video_data['videoInstances']:
148	httpurl = video_instance.get('httpUrl')
149	url = httpurl if httpurl else video_instance.get('rtmpUrl')
150	if url is None:
151	# this video format is unavailable to the user (not premium etc.)
152	continue
153
154	current_format = current_video_list.copy()
155
156	format_id_parts = []
157
158	format_id_parts.append(str_or_none(video_instance.get('id')))
159
160	if is_hardsubbed is not None:
161	if is_hardsubbed:
162	format_id_parts.append('yeshardsubbed')
163	else:
164	format_id_parts.append('nothardsubbed')
165
166	format_id_parts.append(current_format['language'])
167
168	format_id = '_'.join([x for x in format_id_parts if x is not None])
169
170	ext = determine_ext(url)
171	if ext == 'm3u8':
172	for format_ in self._extract_m3u8_formats(
173	url, video_id, m3u8_id=format_id, fatal=False):
174	formats[format_['format_id']] = format_
175	continue
176	elif ext == 'mpd':
177	for format_ in self._extract_mpd_formats(
178	url, video_id, mpd_id=format_id, fatal=False):
179	formats[format_['format_id']] = format_
180	continue
181
182	current_format['url'] = url
183	quality_data = video_instance.get('videoQuality')
184	if quality_data:
185	quality = quality_data.get('name') or quality_data.get('description')
186	else:
187	quality = None
188
189	height = None
190	if quality:
191	height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None))
192
193	if height is None:
194	self.report_warning('Could not get height of video')
195	else:
196	current_format['height'] = height
197	current_format['format_id'] = format_id
198
199	formats[current_format['format_id']] = current_format
200
201	formats = list(formats.values())
202	self._sort_formats(formats)
203
204	return {
205	'id': video_id,
206	'display_id': display_id,
207	'title': title,
208	'description': description,
209	'series': series,
210	'episode': episode_name,
211	'episode_number': int_or_none(episode_number),
212	'thumbnails': thumbnails,
213	'duration': duration,
214	'formats': formats,
215	'season': season,
216	'season_number': season_number,
217	'season_id': season_id,
218	}
219
220
221	class AnimeLabShowsIE(AnimeLabBaseIE):
222	_VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)'
223
224	_TEST = {
225	'url': 'https://www.animelab.com/shows/attack-on-titan',
226	'info_dict': {
227	'id': '45',
228	'title': 'Attack on Titan',
229	'description': 'md5:989d95a2677e9309368d5cf39ba91469',
230	},
231	'playlist_count': 59,
232	'skip': 'All AnimeLab content requires authentication',
233	}
234
235	def _real_extract(self, url):
236	_BASE_URL = 'http://www.animelab.com'
237	_SHOWS_API_URL = '/api/videoentries/show/videos/'
238	display_id = self._match_id(url)
239
240	webpage = self._download_webpage(url, display_id, 'Downloading requested URL')
241
242	show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data')
243	show_data = self._parse_json(show_data_str, display_id)
244
245	show_id = str_or_none(show_data.get('id'))
246	title = show_data.get('name')
247	description = show_data.get('shortSynopsis') or show_data.get('longSynopsis')
248
249	entries = []
250	for season in show_data['seasons']:
251	season_id = season['id']
252	get_data = urlencode_postdata({
253	'seasonId': season_id,
254	'limit': 1000,
255	})
256	# despite using urlencode_postdata, we are sending a GET request
257	target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" + get_data.decode('utf-8')
258	response = self._download_webpage(
259	target_url,
260	None, 'Season id %s' % season_id)
261
262	season_data = self._parse_json(response, display_id)
263
264	for video_data in season_data['list']:
265	entries.append(self.url_result(
266	_BASE_URL + '/player/' + video_data['slug'], 'AnimeLab',
267	str_or_none(video_data.get('id')), video_data.get('name')
268	))
269
270	return {
271	'_type': 'playlist',
272	'id': show_id,
273	'title': title,
274	'description': description,
275	'entries': entries,
276	}
277
278	# TODO implement myqueue