[yt-dlp.git] / yt_dlp / extractor / nebula.py

# coding: utf-8
from __future__ import unicode_literals

import itertools
import json
import time
import urllib
import urllib.error
import urllib.parse

from ..utils import (
    ExtractorError,
    parse_iso8601,
    try_get,
)
from .common import InfoExtractor


class NebulaBaseIE(InfoExtractor):
    """
    Shared authentication and metadata helpers for the Nebula extractors.

    `_perform_login` fills in three credentials that the other helpers read from the instance:
    the Nebula API token, the Nebula bearer token and the Zype access token.
    """
    _NETRC_MACHINE = 'watchnebula'

    _nebula_api_token = None
    _nebula_bearer_token = None
    _zype_access_token = None
    # True while _call_nebula_api is re-running the login sequence; keeps a rejected call made
    # during re-login from triggering yet another re-login
    _nebula_reauth_in_progress = False

    def _perform_nebula_auth(self, username=None, password=None):
        """
        Log in to Nebula with e-mail and password and return the API token.

        Credentials that are not passed in are looked up with `_get_login_info()`. The token
        is also stored in the cookie jar as the `nebula-auth` cookie for `nebula.app`, in the
        same URL-quoted JSON form `_retrieve_nebula_api_token` reads back.
        """
        if not (username and password):
            username, password = self._get_login_info()
        if not (username and password):
            self.raise_login_required()

        data = json.dumps({'email': username, 'password': password}).encode('utf8')
        response = self._download_json(
            'https://api.watchnebula.com/api/v1/auth/login/',
            data=data, fatal=False, video_id=None,
            headers={
                'content-type': 'application/json',
                # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
                'cookie': ''
            },
            note='Logging in to Nebula with supplied credentials',
            errnote='Authentication failed or rejected')
        if not response or not response.get('key'):
            self.raise_login_required()

        # save nebula token as cookie (expires one year from now)
        self._set_cookie(
            'nebula.app', 'nebula-auth',
            urllib.parse.quote(
                json.dumps({
                    "apiToken": response["key"],
                    "isLoggingIn": False,
                    "isLoggingOut": False,
                }, separators=(",", ":"))),
            expire_time=int(time.time()) + 86400 * 365,
        )

        return response['key']

    def _retrieve_nebula_api_token(self, username=None, password=None):
        """
        Check cookie jar for valid token. Try to authenticate using credentials if no valid token
        can be found in the cookie jar.

        A cookie that cannot be decoded as a JSON object with a non-empty `apiToken` counts as
        "no valid token" and leads to the credential login instead of aborting extraction.
        """
        nebula_cookie = self._get_cookies('https://nebula.app').get('nebula-auth')
        if nebula_cookie:
            self.to_screen('Authenticating to Nebula with token from cookie jar')
            cookie_data = self._parse_json(
                urllib.parse.unquote(nebula_cookie.value), None, fatal=False)
            # try_get also covers a failed parse (None) and non-object JSON such as a list
            nebula_api_token = try_get(cookie_data, lambda x: x['apiToken'])
            if nebula_api_token:
                return nebula_api_token

        return self._perform_nebula_auth(username, password)

    def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
        """
        Download JSON from a Nebula endpoint with the matching Authorization header.

        `auth_type` selects the header: 'api' sends `Token <api token>`, 'bearer' sends
        `Bearer <bearer token>`. `method` 'POST' sends an empty request body.
        On HTTP 401/403 the login sequence is run once and the call is repeated; any other
        failure, or a 401/403 raised while that re-login is already running, is re-raised.
        """
        assert method in ('GET', 'POST',)
        assert auth_type in ('api', 'bearer',)

        def inner_call():
            # built on every call so that a retry picks up the refreshed tokens
            authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
            return self._download_json(
                url, video_id, note=note, headers={'Authorization': authorization},
                data=b'' if method == 'POST' else None)

        try:
            return inner_call()
        except ExtractorError as exc:
            auth_rejected = (
                isinstance(exc.cause, urllib.error.HTTPError)
                and exc.cause.code in (401, 403))
            if not auth_rejected or self._nebula_reauth_in_progress:
                raise

            # if 401 or 403, attempt credential re-auth and retry
            self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
            self._nebula_reauth_in_progress = True
            try:
                self._perform_login()
            finally:
                self._nebula_reauth_in_progress = False
            return inner_call()

    def _fetch_nebula_bearer_token(self):
        """
        Get a Bearer token for the Nebula API. This will be required to fetch video meta data.
        """
        response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
                                         method='POST',
                                         note='Authorizing to Nebula')
        return response['token']

    def _fetch_zype_access_token(self):
        """
        Get a Zype access token, which is required to access video streams -- in our case: to
        generate video URLs.

        Raises ExtractorError when the user object carries no token; for accounts flagged as
        subscribed the error is marked as expected and tells the user how to generate one.
        """
        user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token')

        access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str)
        if not access_token:
            if try_get(user_object, lambda x: x['is_subscribed'], bool):
                # TODO: Reimplement the same Zype token polling the Nebula frontend implements
                # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
                raise ExtractorError(
                    'Unable to extract Zype access token from Nebula API authentication endpoint. '
                    'Open an arbitrary video in a browser with this account to generate a token',
                    expected=True)
            raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
        return access_token

    def _build_video_info(self, episode):
        """
        Turn a Nebula episode object into a `url_transparent` result that hands the actual
        stream extraction to the Zype extractor, using the stored Zype access token.
        """
        zype_id = episode['zype_id']
        zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}'
        channel_slug = episode['channel_slug']
        return {
            'id': zype_id,
            'display_id': episode['slug'],
            '_type': 'url_transparent',
            'ie_key': 'Zype',
            'url': zype_video_url,
            'title': episode['title'],
            'description': episode['description'],
            'timestamp': parse_iso8601(episode['published_at']),
            'thumbnails': [{
                # 'id': tn.get('name'),  # this appears to be null
                'url': tn['original'],
                # JSON object keys are strings; report the height as a number so that
                # thumbnails compare numerically, and leave it unset for non-numeric keys
                'height': int(key) if str(key).isdecimal() else None,
            } for key, tn in episode['assets']['thumbnail'].items()],
            'duration': episode['duration'],
            'channel': episode['channel_title'],
            'channel_id': channel_slug,
            'channel_url': f'https://nebula.app/{channel_slug}',
            'uploader': episode['channel_title'],
            'uploader_id': channel_slug,
            'uploader_url': f'https://nebula.app/{channel_slug}',
            'series': episode['channel_title'],
            'creator': episode['channel_title'],
        }

    def _perform_login(self, username=None, password=None):
        """
        Obtain the API token, the bearer token and the Zype access token, in that order.

        `username` and `password` are forwarded to the credential login; when omitted (as on
        re-authentication) the credentials are looked up by `_perform_nebula_auth` itself.
        """
        self._nebula_api_token = self._retrieve_nebula_api_token(username, password)
        self._nebula_bearer_token = self._fetch_nebula_bearer_token()
        self._zype_access_token = self._fetch_zype_access_token()


class NebulaIE(NebulaBaseIE):
    """
    Extractor for single Nebula videos, i.e. `/videos/<slug>` URLs on nebula.app and
    watchnebula.com.
    """
    _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',
            'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
            'info_dict': {
                'id': '5c271b40b13fd613090034fd',
                'ext': 'mp4',
                'title': 'That Time Disney Remade Beauty and the Beast',
                'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
                'upload_date': '20180731',
                'timestamp': 1533009600,
                'channel': 'Lindsay Ellis',
                'channel_id': 'lindsayellis',
                'uploader': 'Lindsay Ellis',
                'uploader_id': 'lindsayellis',
            },
            'params': {
                'usenetrc': True,
            },
        },
        {
            'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
            'md5': '6d4edd14ce65720fa63aba5c583fb328',
            'info_dict': {
                'id': '5e7e78171aaf320001fbd6be',
                'ext': 'mp4',
                'title': 'Landing Craft - How The Allies Got Ashore',
                'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
                'upload_date': '20200327',
                'timestamp': 1585348140,
                'channel': 'Real Engineering',
                'channel_id': 'realengineering',
                'uploader': 'Real Engineering',
                'uploader_id': 'realengineering',
            },
            'params': {
                'usenetrc': True,
            },
        },
        {
            'url': 'https://nebula.app/videos/money-episode-1-the-draw',
            'md5': '8c7d272910eea320f6f8e6d3084eecf5',
            'info_dict': {
                'id': '5e779ebdd157bc0001d1c75a',
                'ext': 'mp4',
                'title': 'Episode 1: The Draw',
                'description': r'contains:There’s free money on offer… if the players can all work together.',
                'upload_date': '20200323',
                'timestamp': 1584980400,
                'channel': 'Tom Scott Presents: Money',
                'channel_id': 'tom-scott-presents-money',
                'uploader': 'Tom Scott Presents: Money',
                'uploader_id': 'tom-scott-presents-money',
            },
            'params': {
                'usenetrc': True,
            },
        },
        {
            'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
            'only_matching': True,
        },
    ]

    def _fetch_video_metadata(self, slug):
        """
        Return the metadata object the Nebula content API serves for the video `slug`.
        The request is authorized with the bearer token.
        """
        metadata_url = f'https://content.watchnebula.com/video/{slug}/'
        return self._call_nebula_api(
            metadata_url, video_id=slug, auth_type='bearer', note='Fetching video meta data')

    def _real_extract(self, url):
        """
        Resolve the slug from `url`, fetch its metadata and convert it to an info dict.
        """
        return self._build_video_info(
            self._fetch_video_metadata(self._match_id(url)))


class NebulaCollectionIE(NebulaBaseIE):
    """
    Extractor for Nebula channels/collections: every first path segment on nebula.app or
    watchnebula.com other than `videos/` is treated as a collection id.
    """
    IE_NAME = 'nebula:collection'
    _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)'
    _TESTS = [
        {
            'url': 'https://nebula.app/tom-scott-presents-money',
            'info_dict': {
                'id': 'tom-scott-presents-money',
                'title': 'Tom Scott Presents: Money',
                'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
            },
            'playlist_count': 5,
            'params': {
                'usenetrc': True,
            },
        }, {
            'url': 'https://nebula.app/lindsayellis',
            'info_dict': {
                'id': 'lindsayellis',
                'title': 'Lindsay Ellis',
                'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
            },
            'playlist_mincount': 100,
            'params': {
                'usenetrc': True,
            },
        },
    ]

    def _generate_playlist_entries(self, collection_id, channel):
        """
        Lazily yield an info dict for every episode of the channel.

        `channel` is the already downloaded first page; further pages are fetched through the
        `next` URL of each page, only once the current page has been consumed, until a page
        reports no `next` URL.
        """
        page_num = 2  # the first page was fetched by the caller
        while True:
            listing = channel['episodes']
            for episode in listing['results']:
                yield self._build_video_info(episode)

            next_url = listing['next']
            if not next_url:
                return

            channel = self._call_nebula_api(
                next_url, collection_id, auth_type='bearer',
                note=f'Retrieving channel page {page_num}')
            page_num += 1

    def _real_extract(self, url):
        """
        Download the first channel page and return a playlist whose entries are produced on
        demand by `_generate_playlist_entries`.
        """
        collection_id = self._match_id(url)
        channel = self._call_nebula_api(
            f'https://content.watchnebula.com/video/channels/{collection_id}/',
            collection_id, auth_type='bearer', note='Retrieving channel')
        details = channel['details']
        entries = self._generate_playlist_entries(collection_id, channel)

        return self.playlist_result(
            entries=entries,
            playlist_id=collection_id,
            playlist_title=details['title'],
            playlist_description=details['description'])
Commit	Line	Data
bdc196a4 GS	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
359df0fc	4	import itertools
bdc196a4 GS	5	import json
bdc196a4 GS	6	import time
359df0fc	7	import urllib
bdc196a4	8
bdc196a4 GS	9	from ..utils import (
	10	ExtractorError,
	11	parse_iso8601,
	12	try_get,
bdc196a4	13	)
359df0fc HH	14	from .common import InfoExtractor
	15
	16
	17	class NebulaBaseIE(InfoExtractor):
	18	_NETRC_MACHINE = 'watchnebula'
	19
	20	_nebula_api_token = None
	21	_nebula_bearer_token = None
	22	_zype_access_token = None
	23
	24	def _perform_nebula_auth(self):
	25	username, password = self._get_login_info()
	26	if not (username and password):
	27	self.raise_login_required()
	28
	29	data = json.dumps({'email': username, 'password': password}).encode('utf8')
	30	response = self._download_json(
	31	'https://api.watchnebula.com/api/v1/auth/login/',
	32	data=data, fatal=False, video_id=None,
	33	headers={
	34	'content-type': 'application/json',
	35	# Submitting the 'sessionid' cookie always causes a 403 on auth endpoint
	36	'cookie': ''
	37	},
	38	note='Logging in to Nebula with supplied credentials',
	39	errnote='Authentication failed or rejected')
	40	if not response or not response.get('key'):
	41	self.raise_login_required()
	42
	43	# save nebula token as cookie
	44	self._set_cookie(
	45	'nebula.app', 'nebula-auth',
	46	urllib.parse.quote(
	47	json.dumps({
	48	"apiToken": response["key"],
	49	"isLoggingIn": False,
	50	"isLoggingOut": False,
	51	}, separators=(",", ":"))),
	52	expire_time=int(time.time()) + 86400 * 365,
	53	)
	54
	55	return response['key']
	56
	57	def _retrieve_nebula_api_token(self):
	58	"""
	59	Check cookie jar for valid token. Try to authenticate using credentials if no valid token
	60	can be found in the cookie jar.
	61	"""
	62	nebula_cookies = self._get_cookies('https://nebula.app')
	63	nebula_cookie = nebula_cookies.get('nebula-auth')
	64	if nebula_cookie:
	65	self.to_screen('Authenticating to Nebula with token from cookie jar')
	66	nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value)
	67	nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken')
	68	if nebula_api_token:
	69	return nebula_api_token
	70
	71	return self._perform_nebula_auth()
bdc196a4	72
359df0fc HH	73	def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''):
	74	assert method in ('GET', 'POST',)
	75	assert auth_type in ('api', 'bearer',)
bdc196a4	76
359df0fc HH	77	def inner_call():
	78	authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}'
	79	return self._download_json(
	80	url, video_id, note=note, headers={'Authorization': authorization},
	81	data=b'' if method == 'POST' else None)
	82
	83	try:
	84	return inner_call()
	85	except ExtractorError as exc:
	86	# if 401 or 403, attempt credential re-auth and retry
	87	if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403):
	88	self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}')
	89	self._login()
	90	return inner_call()
	91	else:
	92	raise
	93
	94	def _fetch_nebula_bearer_token(self):
	95	"""
	96	Get a Bearer token for the Nebula API. This will be required to fetch video meta data.
	97	"""
	98	response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/',
	99	method='POST',
	100	note='Authorizing to Nebula')
	101	return response['token']
bdc196a4	102
359df0fc HH	103	def _fetch_zype_access_token(self):
	104	"""
	105	Get a Zype access token, which is required to access video streams -- in our case: to
	106	generate video URLs.
	107	"""
	108	user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token')
	109
	110	access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str)
	111	if not access_token:
	112	if try_get(user_object, lambda x: x['is_subscribed'], bool):
	113	# TODO: Reimplement the same Zype token polling the Nebula frontend implements
	114	# see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532
	115	raise ExtractorError(
	116	'Unable to extract Zype access token from Nebula API authentication endpoint. '
	117	'Open an arbitrary video in a browser with this account to generate a token',
	118	expected=True)
	119	raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint')
	120	return access_token
	121
	122	def _build_video_info(self, episode):
	123	zype_id = episode['zype_id']
	124	zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}'
	125	channel_slug = episode['channel_slug']
	126	return {
	127	'id': episode['zype_id'],
	128	'display_id': episode['slug'],
	129	'_type': 'url_transparent',
	130	'ie_key': 'Zype',
	131	'url': zype_video_url,
	132	'title': episode['title'],
	133	'description': episode['description'],
	134	'timestamp': parse_iso8601(episode['published_at']),
	135	'thumbnails': [{
	136	# 'id': tn.get('name'), # this appears to be null
	137	'url': tn['original'],
	138	'height': key,
	139	} for key, tn in episode['assets']['thumbnail'].items()],
	140	'duration': episode['duration'],
	141	'channel': episode['channel_title'],
	142	'channel_id': channel_slug,
	143	'channel_url': f'https://nebula.app/{channel_slug}',
	144	'uploader': episode['channel_title'],
	145	'uploader_id': channel_slug,
	146	'uploader_url': f'https://nebula.app/{channel_slug}',
	147	'series': episode['channel_title'],
	148	'creator': episode['channel_title'],
	149	}
	150
52efa4b3	151	def _perform_login(self, username=None, password=None):
52efa4b3	152	# FIXME: username should be passed from here to inner functions
359df0fc HH	153	self._nebula_api_token = self._retrieve_nebula_api_token()
	154	self._nebula_bearer_token = self._fetch_nebula_bearer_token()
	155	self._zype_access_token = self._fetch_zype_access_token()
	156
359df0fc HH	157
359df0fc HH	158	class NebulaIE(NebulaBaseIE):
bdc196a4 GS	159	_VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com\|nebula\.app)/videos/(?P<id>[-\w]+)'
	160	_TESTS = [
	161	{
	162	'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast',
	163	'md5': 'fe79c4df8b3aa2fea98a93d027465c7e',
	164	'info_dict': {
	165	'id': '5c271b40b13fd613090034fd',
	166	'ext': 'mp4',
	167	'title': 'That Time Disney Remade Beauty and the Beast',
	168	'description': 'Note: this video was originally posted on YouTube with the sponsor read included. We weren’t able to remove it without reducing video quality, so it’s presented here in its original context.',
	169	'upload_date': '20180731',
	170	'timestamp': 1533009600,
	171	'channel': 'Lindsay Ellis',
359df0fc	172	'channel_id': 'lindsayellis',
bdc196a4	173	'uploader': 'Lindsay Ellis',
359df0fc	174	'uploader_id': 'lindsayellis',
bdc196a4 GS	175	},
	176	'params': {
	177	'usenetrc': True,
	178	},
bdc196a4 GS	179	},
	180	{
	181	'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
	182	'md5': '6d4edd14ce65720fa63aba5c583fb328',
	183	'info_dict': {
	184	'id': '5e7e78171aaf320001fbd6be',
	185	'ext': 'mp4',
	186	'title': 'Landing Craft - How The Allies Got Ashore',
	187	'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
	188	'upload_date': '20200327',
	189	'timestamp': 1585348140,
359df0fc HH	190	'channel': 'Real Engineering',
	191	'channel_id': 'realengineering',
	192	'uploader': 'Real Engineering',
	193	'uploader_id': 'realengineering',
bdc196a4 GS	194	},
	195	'params': {
	196	'usenetrc': True,
	197	},
bdc196a4 GS	198	},
	199	{
	200	'url': 'https://nebula.app/videos/money-episode-1-the-draw',
	201	'md5': '8c7d272910eea320f6f8e6d3084eecf5',
	202	'info_dict': {
	203	'id': '5e779ebdd157bc0001d1c75a',
	204	'ext': 'mp4',
	205	'title': 'Episode 1: The Draw',
	206	'description': r'contains:There’s free money on offer… if the players can all work together.',
	207	'upload_date': '20200323',
	208	'timestamp': 1584980400,
	209	'channel': 'Tom Scott Presents: Money',
359df0fc	210	'channel_id': 'tom-scott-presents-money',
bdc196a4	211	'uploader': 'Tom Scott Presents: Money',
359df0fc	212	'uploader_id': 'tom-scott-presents-money',
bdc196a4 GS	213	},
	214	'params': {
	215	'usenetrc': True,
	216	},
bdc196a4 GS	217	},
	218	{
	219	'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
	220	'only_matching': True,
	221	},
	222	]
bdc196a4	223
359df0fc HH	224	def _fetch_video_metadata(self, slug):
	225	return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/',
	226	video_id=slug,
	227	auth_type='bearer',
	228	note='Fetching video meta data')
bdc196a4	229
359df0fc HH	230	def _real_extract(self, url):
	231	slug = self._match_id(url)
	232	video = self._fetch_video_metadata(slug)
	233	return self._build_video_info(video)
bdc196a4	234
bdc196a4	235
359df0fc HH	236	class NebulaCollectionIE(NebulaBaseIE):
	237	IE_NAME = 'nebula:collection'
	238	_VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com\|nebula\.app)/(?!videos/)(?P<id>[-\w]+)'
	239	_TESTS = [
	240	{
	241	'url': 'https://nebula.app/tom-scott-presents-money',
	242	'info_dict': {
	243	'id': 'tom-scott-presents-money',
	244	'title': 'Tom Scott Presents: Money',
	245	'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
bdc196a4	246	},
359df0fc HH	247	'playlist_count': 5,
	248	'params': {
	249	'usenetrc': True,
	250	},
	251	}, {
	252	'url': 'https://nebula.app/lindsayellis',
	253	'info_dict': {
	254	'id': 'lindsayellis',
	255	'title': 'Lindsay Ellis',
	256	'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
	257	},
	258	'playlist_mincount': 100,
	259	'params': {
	260	'usenetrc': True,
	261	},
	262	},
	263	]
bdc196a4	264
359df0fc HH	265	def _generate_playlist_entries(self, collection_id, channel):
	266	episodes = channel['episodes']['results']
	267	for page_num in itertools.count(2):
	268	for episode in episodes:
	269	yield self._build_video_info(episode)
	270	next_url = channel['episodes']['next']
	271	if not next_url:
	272	break
	273	channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer',
	274	note=f'Retrieving channel page {page_num}')
	275	episodes = channel['episodes']['results']
bdc196a4 GS	276
bdc196a4 GS	277	def _real_extract(self, url):
359df0fc HH	278	collection_id = self._match_id(url)
	279	channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/'
	280	channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel')
	281	channel_details = channel['details']
bdc196a4	282
359df0fc HH	283	return self.playlist_result(
	284	entries=self._generate_playlist_entries(collection_id, channel),
	285	playlist_id=collection_id,
	286	playlist_title=channel_details['title'],
	287	playlist_description=channel_details['description']
	288	)